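"""Year-by-year evaluation of the clickbait classifier.

For each year from 2010 to 2020 the script trains a TF-IDF +
logistic-regression model on the full normalised dataset with that year's
rows removed, tests on the held-out year, prints a confusion matrix and
accuracy, and finally plots per-year accuracy as a line chart and a bar
chart.
"""
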
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt


def identity_tokenizer(text):
    # The 'text' column already holds token lists, so the vectoriser's
    # tokenizer simply passes them through unchanged.
    return text


def print_results(preds, y_vals, title):
    accuracy = accuracy_score(y_vals, preds)
    print("=== " + title + " ===")
    print(f"Confusion Matrix:\n{confusion_matrix(y_vals, preds)}"
          f"\nAccuracy: {accuracy}")
    print("=== END OF " + title + " ===\n\n")
    return accuracy


def yearly_review():
    print('In Yearly')
    results = []
    years = ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
    for year in years:
        df1 = pd.read_csv(normalised_csv, index_col=0)
        df2 = pd.read_csv('Years/' + str(year) + '.csv')
        # Remove the current year from the training data; it is used only as the test set.
        # (Appending df2 twice and dropping all duplicates removes df2's rows from df1.)
        df1 = pd.concat([df1, df2, df2]).drop_duplicates(keep=False)
        # The 'text' column is stored as string representations of token lists.
        df1["text"] = df1["text"].apply(eval)
        df2["text"] = df2["text"].apply(eval)
        big_features = df1.drop(columns='class')
        big_y = df1['class'].to_numpy()
        print(big_y.shape)
        year_features = df2.drop(columns='class')
        year_y = df2['class'].to_numpy()
        print(year_y.shape)
        # TF-IDF over the pre-tokenised text (unigrams and bigrams), fitted on the
        # training years only, then stacked with the remaining numeric features.
        tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, ngram_range=(1, 2), lowercase=False)
        tfidf_text = tfidf.fit_transform(big_features['text'])
        big_ef = big_features.drop(columns='text')
        big_X = sparse.hstack([big_ef, tfidf_text]).tocsr()
        tfidf_text = tfidf.transform(year_features['text'])
        little_ef = year_features.drop(columns='text')
        year_X = sparse.hstack([little_ef, tfidf_text]).tocsr()
        # Select the top features by chi-squared on the training data only;
        # the held-out year is transformed with the same fitted selector.
        ch2 = SelectKBest(chi2, k=10000)
        big_X = ch2.fit_transform(big_X, big_y)
        year_X = ch2.transform(year_X)
        # Train on every other year and evaluate on the held-out year.
        log_clf = LogisticRegression(C=10, class_weight='balanced', solver='liblinear')
        log_clf.fit(big_X, big_y)
        preds = log_clf.predict(year_X)
        year_accuracy = print_results(preds, year_y, str(year))
        results.append(year_accuracy)

    plt.plot(years, results, 'g^', linestyle='--')
    plt.xlabel('year')
    plt.ylabel('accuracy')
    plt.title('Clickbait Yearly Analysis')
    plt.show()

    # Bar chart of the same per-year results.
    plt.rcParams['figure.figsize'] = (20, 10)  # set the figure size
    fig, ax1 = plt.subplots()
    for year, accuracy in zip(years, results):
        ax1.bar(year, accuracy, alpha=0.4)
    ax1.plot(results)
    ax1.set_ylim([.9, .96])
    ax1.grid(False, which='minor')
    ax1.set_title('Clickbait Yearly Analysis')
    ax1.set_ylabel('Accuracy')
    ax1.set_xlabel('Year')
    ax1.set_xticklabels(years)
    plt.show()
#################################
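# Expected inputs (based on the paths used above; adjust if your layout differs):
#   data/normalised.csv          - full dataset with tokenised 'text' and 'class' columns
#   Years/2010.csv ... 2020.csv  - per-year subsets in the same format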
normalised_csv = "data/normalised.csv"
yearly_review()