train.py
import sqlite3, itertools
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib
from sklearn import preprocessing
import build_features as feat
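
# Overview: this script trains a binary classifier that separates true TIL
# submissions (positives) from known false positives and from decoy Wikipedia
# articles (negatives). Each document is mapped to a single vector by taking
# the entropy-weighted average of its word2vec word vectors, the vectors are
# standardized with a StandardScaler, and an ExtraTreesClassifier is fit on
# the result. The fitted scaler and classifier are saved with joblib;
# build_features.py is presumably run first to populate the databases and the
# word2vec model that this script loads.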
#FLAG_BUILD_DECOY_LIST = True
FLAG_BUILD_DECOY_LIST = False

# When training, how big of a decoy set to use
_DECOY_PROPORTION = 5

conn_decoy = feat.conn_decoy
conn_train = feat.conn_train

f_clf = "db/clf.joblib"
f_norm_scale = "db/scale.joblib"

total_samples = feat.count_WIKI_corpus()

def query_skip_decoys():
    cmd_select = '''SELECT tokens,weights FROM decoy AS A
    JOIN skip_decoy_query AS B ON A.decoy_idx=B.decoy_idx
    '''
    cursor = conn_decoy.execute(cmd_select)

    for text, weight_str in cursor:
        # Load the entropy weight
        w = np.fromstring(weight_str[1:-1], sep=',')
        yield text, w

def build_skip_query(skip_n):
    cmd_template = '''
    DROP TABLE IF EXISTS skip_decoy_query;
    CREATE TABLE skip_decoy_query (decoy_idx INTEGER PRIMARY KEY);'''
    conn_decoy.executescript(cmd_template)

    cmd_find = '''
    SELECT decoy_idx, wikipedia_title
    FROM decoy WHERE decoy_idx%{}==0'''.format(skip_n)

    cmd_insert = '''INSERT INTO skip_decoy_query (decoy_idx)
    VALUES (?)'''

    cursor = conn_decoy.execute(cmd_find)
    for idx, title in cursor:
        print idx, title
        conn_decoy.execute(cmd_insert, (idx,))

    conn_decoy.commit()
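
# Note: skip_decoy_query keeps every skip_n-th row of the decoy table, so with
# skip_n = wiki_n // decoy_n it selects roughly decoy_n decoys, i.e. about
# _DECOY_PROPORTION negatives per TIL positive.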

def getWordVecs(text, weight, features, dimension):
    # The vector is the entropy-weighted average of each word
    vec = np.zeros(dimension).reshape((1, dimension))
    tokens = text.split()
    count = 0.0

    for single_entropy, word in zip(weight, tokens):
        if word in features:
            vec += single_entropy * features[word]
            count += single_entropy

    vec /= count
    if np.isnan(vec).any():
        vec[:] = 1.0

    return vec
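
# Worked example: for text "cats sleep" with entropy weights [0.9, 0.1],
# getWordVecs returns
#     (0.9*features["cats"] + 0.1*features["sleep"]) / (0.9 + 0.1)
# as a (1, dimension) row vector. Words missing from the word2vec vocabulary
# contribute nothing, and if no word is found at all the vector falls back
# to all ones.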

def train_model():
    TIL_n = feat.count_TIL_corpus()
    decoy_n = TIL_n * _DECOY_PROPORTION
    FP_n = feat.count_TIL_false_pos()
    wiki_n = feat.count_WIKI_corpus()

    skip_wiki_n = wiki_n // decoy_n

    # Keep the number of false positives in about the same Order-of-Mag
    skip_FP = FP_n // TIL_n
    print "Skipping every {} value in FP".format(skip_FP)

    if FLAG_BUILD_DECOY_LIST:
        build_skip_query(skip_wiki_n)

    print "Loading features"
    features = Word2Vec.load(feat.f_features)
    dimension = 100  # default dimension

    ITR_decoy = query_skip_decoys()

    print "Building training set"
    ITR_train = list(feat.TIL_full_corpus_iter())

    print "Building the false positive set"
    ITR_FP = list(feat.TIL_false_pos_iter(skip_FP))

    print "Building corpus iter"
    ITR = feat.chainer(ITR_train, ITR_FP, ITR_decoy)
    ITR = list(ITR)

    Y = np.zeros(len(ITR))
    Y[:TIL_n] = 1.0

    TTS = train_test_split
    x_train, x_test, y_train, y_test = TTS(ITR, Y, test_size=0.2)

    print "Proportion of answers {}/{}".format(y_train.sum(),
                                               y_test.sum())
print "Calculating the wordVecs for train"
vec_train = np.concatenate([getWordVecs(text,weight,
features,dimension)
for text,weight in x_train])
print "Building the scalar"
scaler = preprocessing.StandardScaler().fit(vec_train)
print "Saving the scaler"
joblib.dump(scaler, f_norm_scale)
print "Scaling train vectors"
vec_train = scaler.transform(vec_train)
print "Calculating the wordVecs for test"
vec_test = np.concatenate([getWordVecs(text,weight,features,dimension)
for text,weight in x_test])
print "Scaling test vectors"
vec_test = scaler.transform(vec_test)
print "Train size/TP in sample", vec_train.shape, (y_train==1).sum()
print "Test size/TP in sample", vec_test.shape, (y_test==1).sum()
print "Training classifer"
    #from sklearn.linear_model import SGDClassifier as Classifier
    #from sklearn.linear_model import LogisticRegression as Classifier
    #from sklearn.linear_model import BayesianRidge as Classifier
    #from sklearn.naive_bayes import BernoulliNB as Classifier
    #from sklearn.naive_bayes import GaussianNB as Classifier
    #from sklearn.ensemble import RandomForestClassifier as Classifier
    from sklearn.ensemble import ExtraTreesClassifier as Classifier
    # This seems to be the best... but high FP rate
    #from sklearn.naive_bayes import BernoulliNB as Classifier

    #clf = Classifier(loss='log', penalty='l1', verbose=2)  # SGD
    #clf = Classifier(C=2500, verbose=2)  # LogisticRegression
    #clf = Classifier()  # Naive Bayes
    clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees

    clf.fit(vec_train, y_train)
    print 'Test Accuracy: %.3f' % clf.score(vec_test, y_test)

    idx_TP = np.array(y_test) > 0
    vec_TP = np.array(vec_test)[idx_TP]
    y_TP = np.array(y_test)[idx_TP]
    print 'Test Accuracy on TP: %.3f' % clf.score(vec_TP, y_TP)

    vec_FP = np.array(vec_test)[~idx_TP]
    y_FP = np.array(y_test)[~idx_TP]
    print 'Test Accuracy on FP: %.3f' % clf.score(vec_FP, y_FP)

    print "Saving the classifier"
    joblib.dump(clf, f_clf)

    # Create ROC curve
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    pred_probas = clf.predict_proba(vec_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.show()
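
# Illustrative sketch (not called anywhere in this script): how the saved
# scaler and classifier could be reloaded to score a single document. The
# helper name and its calling convention (a token string plus matching
# entropy weights, as in getWordVecs) are assumptions, not an established
# project API.
def predict_TIL_proba(text, weight, features, dimension=100):
    scaler = joblib.load(f_norm_scale)
    clf = joblib.load(f_clf)
    vec = getWordVecs(text, weight, features, dimension)
    vec = scaler.transform(vec)
    # Column 1 is the probability of the positive (TIL) class
    return clf.predict_proba(vec)[0, 1]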

if __name__ == "__main__":
    train_model()