# score.py
import sqlite3, itertools
import numpy as np
from sklearn.externals import joblib
from gensim.models.word2vec import Word2Vec
import build_features as feat
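# NOTE: this script targets Python 2 and older library releases: joblib is
# imported from sklearn.externals (removed in modern scikit-learn) and word
# lookups use the pre-4.0 gensim Word2Vec API.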
conn_decoy = sqlite3.connect("db/decoy.db")
f_features = "db/features.word2vec"
f_clf = "db/clf.joblib"
f_norm_scale = "db/scale.joblib"
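# The feature scaler and classifier are assumed to have been fit by a
# separate training step and saved with joblib.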
scaler = joblib.load(f_norm_scale)
clf = joblib.load(f_clf)
print "Loading model"
features = Word2Vec.load(f_features)
dimension = 100
# Create the score table
cmd_template = '''
DROP TABLE IF EXISTS score;
CREATE TABLE IF NOT EXISTS score (
    idx INTEGER PRIMARY KEY NOT NULL,
    score FLOAT DEFAULT NULL
);
INSERT INTO score (idx)
    SELECT decoy_idx FROM decoy;
'''
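# Each decoy row gets a matching score row; a NULL score marks a row that
# still needs to be classified, which lets the script resume where it left
# off. The bootstrap below is left commented out, presumably because the
# score table has already been created in an earlier run.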
#print "Templating"
#conn_decoy.executescript(cmd_template)
total_samples = feat.count_WIKI_corpus()
cmd_check_remaining = '''SELECT COUNT(*) FROM score WHERE score IS NULL'''
score_remaining = conn_decoy.execute(cmd_check_remaining).fetchone()[0]
print "There are {} left to score".format(score_remaining)
def getWordVecs(text, weight, model, dimension):
    # The vector is the entropy-weighted average of each word
    vec = np.zeros(dimension).reshape((1, dimension))
    tokens = text.split()
    count = 0.0
    for single_entropy, word in zip(weight, tokens):
        if word in model:
            vec += single_entropy * model[word]
            count += single_entropy
    vec /= count
    if np.isnan(vec).any():
        # No in-vocabulary tokens (count stayed 0); fall back to a constant vector
        vec[:] = 1.0
    return vec
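# For example, for the text "cat dog" with weights [0.3, 0.7] the result is
# (0.3*model['cat'] + 0.7*model['dog']) / (0.3 + 0.7), assuming both words
# are in the vocabulary.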
def decoy_vec_iter():
    # Select the values we haven't seen before
    cmd_select = '''
    SELECT A.decoy_idx, A.tokens, A.weights FROM decoy AS A
    JOIN score AS B ON A.decoy_idx=B.idx
    WHERE B.score IS NULL
    '''
    cursor = conn_decoy.execute(cmd_select)
    for idx, text, w_str in cursor:
        # The weight list is stored as a bracketed, comma-separated string;
        # strip the brackets and parse the numbers
        w = np.fromstring(w_str[1:-1], sep=',')
        vec = getWordVecs(text, w, features, dimension)
        yield idx, scaler.transform(vec)
def grouper(ITR, size):
    # Collect items from the iterator into lists of `size`; the final,
    # possibly shorter, chunk is yielded as well (but never an empty one)
    data = []
    for item in ITR:
        data.append(item)
        if len(data) == size:
            yield data
            data = []
    if data:
        yield data
def predict_block(item):
    # Score one chunk of (idx, vector) pairs at a time
    IDX, VEC = zip(*item)
    VEC = np.vstack(VEC)
    # Convert to native Python ints so the sqlite3 bindings accept them
    score = clf.predict(VEC).astype(int).tolist()
    return score, IDX
cmd_update = '''UPDATE score
SET score=?
WHERE idx=?
'''
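# Parameter order (score, idx) matches the tuples produced by zip(score, IDX) below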
DATA = grouper(decoy_vec_iter(), 10000)
ITR = itertools.imap(predict_block, DATA)
#import multiprocessing
#P = multiprocessing.Pool(4)
#ITR = P.imap(predict_block, DATA)
for k, (score, IDX) in enumerate(ITR):
    print k, sum(score)
    conn_decoy.executemany(cmd_update, zip(score, IDX))
    # Commit every ten blocks so progress is saved incrementally
    if k and k % 10 == 0:
        print "Committing"
        conn_decoy.commit()
conn_decoy.commit()
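# The block below is an earlier, unbatched version of the scoring loop,
# kept for reference but disabled by the surrounding string literal.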
'''
def result_ITR():
    print "Starting prediction"
    for k, (idx, vec) in enumerate(ITR):
        score = clf.predict(vec)[0]
        if score:
            print k, idx, score
        # Round to int
        score = int(score)
        yield score, idx

conn_decoy.executemany(cmd_update, result_ITR())
conn_decoy.commit()
'''