##############################################################################
# Online.py - an Online Learning Model for Semi-Structured Text Classification
#
# For: DrivenData.org/ERS's BoxPlots for Education Competition
# By: Quoc Nam Le (quocnle at gmail.com or quoc.le at u.northwestern.edu)
# License: MIT (http://machinelearner.net/boxplots-for-education-1st-place/LICENSE.txt)
#
# This approach is based on tinrtgu's Online Learning masterpiece: http://bit.ly/1ItCVcv
# It is customized for specifying flexible interactions between features and
# for decomposing original feature text into "bag of words" tokens.
##############################################################################
from datetime import datetime
from math import log, exp, sqrt
import pickle
import sys
import random
import math
import re
if len(sys.argv) != 3:
print "Usage: pypy Online.py <epochs> <use_example_probability>"
print "epochs is number of passes over the training data"
print "use_example_probability is the probability of using an example in an epoch"
sys.exit(0)
epochs = int(sys.argv[1])
print "Number of Epochs:",epochs
# This is a cheap way to add randomness to the order of training examples
# but use with caution as it does not guarantee all training examples will be seen.
# Use 1 if you want to train in order examples appear in the file
use_example_probability = float(sys.argv[2])
print "Use Example Probability:",use_example_probability
train = 'trainPredictors.csv' # path to training file
label = 'trainLabels.csv' # path to label file of training data
test = 'TestData2.csv' # path to testing file
# Specify which original features to keep and discard in the model
# Intercept = 0
# Object_Description = 1
# Text_2 = 2
# SubFund_Description = 3
# Job_Title_Description = 4
# Text_3 = 5
# Text_4 = 6
# Sub_Object_Description = 7
# Location_Description = 8
# FTE = 9
# Function_Description = 10
# Facility_or_Department = 11
# Position_Extra = 12
# Total = 13
# Program_Description = 14
# Fund_Description = 15
# Text_1 = 16
originals = range(17)
# Found that removing 5 (Text_3) and 7 (Sub_Object_Description) generally helped
originals.remove(5)
originals.remove(7)
# Interaction pairs and triples
pairs = [[1,2,3,4],[6,8],[4,12],[1,4,8,10]]
triples = [[1,4,12]]
print 'pairs',pairs
print 'triples',triples
D = 2 ** 18 # number of weights used for each model; we train 104 models
alpha = .10 # learning rate for SGD optimization
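# For scale (simple arithmetic, not from the source): 2 ** 18 = 262,144
# hash buckets per model, and with DIM = 104 models below, w and n each
# hold roughly 27 million floats.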
# utilities ############################################
# Assigns the numerical value feat to a categorical level 0 to N (a bin index)
# INPUT:
# feat: the numerical predictor
# b: list representing the boundaries for bins
# OUTPUT:
# a categorical level 0 to N
def boundary(feat,b):
f = float(feat)
s = 0
for step in b:
if f < step:
return s
s += 1
return s
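# A quick illustration with made-up bins (not the ones used below):
#   boundary(-5.0, [0, 10, 100]) -> 0  (below the first boundary)
#   boundary(50.0, [0, 10, 100]) -> 2  (between 10 and 100)
#   boundary(500.0, [0, 10, 100]) -> 3 (past the last boundary)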
# Our hashing function
# INPUT:
# s: the string or number
# OUTPUT:
# an integer between 0 and D-1
def hash_it(s):
return abs(hash(s)) % D
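# For example, hash_it('4_special') and hash_it('special') each map to a
# deterministic bucket index in [0, D-1]; distinct strings can collide in
# the same bucket, the usual hashing-trick trade-off.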
# function, generator definitions ############################################
# A. x, y generator
# This is where:
# * All the feature hashes are generated
# * All feature engineering happens
# INPUT:
# path: path to TrainPredictors.csv or TestData2.csv
# label_path: (optional) path to TrainLabels.csv
# YIELDS:
# ID: id of the instance
# x: list of hashes for predictors
# y: (if label_path is present) binary label
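# To make the feature generation concrete (hypothetical row): if the
# Job_Title_Description (index 4) value is "special education teacher",
# x receives the hashes of "4_special education teacher" (the whole value),
# "4_special", "4_education", "4_teacher" (positioned tokens), and
# "special", "education", "teacher" (position-free tokens), plus any
# interaction hashes that feature participates in.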
def data(path, label_path=None):
# Boundaries for numerical binning of FTE (9) and Total (13)
b13 = [-706.968,-8.879,
7.85,41.972,
73.798,109.55,
160.786,219.736,
318.619,461.23,
646.73,938.36,
1317.584,2132.933,
3652.662,6659.524,
18551.459,39754.287,
64813.342,129700000]
b9 = [0.0,0.00431,0.131,0.911,1,50]
for t, line in enumerate(open(path)):
# Intercept term
x = [0]
# Skip headers
if t == 0:
if label_path:
label = open(label_path)
label.readline() # we don't need the headers
continue
# c is an index for the kept original features (15 of them)
# TODO: drop c and use m for hashing, c was kept for reproducibility
# m is the index for all the original features (17 of them)
# feat is the original raw text or value for feature
c = 0
for m, feat in enumerate(line.rstrip().split(',')):
# Drop unwanted original features
if m not in originals:
continue
if m == 0:
ID = int(feat)
else:
# convert floats into categorical levels;
# variables 9 (FTE) and 13 (Total) are the only numerical ones
if m == 13:
if feat == "": feat = 0
feat = boundary(feat,b13)
if m == 9:
if feat == "": feat = -3
feat = boundary(feat,b9)
# Lowercase and trim so hashes match more often
feat = str(feat).strip().lower()
# First we hash the original feature. For example, if the
# feature is "special education" and the original feature index is 4, we
# hash "4_special education"
original_feature = str(c) + '_' + feat
x.append( hash_it(original_feature) )
# Next we break up the original feature value into word parts
# i.e. create bag-of-word features here
parts = re.split(' |/|-', feat)
for i in range(len(parts)):
token = parts[i].strip().lower()
if token == '': continue
# First we hash each token along with its original position index
# For example, for the feature value "special education" we hash
# its tokens as "4_special" and "4_education" in successive steps of this loop
positioned_word = str(c) + '_' + token
x.append( hash_it( positioned_word ) )
# Next we hash each token by itself, ignoring any information about its position
# For example, for "special education" we hash "special" and "education"
# regardless of what index position the original feature appeared in.
# This views all the feature values in an example as making up a single document
x.append( hash_it( token ) )
c = c + 1
# Up to this point we've been breaking original features down into smaller features.
# Now we level up and compose original features with each other into larger interaction features.
row = line.rstrip().split(',')
# Start with pairs: make pairs from the interaction groups defined in the pairs variable.
for interactions in pairs:
for i in xrange(len(interactions)):
for j in xrange(i+1,len(interactions)):
pair = row[interactions[i]]+"_x_"+row[interactions[j]]
x.append( hash_it(pair) )
# Do the same thing for triples
for triple in triples:
trip = row[triple[0]]+"_x_"+row[triple[1]] + '_x_' +row[triple[2]]
x.append( hash_it(trip) )
if label_path:
y = [float(v) for v in label.readline().split(',')[1:]]
yield (ID, x, y) if label_path else (ID, x)
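# Sketch of how the generator is consumed (mirrors the loops below):
#   for ID, x, y in data(train, label):  # training: y holds one value per each of the 104 label columns
#       ...
#   for ID, x in data(test):             # testing: no labels available
#       ...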
# B. Bounded logloss
# INPUT:
# p: our prediction
# y: real answer
# OUTPUT:
# bounded logarithmic loss of p given y
def logloss(p, y):
p = max(min(p, 1. - 10e-15), 10e-15)
return -log(p) if y == 1. else -log(1. - p)
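# For instance, logloss(0.9, 1.) = -log(0.9) ~ 0.105, while a confident miss
# logloss(0.9, 0.) = -log(0.1) ~ 2.303; the clipping above keeps the loss
# finite when p is pushed toward 0 or 1.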
# C. Get probability estimation on x
# INPUT:
# x: features
# w: weights
# OUTPUT:
# the probability p(y = 1 | x; w)
def predict(x, w):
wTx = 0.
for i in x: # do wTx
wTx += w[i] * 1. # w[i] * x[i], but if i is in x then x[i] = 1.
return 1. / (1. + exp(-max(min(wTx, 100.), -100.))) # bounded sigmoid
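# Sanity check on the math: wTx = 0 gives p = 0.5, and clamping wTx to
# [-100, 100] keeps exp() from overflowing on extreme scores.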
# D. Update given model
# INPUT:
# alpha: learning rate
# w: weights
# n: sum of previous absolute gradients for a given feature
# this is used for adaptive learning rate
# x: feature, a list of indices
# p: prediction of our model
# y: answer
# MODIFIES:
# w: weights
# n: sum of past absolute gradients
def update(alpha, w, n, x, p, y, k): # k (the model index) is currently unused
for i in x:
# alpha / sqrt(n) is the adaptive learning rate
# (p - y) * x[i] is the current gradient
# note that in our case, if i in x then x[i] = 1.
n[i] += abs(p - y)
w[i] = w[i] - ((p - y) * 1. ) * alpha / n[i] ** 0.5
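# Numeric sketch (made-up values): the first time feature i is seen with
# |p - y| = 0.5, n[i] becomes 0.5 and the step scale is
# alpha / sqrt(0.5) ~ 0.14; as n[i] accumulates over updates, the
# effective learning rate for that feature decays.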
# training and testing #######################################################
start = datetime.now()
# Number of models.
DIM = 104
K = range(DIM)
w = [[0.] * D for k in range(DIM)]
n = [[0.] * D for k in range(DIM)]
random.seed(1234)
loss = 0.
rec = 0
for i in range(epochs):
for ID, x, y in data(train,label):
# Randomly choose whether or not to train with this example in this epoch
if random.random() > use_example_probability: continue
# record counter
rec += 1
# get predictions and train on all labels
for k in K:
p = predict(x, w[k])
update(alpha, w[k], n[k], x, p, y[k], k)
loss += logloss(p, y[k]) # for progressive validation
# print out progress, so that we know everything is working
if rec % 50000 == 0:
print('%s\tencountered: %d\tcurrent logloss: %f' % (
datetime.now(), rec, (loss/float(DIM))/rec))
h = ',Function__Aides Compensation,Function__Career & Academic Counseling,Function__Communications,Function__Curriculum Development,Function__Data Processing & Information Services,Function__Development & Fundraising,Function__Enrichment,Function__Extended Time & Tutoring,Function__Facilities & Maintenance,Function__Facilities Planning,"Function__Finance, Budget, Purchasing & Distribution",Function__Food Services,Function__Governance,Function__Human Resources,Function__Instructional Materials & Supplies,Function__Insurance,Function__Legal,Function__Library & Media,Function__NO_LABEL,Function__Other Compensation,Function__Other Non-Compensation,Function__Parent & Community Relations,Function__Physical Health & Services,Function__Professional Development,Function__Recruitment,Function__Research & Accountability,Function__School Administration,Function__School Supervision,Function__Security & Safety,Function__Social & Emotional,Function__Special Population Program Management & Support,Function__Student Assignment,Function__Student Transportation,Function__Substitute Compensation,Function__Teacher Compensation,Function__Untracked Budget Set-Aside,Function__Utilities,Object_Type__Base Salary/Compensation,Object_Type__Benefits,Object_Type__Contracted Services,Object_Type__Equipment & Equipment Lease,Object_Type__NO_LABEL,Object_Type__Other Compensation/Stipend,Object_Type__Other Non-Compensation,Object_Type__Rent/Utilities,Object_Type__Substitute Compensation,Object_Type__Supplies/Materials,Object_Type__Travel & Conferences,Operating_Status__Non-Operating,"Operating_Status__Operating, Not PreK-12",Operating_Status__PreK-12 Operating,Position_Type__(Exec) Director,Position_Type__Area Officers,Position_Type__Club Advisor/Coach,Position_Type__Coordinator/Manager,Position_Type__Custodian,Position_Type__Guidance Counselor,Position_Type__Instructional Coach,Position_Type__Librarian,Position_Type__NO_LABEL,Position_Type__Non-Position,Position_Type__Nurse,Position_Type__Nurse Aide,Position_Type__Occupational Therapist,Position_Type__Other,Position_Type__Physical Therapist,Position_Type__Principal,Position_Type__Psychologist,Position_Type__School Monitor/Security,Position_Type__Sec/Clerk/Other Admin,Position_Type__Social Worker,Position_Type__Speech Therapist,Position_Type__Substitute,Position_Type__TA,Position_Type__Teacher,Position_Type__Vice Principal,Pre_K__NO_LABEL,Pre_K__Non PreK,Pre_K__PreK,Reporting__NO_LABEL,Reporting__Non-School,Reporting__School,Sharing__Leadership & Management,Sharing__NO_LABEL,Sharing__School Reported,Sharing__School on Central Budgets,Sharing__Shared Services,Student_Type__Alternative,Student_Type__At Risk,Student_Type__ELL,Student_Type__Gifted,Student_Type__NO_LABEL,Student_Type__Poverty,Student_Type__PreK,Student_Type__Special Education,Student_Type__Unspecified,Use__Business Services,Use__ISPD,Use__Instruction,Use__Leadership,Use__NO_LABEL,Use__O&M,Use__Pupil Services & Enrichment,Use__Untracked Budget Set-Aside'
# write out weights
print('writing weights to file')
with open('weights.pkl', 'wb') as f: # binary mode so the pickle is portable
pickle.dump(w, f)
output = './submission1234.csv'
with open(output, 'w') as outfile:
outfile.write(h + '\n')
for ID, x in data(test):
outfile.write(str(ID))
for k in K:
p = predict(x, w[k])
outfile.write(',%s' % str(p))
outfile.write('\n')
print('Done, elapsed time: %s' % str(datetime.now() - start))