#AUTHOR: ARYA POURTABATABAIE
from re import findall
import random
from math import log
import numpy as np
#enums
#This class represents categorical attributes. It is assumed that each takes one of 4 possible values
class CatAttr:
#Common names for values
low= 1
medl= 2
medh= 3
high= 4
#List of legal values, for use in loops
vals=[1,2,3,4]
#Name of legal values in order, for use in printing
valNames=["low", "medl", "medh", "high"]
def __init__(self, val):
self.val= int(val)
#For use in printing
def __str__(self):
return CatAttr.valNames[self.val-1]
#This is the target categorical attribute. It holds one of three legal values
class Contraception:
No= 1
Long= 2
Short= 3
vals=[1,2,3]
valNames=["No", "Long", "Short"]
def __init__(self, val):
self.val= int(val)
def __str__(self):
return Contraception.valNames[self.val-1]
#A list of two possible booleans for convenience
BoolVals=[False, True]
#the record class
class Record:
#Number of attributes of each type
binaryCount=3
numericalCount=2
categoricalCount=4
#Accessing the values of the attributes by their name
def wAge(self):
return self.numericals[0]
def nChild(self):
return self.numericals[1]
def wEd(self):
return self.categoricals[0]
def hEd(self):
return self.categoricals[1]
def hOcc(self):
return self.categoricals[2]
def livStd(self):
return self.categoricals[3]
def wMuslim(self):
return self.binaries[0]
def wWorking(self):
return self.binaries[1]
def goodMedia(self):
return self.binaries[2]
def contraception(self):
return self.tgt
#The constructor
def __init__(self,wAge,wEd,hEd,nChild,wMuslim, wWorking, hOcc, livStd, goodMedia, contraception):
self.numericals= [wAge, nChild]
self.categoricals= [wEd, hEd, hOcc, livStd]
self.binaries= [wMuslim, wWorking, goodMedia]
self.tgt= contraception
#For use in printing
def __str__(self):
return str(self.wAge())+' '+ str(self.wEd())+' '+str(self.hEd())+' '+str(self.nChild())+' '+str(self.wMuslim())+' '+str(self.wWorking())+' '+str(self.hOcc())+' '+str(self.livStd())+' '+str(self.goodMedia())+' '+str(self.contraception())
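#Example usage (values are illustrative, not taken from the dataset):
#    r = Record(24, CatAttr(2), CatAttr(3), 3, True, False, CatAttr(2), CatAttr(3), True, Contraception(1))
#    print r   # -> 24 medl medh 3 True False medl medh True No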
#MAIN ROUTINE
#READ DATA FROM FILE
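#Each line of cmc.data is expected to hold ten comma-separated integers, e.g.
#"24,2,3,3,1,1,2,3,0,1": wife's age, wife's education, husband's education,
#number of children, religion, wife working, husband's occupation,
#standard of living, media exposure, contraceptive method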
f= open("cmc.data", "rt")
fs= f.readline()
datal= list()
while fs != "": #readline() returns the empty string at end of file
fl= findall(r'\d+', fs) #Extract the numbers (Non-empty strings of digits)
#Create a new record based on the input
datal.append(Record(int(fl[0]), CatAttr(int(fl[1])), CatAttr(fl[2]), int(fl[3]), BoolVals[int(fl[4])], BoolVals[int(fl[5])], CatAttr(fl[6]), CatAttr(fl[7]), BoolVals[int(fl[8])], Contraception(fl[9]) ) )
fs=f.readline()
f.close()
#COMPUTING ENTROPY OF A DATA SET
def Entropy(data):
if (len(data)==0): #An empty set is deterministic!
return .0
tgtVals= [r.tgt.val for r in data] #Target values
entropy=0.0
count=0
dataCountInv= 1.0/len(data) #Because multiplication is faster than division!
for tgtVal in Contraception.vals: #for any POSSIBLE value
count = tgtVals.count(tgtVal) #see how many of those are there
if (count>=1):
frac= count * dataCountInv
entropy= entropy + log(frac,2)*frac
return -entropy
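#Worked example (illustrative): for target values [1,1,2,3] the class fractions
#are [0.5, 0.25, 0.25], so Entropy = -(0.5*log2(0.5) + 2*0.25*log2(0.25)) = 1.5 bits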
#Decision Nodes
class Node:
#Indexes for how types of attributes are represented
categorical=0
binary=1
numerical=2
AttrTypeName=["categorical", "binary", "numerical"]
#Number of the nodes generated
nodeNum=0
def __init__(self, level, data):
self.level=level #Depth of the node
        self.data=data #Its data set
self.children= [] #Children
        self.nodeNum= Node.nodeNum #Index of the node -- for debugging purposes
Node.nodeNum= Node.nodeNum+1
self.SetIsLeaf() #To see if it is going to be a leaf node
def SetIsLeaf(self):
        self.isLeaf = (self.level == 20) or (len(self.data)<=3) or (Entropy(self.data)<.1 )
        #Too deep a tree, too few data points, and low variation among data points are all reasonable termination criteria
#Classify a data point based on the tree generated
def Classify(self,r):
if self.isLeaf:
return self.finalLabel
elif self.dividingAttrT== Node.binary:
if r.binaries[self.dividingAttrI]:
i=1 # 1 = True
else:
i=0 # 0 = False
elif self.dividingAttrT== Node.numerical:
if r.numericals[self.dividingAttrI]< self.dividingThresh:
i=0 # 0 = lower than threshold
else:
i=1 # 1 = higher than threshold
elif self.dividingAttrT== Node.categorical:
i=r.categoricals[self.dividingAttrI].val - CatAttr.low
return self.children[i].Classify(r) #Found the right child, now classify recursively
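    #Example (illustrative): if this node splits on a binary attribute, a record
    #with value True for it is routed to children[1] and one with False to children[0]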
#Come up with a definite label decision if it is a leaf node
def SetFinalLabel(self): #Leaf node, take the label that has majority
if (len(self.data)==0):
self.finalLabel= random.randint(1,3) #No other criteria!
else:
tgts= [ r.tgt.val for r in self.data] #Find the mode, that appears most
counts= np.array([ tgts.count(v) for v in Contraception.vals])
self.finalLabel= np.argmax(counts) + Contraception.No
#Generate the subtree of the node
def CreateTree(self):
if self.isLeaf:
self.SetFinalLabel()
else: #if it is not a leaf node
self.SetBestSplitAttr() #Figure out how the splitting can be done best
if (self.dividingAttrT== Node.numerical): #Divide the data according to the regression threshold
dataDivisions= self.DivideNumerical(self.dividingAttrI, self.dividingThresh)
else: #Divide it according to the values of the categorical/binary attribute
dataDivisions= self.DivideNonNumerical(self.dividingAttrT, self.dividingAttrI)
#For every data division, a child node should be managing it
self.children= [ Node(self.level+1, d) for d in dataDivisions]
#Recursively create the tree
for ch in self.children:
ch.CreateTree()
#self.LogCreation()
def LogData(self):
for r in self.data:
print r
def LogCreation(self):
print "Node",self.nodeNum
if self.isLeaf:
print "Node leaf with data entropy=",Entropy(self.data),", no more branching"
print "Always predict", self.finalLabel
else:
print "dividing by", Node.AttrTypeName[self.dividingAttrT], "no", self.dividingAttrI
if self.dividingAttrT== Node.numerical:
print "threshold=", self.dividingThresh
print "\tChildren",
for ch in self.children:
print "Node", ch.nodeNum,
print ""
#Divide the training dataset for the children of the node based on the division criterion chosen
def DivideNonNumerical(self, dividingAttrT, dividingAttrI): #Parameters: Attribute Type (Binary, Categorical), the Index of the attribute among its own type
if (dividingAttrT== Node.binary):
vals= np.array([r.binaries[dividingAttrI] for r in self.data]) #Array of the corresponding values of the records
possibleVals= [False,True]
elif (dividingAttrT== Node.categorical):
vals= np.array([r.categoricals[dividingAttrI].val for r in self.data])
possibleVals= CatAttr.vals
        valI=[ np.where(vals==val) for val in possibleVals] #for each possible value, find the indexes of where in the original dataset they reside
return [ self.data[i[0] ] for i in valI] #Now use those indexes to partition the dataset
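    #Example (illustrative): for a binary attribute with values [True, False, True],
    #np.where yields index arrays [1] (False) and [0, 2] (True), so the False-branch
    #child receives record 1 and the True-branch child receives records 0 and 2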
#Dividing the training dataset... If the dividing attribute is numerical
def DivideNumerical(self, dividingAttrI, thresh): #Arguments: Attribute index among the numerical attributes, the threshold resulted from regression
vals= np.array([r.numericals[dividingAttrI] for r in self.data])
valI=[ np.where( (vals>=thresh) == truth) for truth in [False, True] ] #indexes of those lower than the threshold and those higher
return [self.data[i[0] ] for i in valI]
#Compute the entropy IF WE WERE TO divide the dataset based on this particular attribute
def SplitEntropyNonNumerical(self, dividingAttrT, dividingAttrI):
return self.SplitEntropy(self.DivideNonNumerical(dividingAttrT, dividingAttrI))
#The entropy of a single branch times its weight, should be summed along all branches
def SplitEntropy(self,dats): #Dats is a list of lists
sizeInv= 1.0/ len(self.data)
return sum([ sizeInv * Entropy(dat) * len(dat) for dat in dats])
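    #Worked example (illustrative): splitting 8 records into branches of sizes 6 and
    #2 with entropies 1.0 and 0.0 gives SplitEntropy = (6/8)*1.0 + (2/8)*0.0 = 0.75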
def SetBestSplitAttr(self):
#CATEGORICAL SPLIT... JUST REMEMBER Node.categorical=0
split= np.array([ self.SplitEntropy(self.DivideNonNumerical(Node.categorical, i)) for i in xrange(Record.categoricalCount)])
        bestI= np.array([np.argmin(split), 0, 0]) #The best categorical split; the binary and numerical slots are placeholders
        best= np.array( [split[bestI[0] ], 0, 22220]) #Likewise, slots 1 and 2 are overwritten below
        #BINARY SPLIT
split= np.array([ self.SplitEntropy(self.DivideNonNumerical(Node.binary, i)) for i in xrange(Record.binaryCount)])
bestI[Node.binary]= np.argmin(split) #The best binary split
best[Node.binary]= split[bestI[Node.binary]]
#NUMERICAL SPLIT
        regressionThreshes= [ [ self.RegressionThresh(i, t) for t in Contraception.vals ] for i in xrange(Record.numericalCount)] #The optimal threshold for a given numerical attribute and a given target value it tries to isolate
split= np.array([ np.array([ self.SplitEntropy(self.DivideNumerical(i,regressionThreshes[i][t- Contraception.No])) for t in Contraception.vals ]) for i in xrange(Record.numericalCount)])
BestTgtIforCatI= np.array( [ np.argmin(split[i]) for i in xrange(Record.numericalCount)]) #For each attribute, the best target's index
BestforCatI= np.array( [split[i][BestTgtIforCatI[i]] for i in xrange(Record.numericalCount)]) #For each attribute, the best entropy
bestI[Node.numerical] = np.argmin(BestforCatI)
best[Node.numerical]= BestforCatI[bestI[Node.numerical]]
self.dividingThresh= regressionThreshes[bestI[Node.numerical]][BestTgtIforCatI[bestI[Node.numerical]]]
self.dividingTgt= BestTgtIforCatI[bestI[Node.numerical]]
bestofbest= np.argmin(best)
self.dividingAttrI= bestI[bestofbest]
self.dividingAttrT= bestofbest
#Threshold of regression with respect to a particular attribute and a particular target value to isolate
def RegressionThresh(self, dividingAttrI, tgtToIsolate):
vals= [ (r.numericals[dividingAttrI], 1 if r.tgt.val==tgtToIsolate else -1) for r in self.data] # (x,y) pairs
xs= [val[0] for val in vals] #x values
ys= [val[1] for val in vals] #y values
xSum= sum(xs)
ySum= sum(ys)
xySum= sum([val[0]*val[1] for val in vals])
        denom= xySum - xSum*ySum
        if denom==.0:
            wInv=999999 #A very large number, just to avoid division by zero
        else:
            wInv= float(sum([x*x for x in xs]) - xSum*xSum) / denom
nInv= 1.0/len(self.data)
v=(xSum- wInv*ySum)*nInv
if (min(xs) <= v and max(xs) >= v): #if it's in range return the threshold computed
return v
else:
xs.sort()
return xs[len(xs)/2] #the median
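    #The idea (as read from the code above): fit a least-squares line through the
    #(x, +/-1) pairs and take the x at which the line crosses y=0 as the candidate
    #threshold; if that point falls outside the observed x range, fall back to the median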
#STATS
def Accuracy(confMat, dataSizeInv):
return dataSizeInv * sum([confMat[i][i] for i in xrange(len(Contraception.vals))])
def Recall(confMat):
return [float( confMat[i][i])/ sum( [confMat[i][j] for j in xrange(len(Contraception.vals))] ) for i in xrange(len(Contraception.vals))]
def Precision(confMat):
return [float( confMat[i][i])/ sum( [confMat[j][i] for j in xrange(len(Contraception.vals))] ) for i in xrange(len(Contraception.vals))]
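#Formulas used above (rows of confMat = actual class, columns = predicted class):
#Accuracy = (sum of the diagonal) / (total count)
#Recall[i] = confMat[i][i] / (sum of row i) -- fraction of actual class i recovered
#Precision[i] = confMat[i][i] / (sum of column i) -- fraction of class-i predictions that are correct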
def PrintMetricList(metric):
for m,name in zip(metric,Contraception.valNames):
print "\tfor", name, ":", m
def mean(l):
return sum(l)/len(l)
def variance(l):
    return sum([x*x for x in l])/len(l) - mean(l)**2 #E[x^2] - E[x]^2
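#Worked example (illustrative): variance([1.0, 3.0]) = (1.0+9.0)/2 - 2.0**2 = 1.0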
def MeanOfEach(lB):
    return [ mean([lB[run][attr] for run in xrange(len(lB))] ) for attr in xrange(len(lB[0])) ]
def VarianceOfEach(lB):
    return [ variance([lB[run][attr] for run in xrange(len(lB))] ) for attr in xrange(len(lB[0])) ]
#CREATE test and training data sets
k= 10
random.seed()
testSizeInv= 1.0/(len(datal)/k)
dataSizeInv= 1.0/len(datal)
#Statistics of the metrics
taccuracy=[]
tprecision=[]
trecall=[]
tf1=[]
daccuracy=[]
dprecision=[]
drecall=[]
df1=[]
for run in xrange(k): #Run k times
random.shuffle(datal)
test= np.array(datal[:len(datal)/k])
train= np.array(datal[len(datal)/k :])
    data = np.array(datal) #Snapshot of the full (shuffled) dataset, used below to evaluate over all data
root= Node(0,train)
root.CreateTree()
confusionMat= [ [0 for v in Contraception.vals] for w in Contraception.vals] # i= actual, j= predicted over test data only
for r in test:
prediction=root.Classify(r)
confusionMat[r.tgt.val-1][prediction-1]+=1
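    #Example (illustrative): if a test record's actual class is Long (2) and the
    #tree predicts Short (3), cell confusionMat[1][2] is incremented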
confusionWhole= [ [0 for v in Contraception.vals] for w in Contraception.vals] # i= actual, j= predicted over all data
for r in data:
prediction=root.Classify(r)
confusionWhole[r.tgt.val-1][prediction-1]+=1
accuracy=Accuracy(confusionMat, testSizeInv)
precision=Precision(confusionMat)
recall=Recall(confusionMat)
    f1= [ (2*p*r)/(p+r) if (p+r)>0 else .0 for p,r in zip(precision, recall)] #F1 is the harmonic mean of precision and recall; guard against p+r==0
taccuracy.append(accuracy)
tprecision.append(precision)
trecall.append(recall)
tf1.append(f1)
print "Run",run,"metrics:"
print "TEST DATA"
print "Accuracy=",accuracy
print "Precision:"
PrintMetricList(precision)
print "Recall:"
PrintMetricList(recall)
print "F1:"
PrintMetricList(f1)
accuracy=Accuracy(confusionWhole, dataSizeInv)
precision=Precision(confusionWhole)
recall=Recall(confusionWhole)
    f1= [ (2*p*r)/(p+r) if (p+r)>0 else .0 for p,r in zip(precision, recall)]
daccuracy.append(accuracy)
dprecision.append(precision)
drecall.append(recall)
df1.append(f1)
print "Run",run,"metrics:"
print "WHOLE DATA"
print "Accuracy=",accuracy
print "Precision:"
PrintMetricList(precision)
print "Recall:"
PrintMetricList(recall)
print "F1:"
PrintMetricList(f1)
print "Final Statistics over test data:"
print "Accuracy: mean=", mean(taccuracy), "variance=", variance(taccuracy)
print "Precision mean:"
PrintMetricList(MeanOfEach(tprecision))
print "Precision variance:"
PrintMetricList(VarianceOfEach(tprecision))
print "Recall mean:"
PrintMetricList(MeanOfEach(trecall))
print "Recall variance:"
PrintMetricList(VarianceOfEach(trecall))
print "F1 mean:"
PrintMetricList(MeanOfEach(tf1))
print "F1 variance:"
PrintMetricList(VarianceOfEach(tf1))
print "Final Statistics over whole data:"
print "Accuracy: mean=", mean(daccuracy), "variance=", variance(daccuracy)
print "Precision mean:"
PrintMetricList(MeanOfEach(dprecision))
print "Precision variance:"
PrintMetricList(VarianceOfEach(dprecision))
print "Recall mean:"
PrintMetricList(MeanOfEach(drecall))
print "Recall variance:"
PrintMetricList(VarianceOfEach(drecall))
print "F1 mean:"
PrintMetricList(MeanOfEach(df1))
print "F1 variance:"
PrintMetricList(VarianceOfEach(df1))