-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathensemblePerUserV2.py
120 lines (97 loc) · 3.86 KB
/
ensemblePerUserV2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
__author__ = 'vittorioselo'
def ensemblePerUser(rank):
    """Train a per-user ensemble of classifiers for one star-rating bucket.

    For every user file found in ``trainPerRank/``, fits three candidate
    classifiers (RBF SVM, linear SVM, logistic regression) on that user's
    training data, picks the one with the best validation accuracy
    (first-wins on ties, matching the original if/elif order), and scores
    it on the test split.  Prints the per-user progress index, the mean
    test accuracy over all users, and two error-analysis figures
    (``meanError`` average and total ``setError``).

    Parameters
    ----------
    rank : str
        Name of the star-rating bucket subdirectory (e.g. ``'1-2'``),
        used to locate the label files under ``<split>/<rank>/stars/``.

    Returns
    -------
    None — results are printed, not returned.
    """
    import os
    from os.path import join, isfile

    import numpy
    import pandas
    from sklearn import linear_model, metrics, svm

    # Project-local helpers for the error-analysis printout.
    from errorAnalysis import meanError, setError

    def _load_split(split_dir, user):
        # One split = a feature CSV per user plus a one-column star-label
        # CSV under <split>/<rank>/stars/.  Both are headerless.
        features = numpy.array(pandas.read_csv(join(split_dir, user), header=None))
        label_rows = numpy.array(
            pandas.read_csv(join(split_dir, rank, 'stars', user), header=None))
        labels = [row[0] for row in label_rows]
        return features, labels

    train_dir = 'trainPerRank/'
    list_users = [f for f in os.listdir(train_dir) if isfile(join(train_dir, f))]

    dict_results = {}
    average_error = 0.0
    error_set = 0

    for i, user in enumerate(list_users):
        print(i)  # progress indicator, one line per user

        data_train, train_rank = _load_split('trainPerRank', user)
        data_validation, validation_rank = _load_split('validationPerRank', user)
        data_test, test_rank = _load_split('testPerRank', user)

        # Candidate models, hyperparameters passed at construction time
        # (the original mutated attributes after SVC()/LogisticRegression()
        # was built, which is fragile).  Order matters: on a validation-
        # accuracy tie the earlier candidate wins, exactly like the
        # original acc1 >= acc2 >= acc3 cascade.
        candidates = [
            svm.SVC(decision_function_shape='ovr'),                 # RBF kernel
            svm.SVC(kernel='linear', decision_function_shape='ovr'),
            linear_model.LogisticRegression(
                solver='lbfgs', class_weight='balanced', multi_class='ovr'),
        ]

        best_clf = None
        best_acc = -1.0
        for clf in candidates:
            clf.fit(data_train, train_rank)
            acc = metrics.accuracy_score(
                validation_rank, clf.predict(data_validation))
            if acc > best_acc:  # strict '>' keeps the first of tied models
                best_acc = acc
                best_clf = clf

        prediction = best_clf.predict(data_test)
        dict_results[user] = metrics.accuracy_score(test_rank, prediction)
        average_error += meanError(prediction, test_rank)
        error_set += setError(prediction, test_rank)

    n_users = len(list_users)
    if n_users == 0:
        # Guard against ZeroDivisionError when the data directory is empty.
        print('No user files found in ' + train_dir)
        return

    print(sum(dict_results.values()) / n_users)
    print('=============ERROR ANALYSIS=========')
    print(average_error / n_users)
    print(error_set)
# Evaluate the ensemble once per star-rating bucket, in ascending order.
for star_bucket in ('1-2', '3', '4-5'):
    ensemblePerUser(star_bucket)