"""
Ensemble-Framework: A collection of functions that implement Ensemble Learning techniques such as Voting and Boosting.
"""
import numpy as np
import itertools
from collections import defaultdict
import time
import random
import HMM_Framework
random_state = 22
random.seed(22)
### Boosting Ensemble - operates during training ###
def adaboost():
    raise NotImplementedError("TODO: the Boosting ensemble has not been implemented yet.")
### ###
### Voting Ensembles - operate at the end of the entire process ###
def ensemble_run(cross_val_prediction_matrix, mapping, golden_truth, mode, weights=None, use_log_prob=True, detailed=False):
    """
    After training multiple models using the HMM framework, we can append the following objects to lists:
        hmm.cross_val_prediction_matrix
        hmm.ensemble_stored["Mapping"]
        hmm.ensemble_stored["Curr_Cross_Val_Golden_Truth"]
    indexed as list[current_model][current_cross_val_fold].
    Given those objects as parameters, calculates a voting ensemble of all the models.
    mode:
        sum:     Weighted Voting Ensemble on probability matrices with a sum operation, also known as Soft Voting in the literature.
                 Note that when use_log_prob=True the matrices remain in log space, where a sum of log-probabilities corresponds to a product of probabilities.
        product: Weighted Voting Ensemble on probability matrices with a product operation, also known as Soft Voting in the literature.
        borda:   Borda count, a single-winner ranking method.
    """
    model_count = len(cross_val_prediction_matrix)
    # Perform some input validation checks
    if model_count < 2:
        print("\n--Warning: You have given fewer than two models, the Ensemble process is pointless.")
    if weights is not None:
        if len(weights) != model_count:
            raise ValueError("you must give as many weights as there are models or simply 'None'.")
    else:
        weights = [1.0/model_count] * model_count  # Default to uniform weights
    try:
        if not isinstance(cross_val_prediction_matrix[0][0], np.ndarray):
            raise ValueError("the format of the prediction matrix seems to be wrong, are you certain there were multiple cross-validation folds?")
        else:
            cross_val_folds = len(cross_val_prediction_matrix[0])
    except TypeError:
        raise ValueError("the format of the prediction matrix seems to be wrong, are you certain there were multiple cross-validation folds?")
    for curr_fold in range(cross_val_folds):
        for model in range(model_count-1):  # Compare each model to its neighbor
            if not np.array_equal(golden_truth[model][curr_fold], golden_truth[model+1][curr_fold]):  # or we could have used Python's 'all()' function
                raise ValueError("the golden truth labels across models and across cross-validation folds are not identical.")
            if mapping[model][curr_fold] != mapping[model+1][curr_fold]:
                raise ValueError("the mapping across models and across cross-validation folds is not identical.")
    golden_truth = golden_truth[0]  # Everything is OK, we only need to keep the golden truth and mapping of any one model
    mapping = mapping[0]
    #
    if not use_log_prob:
        for curr_fold in range(cross_val_folds):
            for model in range(model_count):  # Convert every model's matrix out of log space
                cross_val_prediction_matrix[model][curr_fold] = np.exp(cross_val_prediction_matrix[model][curr_fold])
    # Create an HMM object just to use the 'result_metrics' function
    dummy_object = HMM_Framework.HMM_Framework()
    dummy_object.selected_model = "Ensemble of HMMs"
    dummy_object.k_fold = cross_val_folds
    # Run the Ensemble
    for curr_fold in range(cross_val_folds):
        time_counter = time.time()
        if mode != "borda":
            for model in range(model_count):
                if model == 0:
                    ensemble_matrix = weights[0]*cross_val_prediction_matrix[0][curr_fold]
                else:
                    if mode == "sum":
                        ensemble_matrix += weights[model]*cross_val_prediction_matrix[model][curr_fold]
                    elif mode == "product":
                        ensemble_matrix *= weights[model]*cross_val_prediction_matrix[model][curr_fold]
            prediction = []
            label_count = len(mapping[curr_fold])  # Number of possible labels in this fold
            # Old way:
            #indices = np.argmax(ensemble_matrix, axis=1)
            for instance in range(ensemble_matrix.shape[0]):
                curr_instance = ensemble_matrix[instance, :]
                # Comparison
                if len(set(curr_instance)) == 1:  # If all scores are tied, argmax would always pick index 0, so choose a label uniformly at random instead
                    index = random.randint(0, label_count - 1)
                else:
                    index = np.argmax(curr_instance)
                prediction.append(mapping[curr_fold][index])
        else:
            prediction = borda_count(curr_fold, model_count, cross_val_prediction_matrix, mapping)
        dummy_object.result_metrics(golden_truth[curr_fold], prediction, time_counter)
    dummy_object.print_average_results(decimals=3)
    dummy_object.print_best_results(detailed=detailed, decimals=3)
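
# A minimal usage sketch (hypothetical; 'hmm_a', 'hmm_b' and 'hmm_c' stand for
# HMM_Framework.HMM_Framework instances that have already been trained with
# cross-validation, as the docstring above describes):
#
#   prediction_matrices, mappings, golden_truths = [], [], []
#   for hmm in [hmm_a, hmm_b, hmm_c]:
#       prediction_matrices.append(hmm.cross_val_prediction_matrix)
#       mappings.append(hmm.ensemble_stored["Mapping"])
#       golden_truths.append(hmm.ensemble_stored["Curr_Cross_Val_Golden_Truth"])
#   ensemble_run(prediction_matrices, mappings, golden_truths, mode="sum",
#                weights=[0.5, 0.3, 0.2], use_log_prob=True)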
def borda_count(curr_fold, model_count, cross_val_prediction_matrix, mapping):
    """
    Implementation of Borda count, a single-winner ranking method.
    """
    instance_count = cross_val_prediction_matrix[0][curr_fold].shape[0]
    prediction = []
    for instance in range(instance_count):
        ballots = []
        for model in range(model_count):
            scores = cross_val_prediction_matrix[model][curr_fold][instance, :]
            sorted_indices = list(np.argsort(scores))
            sorted_indices.reverse()  # Reverse, so that the highest-scoring label comes first
            temp = ''
            for j in range(len(sorted_indices)):  # j is the current rank and sorted_indices[j] is the current label
                if j > 0:
                    if scores[sorted_indices[j-1]] == scores[sorted_indices[j]]:  # Labels with tied scores share a rank
                        temp += '='
                    else:
                        temp += '>'
                temp += mapping[curr_fold][sorted_indices[j]]
            ballots.append(temp)
        # e.g. ballots = ['A>B>C>D>E',
        #                 'A>B>C>D=E',
        #                 'A>B=C>D>E',
        #                 'B>A>C>D',
        #                 ]
        tally = _borda_count_tally(ballots)
        prediction.append(max(tally, key=tally.get))  # The maximum value is not always in the leftmost position of the Dict so we need to find it
    return prediction
def _borda_count_main(ballot):
    # Note: this assumes every candidate label is a single alphabetic character, as in the example ballots above
    n = len([c for c in ballot if c.isalpha()]) - 1
    score = itertools.count(n, step=-1)
    result = {}
    for group in [item.split('=') for item in ballot.split('>')]:
        s = sum(next(score) for _ in group) / float(len(group))  # Tied candidates split the points of the ranks they occupy
        for pref in group:
            result[pref] = s
    return result
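
# Worked example for _borda_count_main: the ballot 'A>B=C' has 3 candidates, so the
# ranks are worth 2, 1 and 0 points from the top. 'A' alone takes 2.0, while 'B' and
# 'C' are tied and split the next two ranks, (1+0)/2 = 0.5 each:
#
#   _borda_count_main('A>B=C')  # -> {'A': 2.0, 'B': 0.5, 'C': 0.5}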
def _borda_count_tally(ballots):
    result = defaultdict(int)
    for ballot in ballots:
        for pref, score in _borda_count_main(ballot).items():
            result[pref] += score
    result = dict(result)
    return result
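
# Worked example for _borda_count_tally: three ballots over candidates A, B and C,
# each awarding 2/1/0 points. A collects 2+1+2 = 5, B collects 1+2+0 = 3 and
# C collects 0+0+1 = 1, so A would win the Borda count:
#
#   _borda_count_tally(['A>B>C', 'B>A>C', 'A>C>B'])  # -> {'A': 5.0, 'B': 3.0, 'C': 1.0}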
### ###