-
Notifications
You must be signed in to change notification settings - Fork 45
/
CrossValidator.cpp
224 lines (190 loc) · 7.47 KB
/
CrossValidator.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#include "CrossValidator.h"
#include <stdlib.h>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;
/**************************************
 * Function: validate
 * -------------------------
 * Runs cross-validation over pre-shuffled partitions: each partition is used
 * once as the test set while every other partition is used for training.
 * Progress and the averaged results are printed to stdout.
 *
 * shuffled:    array of `partitions` vectors of FeatureVectors. This function
 *              takes ownership of the array and releases it with delete[]
 *              before returning, so the caller must not use or free it
 *              afterwards.
 * partitions:  number of entries in `shuffled`
 * classifiers: forwarded unchanged to AdaBooster::getStrongClassifier
 *              (presumably the number of weak classifiers -- confirm against
 *              AdaBooster)
 *
 * Returns: the sum of the per-partition ErrorStructs divided by `partitions`.
 *          When `partitions` is 0 no fold is run and the value-initialized
 *          accumulator is returned undivided.
 */
ErrorStruct CrossValidator::validate(vector<FeatureVector> *shuffled, unsigned int partitions,
        unsigned int classifiers){
    AdaBooster ada;
    StrongClassifier strong; // strong classifier trained for the current fold
    // value-initialized so the accumulator does not start from indeterminate
    // values if ErrorStruct has no user-provided constructor
    ErrorStruct strong_error = ErrorStruct();

    for (unsigned int test=0; test < partitions; test++){
        // build the training set from every partition except the test one
        TrainingData train_data;
        int training_count = 0; // counts training partitions added so far
        for (unsigned int train=0; train < partitions; train++){
            // make sure we're not training on the testing partition
            if (test == train)
                continue;

            // give informative output
            training_count++;
            printf("Creating Training Set %d\r", training_count);
            fflush(stdout);

            // feed the partition straight into the training data
            for (unsigned int i=0; i<shuffled[train].size(); i++)
                train_data.addFeature(shuffled[train][i]);
        }

        printf("\rCreating Strong Classifier");
        fflush(stdout);
        strong = ada.getStrongClassifier(train_data,classifiers);

        // the held-out partition becomes the test set
        TrainingData test_data;
        for (unsigned int i=0; i<shuffled[test].size(); i++)
            test_data.addFeature(shuffled[test][i]);

        // %u: test is unsigned
        printf("\rTesting set %u completed \n", test+1);
        strong_error = strong_error + (strong.errorForFeatures(test_data));
    }

    // average over the folds; skip when there were none to avoid dividing by 0
    if (partitions > 0)
        strong_error = strong_error / partitions;

    // clean up: this function owns the partition array
    delete [] shuffled;

    std::cout << strong_error.true_pos << " true positives" << std::endl;
    std::cout << strong_error.false_pos << " false positives" << std::endl;
    std::cout << strong_error.true_neg << " true negatives" << std::endl;
    std::cout << strong_error.false_neg << " false negatives" << std::endl;
    std::cout << strong_error.error * 100 << "% error" << std::endl;
    std::cout << std::endl;

    return strong_error;
}
/*****************************
 * Function: shuffleTrainingData
 * "Shuffles" training data into partitions: features are removed from a
 * private copy of `td` at random positions and dealt round-robin into the
 * partitions, so partition sizes differ by at most one.
 *
 * td: training data to extract features from (not modified; a copy is
 *     consumed instead)
 * partitions: # of partitions to shuffle into
 *
 * Returns: array (allocated with new[]) of `partitions` vectors of
 *          FeatureVectors. The caller owns it and must release it with
 *          delete[] (validate() does so for the array it is given). When
 *          `partitions` is 0 an empty, still delete[]-able array is returned.
 *
 * Note: the C random generator is re-seeded from time(NULL) on every call, so
 * two calls within the same second produce the same shuffle.
 */
vector<FeatureVector> *CrossValidator::shuffleTrainingData(const TrainingData &td, unsigned int partitions){
    vector<FeatureVector> *shuffled = new vector<FeatureVector>[partitions];

    // With no partitions there is nowhere to deal features to, and
    // `count % partitions` below would divide by zero.
    if (partitions == 0)
        return shuffled;

    TrainingData copy = td;
    srand(time(NULL));

    unsigned int count = 0; // number of features dealt so far
    while (copy.size() > 0) {
        // pick a random remaining feature and move it to the next partition
        int randomElementLocation = rand() % copy.size();
        shuffled[count % partitions].push_back(copy.removeFeatureAt(randomElementLocation));
        count++;
    }
    return shuffled;
}
/***********************************
 * Function: get_strong_err_avg
 * ----------------------------
 * Given a vector of strong error tables (one per partition), combine them
 * into a single table.
 *
 * strong: strong[j][k][m] is column m of row k of partition j's table. The
 *         table shape is taken from strong[0]; rows or columns of later
 *         partitions that fall outside that shape are ignored.
 *
 * Column layout used here and by the sibling helpers: [0] error,
 * [1] precision, [2] recall, [3] true pos, [4] true neg, [5] false pos,
 * [6] false neg. Every column except the last four is divided by the number
 * of partitions; the last four (the counts) are left as sums.
 *
 * If precision or recall in the last row comes out NaN, both are recomputed
 * for every row from the summed counts (see recompute_prcsn_recall).
 *
 * Returns: a new[]-allocated array of strong[0].size() rows, each a
 *          new[]-allocated array of strong[0][0].size() doubles. The caller
 *          must delete[] every row and then the outer array. If `strong` or
 *          strong[0] is empty, a zero-length array is returned (it still has
 *          to be released with delete[]).
 */
double** CrossValidator::get_strong_err_avg(
        vector< vector< vector<double> > > &strong){
    // nothing to average: do not index into an empty vector
    if (strong.empty() || strong[0].empty())
        return new double*[0];

    const size_t rows = strong[0].size();
    const size_t cols = strong[0][0].size();

    // allocate memory for average, zero-filled so accumulation can use +=
    // and no cell is left uninitialized when a partition is short
    double **avg = new double*[rows];
    for (size_t k=0; k<rows; k++)
        avg[k] = new double[cols]();

    // traverse grids and add values together, never writing outside avg
    for (size_t j=0; j<strong.size(); j++){
        const size_t row_limit = (strong[j].size() < rows) ? strong[j].size() : rows;
        for (size_t k=0; k<row_limit; k++){
            const size_t col_limit = (strong[j][k].size() < cols) ? strong[j][k].size() : cols;
            for (size_t m=0; m<col_limit; m++)
                avg[k][m] += strong[j][k][m];
        }
    }

    // now divide by number of partitions (size of outer-most vector)
    // (but we don't want to average true/false positive/negative
    // values, so don't divide by those). Guard the subtraction: with
    // fewer than four columns it would wrap around.
    const size_t averaged_cols = (cols > 4) ? cols - 4 : 0;
    for (size_t k=0; k<rows; k++){
        for (size_t m=0; m<averaged_cols; m++)
            avg[k][m] /= strong.size();
    }

    // The recompute reads columns 3..6, so only attempt it when they exist.
    if (cols >= 7 &&
            ( isnan(avg[rows-1][1]) ||    // check if precision is nan
              isnan(avg[rows-1][2]) )) {  // check if recall is nan
        // recompute values
        recompute_prcsn_recall(avg, rows);
    }
    return avg;
}
/*****************************************
 * Function: recompute_prcsn_recall
 * ---------------------------------
 * Given avg from get_strong_err_avg and the number of rows in the double
 * array, go through and recompute the precision and recall of every row from
 * that row's counts. get_strong_err_avg calls this only when the averaged
 * precision or recall is NaN: if either is NaN for one of the sets, averaging
 * them all together gives NaN for the whole thing, so the stats are
 * recalculated from the counts instead.
 *
 * avg:  table with at least 7 columns per row; columns 3, 5 and 6 (true pos,
 *       false pos, false neg) are read and truncated to int, columns 1 and 2
 *       (precision, recall) are overwritten. A null pointer is ignored.
 * rows: number of rows in avg
 *
 * Note: this function is void; it modifies the table that avg points to in
 * place.
 *
 * Precision = (true pos / (true pos + false pos) )
 * Recall    = (true pos / (true pos + false neg) )
 *
 * When a denominator is zero the ratio is undefined; it is stored as 0.0 so
 * that this function never writes NaN back into the table.
 */
void CrossValidator::recompute_prcsn_recall(double **avg, unsigned int rows){
    if (avg == NULL)
        return;

    for (unsigned int r=0; r<rows; r++){
        // get the counts needed for the two ratios (true neg is not used)
        const int true_pos  = avg[r][3];
        const int false_pos = avg[r][5];
        const int false_neg = avg[r][6];

        const int predicted_pos = true_pos + false_pos;
        const int actual_pos    = true_pos + false_neg;

        // set precision and recall, falling back to 0.0 instead of 0/0
        avg[r][1] = (predicted_pos != 0) ? (double) true_pos / predicted_pos : 0.0;
        avg[r][2] = (actual_pos != 0)    ? (double) true_pos / actual_pos    : 0.0;
    }
}
/**************************************
 * Function: print_strong_err_avg
 * ------------------------------
 * Prints a tab-separated table of strong error stats to stdout: a title, the
 * column header, one line per row of avg, and the column header repeated
 * underneath.
 *
 * avg: table as returned by get_strong_err_avg(); for each row, columns 0-2
 *      are printed multiplied by 100 (error, prcsn, recall) and columns 3-6
 *      are printed truncated to int (tp, tn, fp, fn)
 * row: number of rows of avg to print
 */
void CrossValidator::print_strong_err_avg(double **avg, unsigned int row){
    // the header and separator are each printed twice, above and below
    const char *column_titles =
        " idx \terror\t prcsn \trecall\t tp \t tn \t fp \t fn\n";
    const char *rule =
        "-----\t------\t------\t------\t----\t----\t----\t----\n";

    fputs(" \r\tStrong Error Average\n", stdout);
    fputs("\t--------------------\n", stdout);
    fputs(column_titles, stdout);
    fputs(rule, stdout);

    unsigned int line = 0;
    while (line < row){
        const double *stats = avg[line];
        printf("[%03d]\t%6.2f\t%6.2f\t%6.2f\t%4d\t%4d\t%4d\t%4d\n", line,
                stats[0]*100, stats[1]*100, stats[2]*100,
                static_cast<int>(stats[3]), static_cast<int>(stats[4]),
                static_cast<int>(stats[5]), static_cast<int>(stats[6]));
        ++line;
    }

    fputs(rule, stdout);
    fputs(column_titles, stdout);
}