// classifier.c (forked from pjreddie/vision-hw4)

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "image.h"
#include "matrix.h"

// Run an activation function on each element in a matrix,
// modifying the matrix in place.
// matrix m: input to the activation function
// ACTIVATION a: function to run
void activate_matrix(matrix m, ACTIVATION a)
{
    int i, j;
    for(i = 0; i < m.rows; ++i){
        double sum = 0;
        for(j = 0; j < m.cols; ++j){
            double x = m.data[i][j];
            if(a == LOGISTIC){
                m.data[i][j] = 1/(1 + exp(-x));
            } else if (a == RELU){
                m.data[i][j] = x < 0 ? 0 : x;
            } else if (a == LRELU){
                m.data[i][j] = x < 0 ? .1*x : x;
            } else if (a == SOFTMAX){
                m.data[i][j] = exp(x);
            }
            sum += m.data[i][j];
        }
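        // For SOFTMAX, divide by the row sum of exp(x) so each row forms a
        // probability distribution (entries are positive and sum to 1).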
        if (a == SOFTMAX) {
            for (j = 0; j < m.cols; ++j) {
                m.data[i][j] = m.data[i][j]/sum;
            }
        }
    }
}

// Calculates the gradient of an activation function and multiplies it into
// the delta for a layer
// matrix m: an activated layer output
// ACTIVATION a: activation function for a layer
// matrix d: delta before activation gradient
void gradient_matrix(matrix m, ACTIVATION a, matrix d)
{
    int i, j;
    for(i = 0; i < m.rows; ++i){
        for(j = 0; j < m.cols; ++j){
            double x = m.data[i][j];
            double y = d.data[i][j];
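            // m holds post-activation outputs, so for LOGISTIC the derivative
            // f'(z) = f(z)*(1 - f(z)) can be written directly as x*(1-x).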
            if (a == LOGISTIC) {
                d.data[i][j] = y * x * (1-x);
            } else if (a == RELU) {
                d.data[i][j] = x > 0 ? y : 0;
            } else if (a == LRELU) {
                d.data[i][j] = x > 0 ? y : .1*y;
            } else {
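                // SOFTMAX: pass the delta through unchanged; its derivative is
                // folded into dL/dy = (y - p) computed in train_model.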
                d.data[i][j] = y;
            }
        }
    }
}

// Forward propagate information through a layer
// layer *l: pointer to the layer
// matrix in: input to layer
// returns: matrix that is output of the layer
matrix forward_layer(layer *l, matrix in)
{
    l->in = in; // Save the input for backpropagation
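    // in is (examples x inputs) and l->w is (inputs x outputs),
    // so out comes out as (examples x outputs).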
    matrix out = matrix_mult_matrix(in, l->w);
    activate_matrix(out, l->activation);
    free_matrix(l->out); // free the old output
    l->out = out;        // Save the current output for gradient calculation
    return out;
}

// Backward propagate derivatives through a layer
// layer *l: pointer to the layer
// matrix delta: partial derivative of loss w.r.t. output of layer
// returns: matrix, partial derivative of loss w.r.t. input to layer
matrix backward_layer(layer *l, matrix delta)
{
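    // Once the activation gradient is applied to delta,
    // dL/dw = in^T * delta and dL/dx = delta * w^T.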
    // 1.4.1
    // delta is dL/dy
    gradient_matrix(l->out, l->activation, delta);

    // 1.4.2
    free_matrix(l->dw);
    matrix in_t = transpose_matrix(l->in);
    matrix dw = matrix_mult_matrix(in_t, delta);
    free_matrix(in_t); // free the temporary transpose
    l->dw = dw;

    // 1.4.3
    // calculate dL/dx and return it
    matrix w_t = transpose_matrix(l->w);
    matrix dx = matrix_mult_matrix(delta, w_t);
    free_matrix(w_t); // free the temporary transpose
    return dx;
}

// Update the weights at layer l
// layer *l: pointer to the layer
// double rate: learning rate
// double momentum: amount of momentum to use
// double decay: value for weight decay
void update_layer(layer *l, double rate, double momentum, double decay)
{
    // Calculate Δw_t = dL/dw_t - λw_t + mΔw_{t-1}
    // save it to l->v
    matrix mat = axpy_matrix(-decay, l->w, l->dw);
    matrix v = axpy_matrix(momentum, l->v, mat);
    free_matrix(l->v);
    free_matrix(mat);
    l->v = v;

    // Update l->w
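    // w_{t+1} = w_t + rate * Δw_t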
    matrix w = axpy_matrix(rate, l->v, l->w);
    free_matrix(l->w);
    l->w = w;
}

// Make a new layer for our model
// int input: number of inputs to the layer
// int output: number of outputs from the layer
// ACTIVATION activation: the activation function to use
layer make_layer(int input, int output, ACTIVATION activation)
{
    layer l;
    l.in = make_matrix(1,1);
    l.out = make_matrix(1,1);
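    // Weights are initialized at scale sqrt(2/input), the He initialization
    // heuristic commonly paired with ReLU-style activations.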
    l.w = random_matrix(input, output, sqrt(2./input));
    l.v = make_matrix(input, output);
    l.dw = make_matrix(input, output);
    l.activation = activation;
    return l;
}

// Run a model on input X
// model m: model to run
// matrix X: input to model
// returns: result matrix
matrix forward_model(model m, matrix X)
{
    int i;
    for(i = 0; i < m.n; ++i){
        X = forward_layer(m.layers + i, X);
    }
    return X;
}

// Run a model backward given gradient dL
// model m: model to run
// matrix dL: partial derivative of loss w.r.t. model output dL/dy
void backward_model(model m, matrix dL)
{
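    // Work on a copy so the caller's dL is neither modified nor freed here;
    // each intermediate delta is freed as we move backward through the layers.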
    matrix d = copy_matrix(dL);
    int i;
    for(i = m.n-1; i >= 0; --i){
        matrix prev = backward_layer(m.layers + i, d);
        free_matrix(d);
        d = prev;
    }
    free_matrix(d);
}

// Update the model weights
// model m: model to update
// double rate: learning rate
// double momentum: amount of momentum to use
// double decay: value for weight decay
void update_model(model m, double rate, double momentum, double decay)
{
    int i;
    for(i = 0; i < m.n; ++i){
        update_layer(m.layers + i, rate, momentum, decay);
    }
}

// Find the index of the maximum element in an array
// double *a: array
// int n: size of a, |a|
// returns: index of maximum element
int max_index(double *a, int n)
{
    if(n <= 0) return -1;
    int i;
    int max_i = 0;
    double max = a[0];
    for (i = 1; i < n; ++i) {
        if (a[i] > max){
            max = a[i];
            max_i = i;
        }
    }
    return max_i;
}

// Calculate the accuracy of a model on some data d
// model m: model to run
// data d: data to run on
// returns: accuracy, number correct / total
double accuracy_model(model m, data d)
{
    matrix p = forward_model(m, d.X);
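    // p is the final layer's saved output (its l->out), so it is not freed here;
    // it will be freed on that layer's next forward pass.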
    int i;
    int correct = 0;
    for(i = 0; i < d.y.rows; ++i){
        if(max_index(d.y.data[i], d.y.cols) == max_index(p.data[i], p.cols)) ++correct;
    }
    return (double)correct / d.y.rows;
}

// Calculate the cross-entropy loss for a set of predictions
// matrix y: the correct values
// matrix p: the predictions
// returns: average cross-entropy loss over data points, 1/n Σ(-y log(p))
double cross_entropy_loss(matrix y, matrix p)
{
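    // Assumes p holds softmax outputs (strictly positive entries) and y holds
    // one-hot labels, so only the true class contributes to each row's loss.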
    int i, j;
    double sum = 0;
    for(i = 0; i < y.rows; ++i){
        for(j = 0; j < y.cols; ++j){
            sum += -y.data[i][j]*log(p.data[i][j]);
        }
    }
    return sum/y.rows;
}

// Train a model on a dataset using SGD
// model m: model to train
// data d: dataset to train on
// int batch: batch size for SGD
// int iters: number of iterations of SGD to run (i.e. how many batches)
// double rate: learning rate
// double momentum: momentum
// double decay: weight decay
void train_model(model m, data d, int batch, int iters, double rate, double momentum, double decay)
{
    int e;
    for(e = 0; e < iters; ++e){
        data b = random_batch(d, batch);
        matrix p = forward_model(m, b.X);
        fprintf(stderr, "%06d: Loss: %f\n", e, cross_entropy_loss(b.y, p));
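        // For softmax outputs with cross-entropy loss, the gradient w.r.t. the
        // pre-softmax layer is (p - y); using (y - p) as dL and adding rate*Δw in
        // update_layer therefore performs gradient descent.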
        matrix dL = axpy_matrix(-1, p, b.y); // partial derivative of loss dL/dy
        backward_model(m, dL);
        update_model(m, rate/batch, momentum, decay);
        free_matrix(dL);
        free_data(b);
    }
}
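
// Example usage (a sketch, not part of the assignment code): assuming a `data`
// struct `d` has already been loaded elsewhere, and using the `layers`/`n`
// fields of `model` as above, a small two-layer softmax classifier for 28x28
// inputs might be assembled and trained like this:
//
//     model m;
//     m.n = 2;
//     m.layers = calloc(m.n, sizeof(layer));
//     m.layers[0] = make_layer(784, 32, RELU);
//     m.layers[1] = make_layer(32, 10, SOFTMAX);
//     train_model(m, d, 128, 1000, .01, .9, .0005);
//     fprintf(stderr, "train accuracy: %f\n", accuracy_model(m, d));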