<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="author" content="Cathy Yeh" />
<meta property="og:type" content="article" />
<meta name="twitter:card" content="summary">
<meta name="keywords" content=", Theory, " />
<meta property="og:title" content="Backpropagation in neural networks "/>
<meta property="og:url" content="./backpropagation-in-neural-networks" />
<meta property="og:description" content="Overview We give a short introduction to neural networks and the backpropagation algorithm for training neural networks. Our overview is brief because we assume familiarity with partial derivatives, the chain rule, and matrix multiplication. We also hope this post will be a quick reference for those already familiar with the …" />
<meta property="og:site_name" content="EFAVDB" />
<meta property="og:article:author" content="Cathy Yeh" />
<meta property="og:article:published_time" content="2019-07-18T23:35:00-07:00" />
<meta name="twitter:title" content="Backpropagation in neural networks ">
<meta name="twitter:description" content="Overview We give a short introduction to neural networks and the backpropagation algorithm for training neural networks. Our overview is brief because we assume familiarity with partial derivatives, the chain rule, and matrix multiplication. We also hope this post will be a quick reference for those already familiar with the …">
<title>Backpropagation in neural networks · EFAVDB
</title>
<link href="//netdna.bootstrapcdn.com/twitter-bootstrap/2.3.2/css/bootstrap-combined.min.css" rel="stylesheet">
<link href="https://fonts.googleapis.com/css?family=Fira+Sans:300,400,700" rel="stylesheet" type='text/css' />
<link href="https://fonts.googleapis.com/css?family=Cardo:400,700" rel="stylesheet" type='text/css' />
<link rel="stylesheet" type="text/css" href="./theme/css/elegant.prod.css" media="screen">
<link rel="stylesheet" type="text/css" href="./theme/css/custom.css" media="screen">
<link rel="apple-touch-icon" sizes="180x180" href="./images/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="./images/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="./images/favicon-16x16.png">
<link rel="manifest" href="./images/site.webmanifest">
<link rel="mask-icon" href="./images/safari-pinned-tab.svg" color="#5bbad5">
<meta name="msapplication-TileColor" content="#da532c">
<meta name="theme-color" content="#ffffff">
</head>
<body>
<div id="content">
<div class="navbar navbar-static-top">
<div class="navbar-inner">
<div class="container-fluid">
<a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</a>
<a class="brand" href="./"><span class=site-name>EFAVDB</span></a>
<div class="nav-collapse collapse">
<ul class="nav pull-right top-menu">
<li >
<a href=".">Home</a>
</li>
<li ><a href="./pages/authors.html">Authors</a></li>
<li ><a href="./categories.html">Categories</a></li>
<li ><a href="./tags.html">Tags</a></li>
<li ><a href="./archives.html">Archives</a></li>
<li><form class="navbar-search" action="./search.html" onsubmit="return validateForm(this.elements['q'].value);"> <input type="text" class="search-query" placeholder="Search" name="q" id="tipue_search_input"></form></li>
</ul>
</div>
</div>
</div>
</div>
<div class="container-fluid">
<div class="row-fluid">
<div class="span1"></div>
<div class="span10">
<article itemscope>
<div class="row-fluid">
<header class="page-header span8 offset2">
<h1>
<a href="./backpropagation-in-neural-networks">
Backpropagation in neural networks
</a>
</h1>
</header>
<div class="span2"></div>
</div>
<div class="row-fluid">
<div class="span8 offset2 article-content">
<h2>Overview</h2>
<p>We give a short introduction to neural networks and the backpropagation algorithm for training neural networks. Our overview is brief because we assume familiarity with partial derivatives, the chain rule, and matrix multiplication.</p>
<p>We also hope this post will be a quick reference for those already familiar with the notation used by Andrew Ng in his course on <a href="https://www.coursera.org/learn/neural-networks-deep-learning/">“Neural Networks and Deep Learning”</a>, the first in the deeplearning.ai series on Coursera. That course provides but doesn’t derive the vectorized form of the backpropagation equations, so we hope to fill in that small gap while using the same notation.</p>
<h2>Introduction: neural networks</h2>
<h3>A single neuron acting on a single training example</h3>
<p><img alt="single neuron" src="./wp-content/uploads/2019/07/single_neuron-e1563431237482.png"></p>
<p>The basic building block of a neural network is the composition of a nonlinear function (like a <a href="https://en.wikipedia.org/wiki/Sigmoid_function">sigmoid</a>, <a href="http://mathworld.wolfram.com/HyperbolicTangent.html">tanh</a>, or <a href="https://en.wikipedia.org/wiki/Rectifier_(neural_networks)">ReLU</a>) <span class="math">\(g(z)\)</span></p>
<div class="math">\begin{eqnarray} \nonumber
a^{[l]} = g(z^{[l]})
\end{eqnarray}</div>
<p>with a linear function acting on a (multidimensional) input, <span class="math">\(a^{[l-1]}\)</span>.
</p>
<div class="math">\begin{eqnarray} \nonumber
z^{[l]} = w^{[l]T} a^{[l-1]} + b^{[l]}
\end{eqnarray}</div>
<p>These building blocks, i.e. “nodes” or “neurons” of the neural network, are arranged in layers, with the layer denoted by superscript square brackets, e.g. <span class="math">\([l]\)</span> for the <span class="math">\(l\)</span>th layer. <span class="math">\(n_l\)</span> denotes the number of neurons in layer <span class="math">\(l\)</span>.</p>
<h3>Forward propagation</h3>
<p>Forward propagation is the computation of the multiple linear and nonlinear transformations of the neural network on the input data. We can rewrite the above equations in vectorized form to handle multiple training examples and neurons per layer as</p>
<div class="math">\begin{eqnarray} \tag{1} \label{1}
A^{[l]} = g(Z^{[l]})
\end{eqnarray}</div>
<p>with a linear function acting on a (multidimensional) input, <span class="math">\(A^{[l-1]}\)</span>.
</p>
<div class="math">\begin{eqnarray} \tag{2} \label{2}
Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}
\end{eqnarray}</div>
<p>The outputs or activations, <span class="math">\(A^{[l-1]}\)</span>, of the previous layer serve as inputs to the linear functions, <span class="math">\(Z^{[l]}\)</span>. If <span class="math">\(n_l\)</span> denotes the number of neurons in layer <span class="math">\(l\)</span>, and <span class="math">\(m\)</span> denotes the number of training examples in one (mini)batch pass through the neural network, then the dimensions of these matrices are:</p>
<table>
<thead>
<tr>
<th>Variable</th>
<th>Dimensions</th>
</tr>
</thead>
<tbody>
<tr>
<td><span class="math">\(A^{[l]}\)</span></td>
<td>(<span class="math">\(n_l\)</span>, <span class="math">\(m\)</span>)</td>
</tr>
<tr>
<td><span class="math">\(Z^{[l]}\)</span></td>
<td>(<span class="math">\(n_l\)</span>, <span class="math">\(m\)</span>)</td>
</tr>
<tr>
<td><span class="math">\(W^{[l]}\)</span></td>
<td>(<span class="math">\(n_l\)</span>, <span class="math">\(n_{l-1}\)</span>)</td>
</tr>
<tr>
<td><span class="math">\(b^{[l]}\)</span></td>
<td>(<span class="math">\(n_l\)</span>, 1)</td>
</tr>
</tbody>
</table>
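<p>For concreteness, equations (\ref{1}) and (\ref{2}) and the dimension table translate into a few lines of numpy. This is a minimal sketch of one layer's forward pass; the layer sizes and the choice of sigmoid are ours, purely for illustration:</p>
<pre><code class="language-python">import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

m, n_prev, n_l = 5, 2, 3                 # examples; neurons in layers l-1 and l
A_prev = np.random.randn(n_prev, m)      # activations of layer l-1: (n_{l-1}, m)
W = 0.01 * np.random.randn(n_l, n_prev)  # weights: (n_l, n_{l-1})
b = np.zeros((n_l, 1))                   # bias: (n_l, 1), broadcast over the batch

Z = W @ A_prev + b                       # equation (2): shape (n_l, m)
A = sigmoid(Z)                           # equation (1): shape (n_l, m)
assert Z.shape == A.shape == (n_l, m)
</code></pre>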
<p>For example, this neural network consists of only a single hidden layer with 3 neurons in layer 1.</p>
<p><img alt="neural network" src="./wp-content/uploads/2019/07/2layer_nn-e1563432145388.png"></p>
<p>The matrix <span class="math">\(W^{[1]}\)</span> has dimensions (3, 2) because there are 3 neurons in layer 1 and 2 inputs from the previous layer (in this example, the inputs are the raw data, <span class="math">\(\vec{x} = (x_1, x_2)\)</span>). Each row of <span class="math">\(W^{[1]}\)</span> corresponds to a vector of weights for a neuron in layer 1.</p>
<p><img alt="weights matrix" src="./wp-content/uploads/2016/06/weights_matrix-e1563432287786.png"></p>
<p>The final output of the neural network is a prediction in the last layer <span class="math">\(L\)</span>, and the closeness of the prediction <span class="math">\(A^{[L](i)}\)</span> to the true label <span class="math">\(y^{(i)}\)</span> for training example <span class="math">\(i\)</span> is quantified by a loss function <span class="math">\(\mathcal{L}(y^{(i)}, A^{[L](i)})\)</span>, where superscript <span class="math">\((i)\)</span> denotes the <span class="math">\(i\)</span>th training example. For classification, the typical choice for <span class="math">\(\mathcal{L}\)</span> is the <a href="https://en.wikipedia.org/wiki/Cross_entropy">cross-entropy loss</a> (log loss).</p>
<p>The cost <span class="math">\(J\)</span> is the average loss over all <span class="math">\(m\)</span> training examples in the dataset.</p>
<div class="math">\begin{eqnarray} \tag{3} \label{3}
J = \frac{1}{m} \sum_{i=1}^m \mathcal{L}(y^{(i)}, A^{[L](i)})
\end{eqnarray}</div>
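<p>A minimal numpy sketch of (\ref{3}), assuming binary labels and sigmoid outputs in the final layer (our choice, for concreteness):</p>
<pre><code class="language-python">import numpy as np

def cost(A_L, Y):
    """Cross-entropy cost, equation (3), for binary labels.

    A_L: final-layer activations, shape (1, m); Y: labels in {0, 1}, shape (1, m).
    """
    m = Y.shape[1]
    losses = -(Y * np.log(A_L) + (1 - Y) * np.log(1 - A_L))
    return losses.sum() / m
</code></pre>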
<h3>Minimizing the cost with gradient descent</h3>
<p>The task of training a neural network is to find the set of parameters <span class="math">\(W\)</span> and <span class="math">\(b\)</span> (with different <span class="math">\(W\)</span> and <span class="math">\(b\)</span> for different nodes in the network) that will give us the best predictions, i.e. minimize the cost (\ref{3}).</p>
<p>Gradient descent is the workhorse that we employ for this optimization problem. We randomly initialize the parameters <span class="math">\(W\)</span> and <span class="math">\(b\)</span> for each node, then iteratively update the parameters by moving them in the direction that is opposite to the gradient of the cost.</p>
<div class="math">\begin{eqnarray} \nonumber
W_\text{new} &=& W_\text{previous} - \alpha \frac{\partial J}{\partial W} \\
b_\text{new} &=& b_\text{previous} - \alpha \frac{\partial J}{\partial b}
\end{eqnarray}</div>
<p>
<span class="math">\(\alpha\)</span> is the learning rate, a hyperparameter that needs to be tuned during the training process. The gradient of the cost is calculated by the backpropagation algorithm.</p>
<h2>Backpropagation equations</h2>
<p>These are the vectorized backpropagation (<span class="caps">BP</span>) equations which we wish to derive:</p>
<div class="math">\begin{eqnarray} \nonumber
dW^{[l]} &\equiv& \frac{\partial J}{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]}A^{[l-1]T} \tag{BP1} \label{BP1} \\
db^{[l]} &\equiv& \frac{\partial J}{\partial b^{[l]}} = \frac{1}{m} \sum_{i=1}^m dZ^{[l](i)} \tag{BP2} \label{BP2} \\
dA^{[l-1]} &\equiv& \frac{\partial \mathcal{L}}{\partial A^{[l-1]}} = W^{[l]T}dZ^{[l]} \tag{BP3} \label{BP3} \\
dZ^{[l]} &\equiv& \frac{\partial \mathcal{L}}{\partial Z^{[l]}} = dA^{[l]} * g'(Z^{[l]}) \tag{BP4} \label{BP4}
\end{eqnarray}</div>
<p>
The <span class="math">\(*\)</span> in the last line denotes element-wise multiplication.</p>
<p><span class="math">\(W\)</span> and <span class="math">\(b\)</span> are the parameters we want to learn (update), but the <span class="caps">BP</span> equations include two additional expressions for the partial derivative of the loss in terms of linear and nonlinear activations per training example since they are intermediate terms that appear in the calculation of <span class="math">\(dW\)</span> and <span class="math">\(db\)</span>.</p>
<h3>Chain rule</h3>
<p>We’ll need the chain rule for <a href="https://en.wikipedia.org/wiki/Total_derivative">total derivatives</a>, which describes how the change in a function <span class="math">\(f\)</span> with respect to a variable <span class="math">\(x\)</span> can be calculated as a sum over the contributions from intermediate functions <span class="math">\(u_i\)</span> that depend on <span class="math">\(x\)</span>:</p>
<div class="math">\begin{eqnarray} \nonumber
\frac{\partial f(u_1, u_2, ..., u_k)}{\partial x} = \sum_{i}^k \frac{\partial f}{\partial u_i} \frac{\partial u_i}{\partial x}
\end{eqnarray}</div>
<p>
where the <span class="math">\(u_i\)</span> are functions of <span class="math">\(x\)</span>. This expression reduces to the single variable chain rule when only one <span class="math">\(u_i\)</span> is a function of <span class="math">\(x\)</span>.</p>
<p>The gradients for every node can be calculated in a single backward pass through the network, starting with the last layer and working backwards, towards the input layer. As we work backwards, we cache the values of <span class="math">\(dZ\)</span> and <span class="math">\(dA\)</span> from previous calculations, which are then used to compute the derivative for variables that are further upstream in the computation graph. The dependency of the derivatives of upstream variables on downstream variables, i.e. cached derivatives, is manifested in the <span class="math">\(\frac{\partial f}{\partial u_i}\)</span> term in the chain rule. (Backpropagation is a dynamic programming algorithm!)</p>
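<p>The dynamic programming structure is easiest to see in the loop over layers. A sketch, reusing <code>backward_one_layer</code> from above; the cache bookkeeping is our own invention:</p>
<pre><code class="language-python">def backward_pass(dA_L, caches, g_primes):
    """Single backward sweep over layers L, L-1, ..., 1.

    caches[l-1] holds layer l's forward-pass values: {"Z", "A_prev", "W"}.
    dA is overwritten layer by layer, so each layer reuses the downstream
    work instead of re-deriving it (the dynamic programming step).
    """
    grads = {}
    dA = dA_L                                # dL/dA^[L], from the loss
    for l in range(len(caches), 0, -1):
        c = caches[l - 1]
        dW, db, dA = backward_one_layer(dA, c["Z"], c["A_prev"], c["W"],
                                        g_primes[l - 1])
        grads[l] = {"dW": dW, "db": db}
    return grads
</code></pre>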
<h3>The chain rule applied to backpropagation</h3>
<p>In this section, we apply the chain rule to derive the vectorized form of equations <span class="caps">BP</span>(1-4). Without loss of generality, we’ll index an element of the matrix or vector on the left hand side of <span class="caps">BP</span>(1-4); the notation for applying the chain rule is therefore straightforward because the derivatives are just with respect to scalars.</p>
<p><strong><span class="caps">BP1</span></strong>
The partial derivative of the cost with respect to the <span class="math">\(s\)</span>th component (corresponding to the <span class="math">\(s\)</span>th input) of <span class="math">\(\vec{w}\)</span> in the <span class="math">\(r\)</span>th node in layer <span class="math">\(l\)</span> is:</p>
<div class="math">\begin{eqnarray}
dW^{[l]}_{rs} &\equiv& \frac{\partial J}{\partial W^{[l]}_{rs}} \\
&=& \frac{1}{m} \sum_{i}^m \frac{\partial \mathcal{L}}{\partial W^{[l]}_{rs}} \\
&=& \frac{1}{m} \sum_{i}^m \frac{\partial \mathcal{L}}{\partial z^{[l]}_{ri}} \frac{\partial z^{[l]}_{ri}}{\partial W^{[l]}_{rs}} \tag{4} \label{4}
\end{eqnarray}</div>
<p>
The last line is due to the chain rule.</p>
<p>The first term in (\ref{4}) is <span class="math">\(dZ^{[l]}_{ri}\)</span> by definition (\ref{<span class="caps">BP4</span>}). We can simplify the second term of (\ref{4}) using the definition of the linear function (\ref{2}), which we rewrite below explicitly for the <span class="math">\(i\)</span>th training example in the <span class="math">\(r\)</span>th node of the <span class="math">\(l\)</span>th layer, to make it easier to keep track of indices when we take derivatives of the linear function:
</p>
<div class="math">\begin{eqnarray} \tag{5} \label{5}
Z^{[l]}_{ri} = \sum_j^{n_{l-1}} W^{[l]}_{rj} A^{[l-1]}_{ji} + b^{[l]}_r
\end{eqnarray}</div>
<p>
where <span class="math">\(n_{l-1}\)</span> denotes the number of nodes in layer <span class="math">\(l-1\)</span>.</p>
<p>Therefore,
</p>
<div class="math">\begin{eqnarray}
dW^{[l]}_{rs} &=& \frac{1}{m} \sum_{i}^m dZ^{[l]}_{ri} A^{[l-1]}_{si} \\
&=& \frac{1}{m} \sum_{i}^m dZ^{[l]}_{ri} A^{[l-1]T}_{is} \\
&=& \frac{1}{m} \left( dZ^{[l]} A^{[l-1]T} \right)_{rs}
\end{eqnarray}</div>
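<p>A quick way to gain confidence in the indices of <span class="caps">BP1</span> is a finite-difference check on a tiny layer. In this sketch we use an identity activation and a squared-error loss (our choices, purely to keep the check short), so that <span class="math">\(dZ = Z - Y\)</span>:</p>
<pre><code class="language-python">import numpy as np

np.random.seed(0)
m, n_prev, n_l = 4, 3, 2
A_prev = np.random.randn(n_prev, m)
W = np.random.randn(n_l, n_prev)
b = np.random.randn(n_l, 1)
Y = np.random.randn(n_l, m)

# Identity activation, per-example loss 0.5*||Z - Y||^2, so dZ = Z - Y.
def cost_fn(W):
    Z = W @ A_prev + b
    return 0.5 * ((Z - Y) ** 2).sum() / m

dZ = (W @ A_prev + b) - Y
dW_analytic = dZ @ A_prev.T / m              # BP1

eps, r, s = 1e-6, 0, 1                       # perturb a single weight W[r, s]
W_plus, W_minus = W.copy(), W.copy()
W_plus[r, s] += eps
W_minus[r, s] -= eps
dW_numeric = (cost_fn(W_plus) - cost_fn(W_minus)) / (2 * eps)
print(dW_analytic[r, s], dW_numeric)         # the two should agree closely
</code></pre>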
<p><strong><span class="caps">BP2</span></strong>
The partial derivative of the cost with respect to <span class="math">\(b\)</span> in the <span class="math">\(r\)</span>th node in layer <span class="math">\(l\)</span> is:</p>
<div class="math">\begin{eqnarray}
db^{[l]}_r &\equiv& \frac{\partial J}{\partial b^{[l]}_r} \\
&=& \frac{1}{m} \sum_{i}^m \frac{\partial \mathcal{L}}{\partial b^{[l]}_r} \\
&=& \frac{1}{m} \sum_{i}^m \frac{\partial \mathcal{L}}{\partial z^{[l]}_{ri}} \frac{\partial z^{[l]}_{ri}}{\partial b^{[l]}_r} \tag{6} \label{6} \\
&=& \frac{1}{m} \sum_{i}^m dZ^{[l]}_{ri}
\end{eqnarray}</div>
<p>
(\ref{6}) is due to the chain rule. The first term in (\ref{6}) is <span class="math">\(dZ^{[l]}_{ri}\)</span> by definition (\ref{<span class="caps">BP4</span>}). The second term of (\ref{6}) simplifies to <span class="math">\(\partial Z^{[l]}_{ri} / \partial b^{[l]}_r = 1\)</span> from (\ref{5}).</p>
<p><strong><span class="caps">BP3</span></strong>
The partial derivative of the loss for the <span class="math">\(i\)</span>th example with respect to the nonlinear activation in the <span class="math">\(r\)</span>th node in layer <span class="math">\(l-1\)</span> is:</p>
<div class="math">\begin{eqnarray}
dA^{[l-1]}_{ri} &\equiv& \frac{\partial \mathcal{L}}{\partial A^{[l-1]}_{ri}} \\
&=& \sum_{k=1}^{n_l} \frac{\partial \mathcal{L}}{\partial Z^{[l]}_{ki}} \frac{\partial Z^{[l]}_{ki}}{\partial A^{[l-1]}_{ri}} \tag{7} \label{7} \\
&=& \sum_{k=1}^{n_l} dZ^{[l]}_{ki} W^{[l]}_{kr} \tag{8} \label{8} \\
&=& \sum_{k=1}^{n_l} W^{[l]T}_{rk} dZ^{[l]}_{ki} \\
&=& \left( W^{[l]T} dZ^{[l]} \right)_{ri}
\end{eqnarray}</div>
<p>
The application of the chain rule (\ref{7}) includes a sum over the nodes in layer <span class="math">\(l\)</span> whose linear functions take <span class="math">\(A^{[l-1]}_{ri}\)</span> as an input, assuming the nodes between layers <span class="math">\(l-1\)</span> and <span class="math">\(l\)</span> are fully connected. The first term in (\ref{7}) is by definition <span class="math">\(dZ\)</span> (\ref{<span class="caps">BP4</span>}); from (\ref{5}), the second term in (\ref{7}) evaluates to <span class="math">\(\partial Z^{[l]}_{ki} / \partial A^{[l-1]}_{ri} = W^{[l]}_{kr}\)</span>, giving (\ref{8}).</p>
<p><strong><span class="caps">BP4</span></strong>
The partial derivative of the loss for the <span class="math">\(i\)</span>th example with respect to the linear activation in the <span class="math">\(r\)</span>th node in layer <span class="math">\(l\)</span> is:</p>
<div class="math">\begin{eqnarray}
dZ^{[l]}_{ri} &\equiv& \frac{\partial \mathcal{L}}{\partial Z^{[l]}_{ri}} \\
&=& \frac{\partial \mathcal{L}}{\partial A^{[l]}_{ri}} \frac{\partial A^{[l]}_{ri}}{\partial Z^{[l]}_{ri}} \\
&=& dA^{[l]}_{ri} * g'(Z^{[l]}_{ri})
\end{eqnarray}</div>
<p>
The second line follows from the chain rule (single variable, since only a single nonlinear activation depends directly on <span class="math">\(Z^{[l]}_{ri}\)</span>). <span class="math">\(g'(Z)\)</span> is the derivative of the nonlinear activation function with respect to its input, and depends on the activation function assigned to that particular node, e.g. sigmoid vs. tanh vs. ReLU.</p>
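<p>For reference, the derivatives <span class="math">\(g'(Z)\)</span> for the three activations mentioned in the introduction are standard results, shown here as numpy one-liners:</p>
<pre><code class="language-python">import numpy as np

def sigmoid_prime(Z):
    s = 1.0 / (1.0 + np.exp(-Z))
    return s * (1.0 - s)              # g'(z) = g(z)(1 - g(z))

def tanh_prime(Z):
    return 1.0 - np.tanh(Z) ** 2      # g'(z) = 1 - tanh(z)^2

def relu_prime(Z):
    return (Z > 0).astype(float)      # 0 for z < 0, 1 for z > 0 (0 taken at z = 0)
</code></pre>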
<h3>Conclusion</h3>
<p>Backpropagation efficiently computes the gradients that gradient descent needs to update the parameters of a neural network, by ordering and caching the calculations of the gradient of the cost with respect to the parameters in the nodes. This post is a little heavy on notation since the focus is on deriving the vectorized formulas for backpropagation, but we hope it complements the lectures in Week 3 of Andrew Ng’s <a href="https://www.coursera.org/learn/neural-networks-deep-learning/">“Neural Networks and Deep Learning”</a> course, as well as the excellent, but even more notation-heavy, resources on matrix calculus for backpropagation that are linked below.</p>
<hr>
<p><strong>More resources on vectorized backpropagation</strong>
<a href="https://explained.ai/matrix-calculus/index.html">The matrix calculus you need for deep learning</a> - from explained.ai
<a href="http://neuralnetworksanddeeplearning.com/chap2.html">How the backpropagation algorithm works</a> - Chapter 2 of the Neural Networks and Deep Learning free online text</p>
<script type="text/javascript">if (!document.getElementById('mathjaxscript_pelican_#%@#$@#')) {
var align = "center",
indent = "0em",
linebreak = "false";
if (false) {
align = (screen.width < 768) ? "left" : align;
indent = (screen.width < 768) ? "0em" : indent;
linebreak = (screen.width < 768) ? 'true' : linebreak;
}
var mathjaxscript = document.createElement('script');
mathjaxscript.id = 'mathjaxscript_pelican_#%@#$@#';
mathjaxscript.type = 'text/javascript';
mathjaxscript.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.3/latest.js?config=TeX-AMS-MML_HTMLorMML';
var configscript = document.createElement('script');
configscript.type = 'text/x-mathjax-config';
configscript[(window.opera ? "innerHTML" : "text")] =
"MathJax.Hub.Config({" +
" config: ['MMLorHTML.js']," +
" TeX: { extensions: ['AMSmath.js','AMSsymbols.js','noErrors.js','noUndefined.js'], equationNumbers: { autoNumber: 'none' } }," +
" jax: ['input/TeX','input/MathML','output/HTML-CSS']," +
" extensions: ['tex2jax.js','mml2jax.js','MathMenu.js','MathZoom.js']," +
" displayAlign: '"+ align +"'," +
" displayIndent: '"+ indent +"'," +
" showMathMenu: true," +
" messageStyle: 'normal'," +
" tex2jax: { " +
" inlineMath: [ ['\\\\(','\\\\)'] ], " +
" displayMath: [ ['$$','$$'] ]," +
" processEscapes: true," +
" preview: 'TeX'," +
" }, " +
" 'HTML-CSS': { " +
" availableFonts: ['STIX', 'TeX']," +
" preferredFont: 'STIX'," +
" styles: { '.MathJax_Display, .MathJax .mo, .MathJax .mi, .MathJax .mn': {color: 'inherit ! important'} }," +
" linebreaks: { automatic: "+ linebreak +", width: '90% container' }," +
" }, " +
"}); " +
"if ('default' !== 'default') {" +
"MathJax.Hub.Register.StartupHook('HTML-CSS Jax Ready',function () {" +
"var VARIANT = MathJax.OutputJax['HTML-CSS'].FONTDATA.VARIANT;" +
"VARIANT['normal'].fonts.unshift('MathJax_default');" +
"VARIANT['bold'].fonts.unshift('MathJax_default-bold');" +
"VARIANT['italic'].fonts.unshift('MathJax_default-italic');" +
"VARIANT['-tex-mathit'].fonts.unshift('MathJax_default-italic');" +
"});" +
"MathJax.Hub.Register.StartupHook('SVG Jax Ready',function () {" +
"var VARIANT = MathJax.OutputJax.SVG.FONTDATA.VARIANT;" +
"VARIANT['normal'].fonts.unshift('MathJax_default');" +
"VARIANT['bold'].fonts.unshift('MathJax_default-bold');" +
"VARIANT['italic'].fonts.unshift('MathJax_default-italic');" +
"VARIANT['-tex-mathit'].fonts.unshift('MathJax_default-italic');" +
"});" +
"}";
(document.body || document.getElementsByTagName('head')[0]).appendChild(configscript);
(document.body || document.getElementsByTagName('head')[0]).appendChild(mathjaxscript);
}
</script>
<p id="post-share-links">
Like this post? Share on:
<a href="https://twitter.com/intent/tweet?text=Backpropagation%20in%20neural%C2%A0networks&url=/backpropagation-in-neural-networks" target="_blank" rel="nofollow noopener noreferrer" title="Share on Twitter">Twitter</a>
|
<a href="https://www.facebook.com/sharer/sharer.php?u=/backpropagation-in-neural-networks" target="_blank" rel="nofollow noopener noreferrer" title="Share on Facebook">Facebook</a>
|
<a href="mailto:?subject=Backpropagation%20in%20neural%C2%A0networks&body=/backpropagation-in-neural-networks" target="_blank" rel="nofollow noopener noreferrer" title="Share via Email">Email</a>
</p>
<hr />
<div class="author_blurb">
<a href="" target="_blank" rel="nofollow noopener noreferrer">
<img src="/images/cy_efavdb_headshot.jpg" alt="Cathy Yeh Avatar" title="Cathy Yeh">
<span class="author_name">Cathy Yeh</span>
</a>
Cathy Yeh got a PhD at UC Santa Barbara studying soft-matter/polymer physics. After stints in a translational modeling group at Pfizer in San Diego, Driver (a no longer extant startup trying to match cancer patients to clinical trials) in San Francisco, and Square, she is currently participating in the OpenAI Scholars program.
</div>
<hr/>
<aside>
<nav>
<ul class="articles-timeline">
<li class="previous-article">« <a href="./an-orientational-integral" title="Previous: An orientational integral">An orientational integral</a></li>
<li class="next-article"><a href="./timemarker-class-for-python" title="Next: TimeMarker class for python">TimeMarker class for python</a> »</li>
</ul>
</nav>
</aside>
</div>
<section id="article-sidebar" class="span2">
<h4>Published</h4>
<time itemprop="dateCreated" datetime="2019-07-18T23:35:00-07:00">Jul 18, 2019</time>
<h4>Category</h4>
<a class="category-link" href="./categories.html#theory-ref">Theory</a>
<h4>Contact</h4>
<div id="sidebar-social-link">
<a href="https://twitter.com/efavdb" title="" target="_blank" rel="nofollow noopener noreferrer">
<svg xmlns="http://www.w3.org/2000/svg" aria-label="Twitter" role="img" viewBox="0 0 512 512"><rect width="512" height="512" rx="15%" fill="#1da1f3"/><path fill="#fff" d="M437 152a72 72 0 0 1-40 12 72 72 0 0 0 32-40 72 72 0 0 1-45 17 72 72 0 0 0-122 65 200 200 0 0 1-145-74 72 72 0 0 0 22 94 72 72 0 0 1-32-7 72 72 0 0 0 56 69 72 72 0 0 1-32 1 72 72 0 0 0 67 50 200 200 0 0 1-105 29 200 200 0 0 0 309-179 200 200 0 0 0 35-37"/></svg>
</a>
<a href="https://github.com/efavdb" title="" target="_blank" rel="nofollow noopener noreferrer">
<svg xmlns="http://www.w3.org/2000/svg" aria-label="GitHub" role="img" viewBox="0 0 512 512"><rect width="512" height="512" rx="15%" fill="#1B1817"/><path fill="#fff" d="M335 499c14 0 12 17 12 17H165s-2-17 12-17c13 0 16-6 16-12l-1-50c-71 16-86-28-86-28-12-30-28-37-28-37-24-16 1-16 1-16 26 2 40 26 40 26 22 39 59 28 74 22 2-17 9-28 16-35-57-6-116-28-116-126 0-28 10-51 26-69-3-6-11-32 3-67 0 0 21-7 70 26 42-12 86-12 128 0 49-33 70-26 70-26 14 35 6 61 3 67 16 18 26 41 26 69 0 98-60 120-117 126 10 8 18 24 18 48l-1 70c0 6 3 12 16 12z"/></svg>
</a>
<a href="https://www.youtube.com/channel/UClfvjoSiu0VvWOh5OpnuusA" title="" target="_blank" rel="nofollow noopener noreferrer">
<svg xmlns="http://www.w3.org/2000/svg" aria-label="YouTube" role="img" viewBox="0 0 512 512" fill="#ed1d24"><rect width="512" height="512" rx="15%"/><path d="m427 169c-4-15-17-27-32-31-34-9-239-10-278 0-15 4-28 16-32 31-9 38-10 135 0 174 4 15 17 27 32 31 36 10 241 10 278 0 15-4 28-16 32-31 9-36 9-137 0-174" fill="#fff"/><path d="m220 203v106l93-53"/></svg>
</a>
</div>
</section>
</div>
</article>
<button onclick="topFunction()" id="myBtn" title="Go to top">▲</button>
</div>
<div class="span1"></div>
</div>
</div>
</div>
<footer>
<div>
<span class="site-name"><a href= .>EFAVDB</span> - Everybody's Favorite Data Blog</a>
</div>
<!-- <div id="fpowered"> -->
<!-- Powered by: <a href="http://getpelican.com/" title="Pelican Home Page" target="_blank" rel="nofollow noopener noreferrer">Pelican</a> -->
<!-- Theme: <a href="https://elegant.oncrashreboot.com/" title="Theme Elegant Home Page" target="_blank" rel="nofollow noopener noreferrer">Elegant</a> -->
<!-- -->
<!-- </div> -->
</footer> <script src="//code.jquery.com/jquery.min.js"></script>
<script src="//netdna.bootstrapcdn.com/twitter-bootstrap/2.3.2/js/bootstrap.min.js"></script>
<script>
function validateForm(query)
{
return (query.length > 0);
}
</script>
<script>
//** Scroll to top button **
//Get the button:
mybutton = document.getElementById("myBtn");
// When the user scrolls down 30px from the top of the document, show the button
window.onscroll = function() {scrollFunction()};
function scrollFunction() {
if (document.body.scrollTop > 30 || document.documentElement.scrollTop > 30) {
mybutton.style.display = "block";
} else {
mybutton.style.display = "none";
}
}
// When the user clicks on the button, scroll to the top of the document
function topFunction() {
document.body.scrollTop = 0; // For Safari
document.documentElement.scrollTop = 0; // For Chrome, Firefox, IE and Opera
}
</script>
<script>
(function () {
if (window.location.hash.match(/^#comment-\d+$/)) {
$('#comment_thread').collapse('show');
}
})();
window.onhashchange=function(){
if (window.location.hash.match(/^#comment-\d+$/))
window.location.reload(true);
}
$('#comment_thread').on('shown', function () {
var link = document.getElementById('comment-accordion-toggle');
var old_innerHTML = link.innerHTML;
$(link).fadeOut(200, function() {
$(this).text('Click here to hide comments').fadeIn(200);
});
$('#comment_thread').on('hidden', function () {
$(link).fadeOut(200, function() {
$(this).text(old_innerHTML).fadeIn(200);
});
})
})
</script>
</body>
<!-- Theme: Elegant built for Pelican
License : MIT -->
</html>