% (removed: GitHub page-scrape artifacts — site UI text and line-number gutter preceding \documentclass)
\documentclass[12pt]{article}
\usepackage[left=2.5cm,top=2.5cm,right=2.5cm,bottom=2.5cm]{geometry}
\usepackage{cancel}
\usepackage{bm}
\usepackage{moreverb}
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{color}
\usepackage{ctable}
\definecolor{MyDarkBlue}{rgb}{0.15,0.15,0.45}
\usepackage[linktocpage=true]{hyperref}
\hypersetup{
colorlinks=true,
citecolor=MyDarkBlue,
linkcolor=MyDarkBlue,
urlcolor=MyDarkBlue,
}
\def\argmin{\mathop{\rm argmin}}
\def\argmax{\mathop{\rm argmax}}
\def\tr{\mathop{\rm tr}\nolimits}
\def\Re{\mathop{\rm Re}\nolimits}
\def\diag{\mathop{\rm diag}\nolimits}
\def\var{\mathop{\rm var}\nolimits}
\def\cov{\mathop{\rm cov}\nolimits}
\def\RGP{\mathop{\rm RGP}\nolimits}
\def\Geom{\mathop{\rm Geom}\nolimits}
\def\B{\mathop{\rm B}\nolimits}
\def\Hyp{\mathop{\rm Hyp}\nolimits}
\def\CV{\mathop{\rm CV}\nolimits}
\def\EV{\mathop{\rm EV}\nolimits}
\theoremstyle{plain}% default
\newtheorem{thm}{Theorem}[section]
\newtheorem{lem}[thm]{Lemma}
\newtheorem{prop}[thm]{Proposition}
\newtheorem*{cor}{Corollary}
\theoremstyle{definition}
\newtheorem{defn}{Definition}[section]
\newtheorem{conj}{Conjecture}[section]
\newtheorem{exmp}{Example}[section]
\theoremstyle{remark}
\newtheorem*{remk}{Remark}
\newtheorem*{note}{Note}
\newtheorem{case}{Case}
\newcommand\ddfrac[2]{\frac{\displaystyle #1}{\displaystyle #2}}
\begin{document}
\title{Notes on stochastic processes}
\author{James Stokes}
\maketitle
\tableofcontents
\section{Probability theory}
\subsection{Probability space}
The sample space $\Omega$ is the set of all outcomes of an experiment. An event $A$ is a subset of $\Omega$. An event $A$ is said to occur if the outcome of the experiment lies in $A$. A probability distribution is a function on the collection of events such that $P(\Omega) =1$, $P(A) \geq 0$ for all $A$ and for any countable union of disjoint sets
\begin{align}
P\Bigl(\coprod_i A_i\Bigr)
& = \sum_i P(A_i).
\end{align}
It follows from the above axioms that $P(A) \leq 1$ since $1=P(\Omega) = P(A \cup A^c) = P(A) + P(A^c) $ and thus since $P(A^c) \geq 0$. By expressing $A \cup B$ as a disjoint union of sets,
\begin{align}
A \cup B
& = (A \cap B^c) \sqcup (A^c \cap B) \sqcup (A \cap B)
\end{align}
we obtain
\begin{align}
P(A \cup B)
& = P(A \cap B^c) + P(A^c \cap B) + P(A \cap B) \\
& = \left[P(A) + P(B) - 2P(A \cap B)\right] + P(A \cap B) \\
& = P(A) + P(B) - P(A \cap B).
\end{align}
\subsection{Conditional probability}
Denote by $P(A|B)$ the probability of $A$ occurring given that $B$ has occurred. This is the probability that the outcome of the experiment lies in $A$ given that the outcome is known to lie in $B$. It is given by the formula
\begin{equation}
P(A|B) = \frac{P(A \cap B)}{P(B)}.
\end{equation}
The above formula has the corollary
\begin{equation}
P(A \cap B) = P(B) P(A| B) = P(A)P(B|A).
\end{equation}
This can be thought of as a generalization of the formula $P(A \cap B) = P(A)P(B)$ for independent events. Iterating the definition of conditional probability we obtain
\begin{align}
P(A \cap B \cap C)
& = P(A \cap B) P(C | A\cap B) \\
& = P(A)P(B|A)P(C | A\cap B).
\end{align}
\begin{exmp}
A coin is tossed twice. Find the probability of the event of two heads given (a) the first toss is a head (b) at least one toss is a head. We have
\begin{align}
P(H_1 \cap H_2 |H_1)
& = \frac{P(H_1 \cap H_2 \cap H_1)}{P(H_1)} = \frac{P(H_1 \cap H_2)}{P(H_1)} \\
& = \frac{p^2}{p^2 + p(1-p)} = p \\
P(H_1 \cap H_2 |H_1 \cup H_2)
& = \frac{P(H_1 \cap H_2 \cap (H_1\cup H_2))}{P(H_1\cup H_2)} = \frac{P(H_1 \cap H_2)}{P(H_1\cup H_2)} \\
& = \frac{p^2}{p^2 + 2p(1-p)} = \frac{p}{2-p}.
\end{align}
Since $0 \leq p \leq 1$ we see that (a) is at least as likely as (b).
\end{exmp}
\subsubsection{Bayes' theorem}
Suppose we partition the sample space into a set of disjoint events called priors $\sqcup_i A_i = \Omega$. Then the partition theorem states that
\begin{equation}
P(B) = \sum_i P(B|A_i)P(A_i).
\end{equation}
Using this fact allows us to express the conditional probability $P(A_i |B)$ in terms of the conditional probabilities $P(B|A_j)$ as follows
\begin{align}
P(A_i |B)
= \frac{P(A_i \cap B)}{P(B)}
= \frac{P(A_i) P(B|A_i)}{\sum_j P(A_j) P(B|A_j) }.
\end{align}
Applying the partition theorem to the expectation of a discrete random variable $X$ gives
\begin{align}
\mathbb{E}[X]
& = \sum_i\sum_x x P(X=x|A_i)P(A_i) \\
& = \sum_i \mathbb{E}[X\,|\,A_i]P(A_i).
\end{align}
Let $f_{X,Y}(x,y)$ be the joint probability density for two random variables $X$ and $Y$. The conditional probability density is defined as
\begin{equation}
f_{X|Y}(x | y) = \frac{f_{X,Y}(x,y)}{f_Y(y)}.
\end{equation}
\begin{exmp} (Monty Hall problem). Suppose without loss of generality that the contestant chooses door 1 (if the contestant chooses any other door then we just relabel the doors so that the contestant chose door 1). We clearly have
\begin{equation}
P(\textrm{behind 1}) = P(\textrm{behind 2}) = P(\textrm{behind 3}) = \frac{1}{3}.
\end{equation}
Now consider the probabilities conditioned on the fact that the host chooses a door. Note that the host can only choose door 2 or 3 since the contestant has selected door 1. We have
\begin{align}
P(\textrm{behind 1}|\textrm{chooses 2})
& = \frac{P(\textrm{chooses 2}|\textrm{behind 1})P(\textrm{behind 1})}{P(\textrm{chooses 2})}
= \frac{\frac{1}{2}\frac{1}{3}}{\frac{1}{2}} = 1/3 \\
P(\textrm{behind 1}|\textrm{chooses 3})
& = \frac{P(\textrm{chooses 3}|\textrm{behind 1})P(\textrm{behind 1})}{P(\textrm{chooses 3})} = \frac{\frac{1}{2}\frac{1}{3}}{\frac{1}{2}} = 1/3 .
\end{align}
Now
\begin{align}
P(\textrm{behind 2}|\textrm{chooses 3})
& = \frac{P(\textrm{chooses 3}|\textrm{behind 2})P(\textrm{behind 2})}{P(\textrm{chooses 3})} = \frac{1\frac{1}{3}}{\frac{1}{2}} = 2/3 \\
P(\textrm{behind 3}|\textrm{chooses 2})
& = \frac{P(\textrm{chooses 2}|\textrm{behind 3})P(\textrm{behind 3})}{P(\textrm{chooses 2})}= \frac{1\frac{1}{3}}{\frac{1}{2}} = 2/3.
\end{align}
The only remaining possibilities have vanishing probability because the host will never choose the door behind which the prize lies; that is,
\begin{equation}
P(\textrm{behind 2}|\textrm{chooses 2})=P(\textrm{behind 3}|\textrm{chooses 3}) = 0
\end{equation}
\end{exmp}
\subsection{Discrete probability distributions}
\subsubsection{Bernoulli}
Consider a biased coin that lands heads with probability $p$. This experiment can be modeled by a Bernoulli distribution which describes the probability of success in a single experiment with success probability $p$. A Bernoulli random variable $X \sim \B(1,p)$ has probability distribution
\begin{equation}
P(X=1)=p, \quad \quad P(X=0) = 1-p.
\end{equation}
The mean and variance are easily calculated to be
\begin{equation}
\mathbb{E}[X] = p\cdot 1 + (1-p)\cdot 0=p, \quad \quad \var(X) = \mathbb{E}[X^2]-\mathbb{E}[X]^2 = p-p^2=p(1-p).
\end{equation}
\subsubsection{Binomial and Geometric}
Now consider a sequence of $k$ biased coin tosses where heads lands with probability $p$. This can be modeled as a sequence of $k$ independent and identically distributed Bernoulli random variables $(X_1,\ldots,X_k)$ with common success probability $P(X_i=1) = p$. The binomial distribution $\B(k,p)$ characterizes the number of heads that land after $k$ tosses. It also describes the number of successes in $k$ draws with replacement, where the success probability on each draw is $p$. A binomial random variable $X\sim \B(k,p)$ has probability distribution
\begin{equation}
P(X=j) = {k\choose{j}} p^j (1-p)^{k-j}.
\end{equation}
This gives the probability of $j$ heads appearing in $k$ flips of an unfair coin. The ${k\choose{j}}$ factor counts the number of ways that the $j$ heads can be distributed in the sequence of $k$ trials.
Note that the binomial random variable can be represented as a sum of $k$ iid Bernoulli random variables
\begin{equation}
X= \sum_{j=1}^k X_j
\end{equation}
and thus
\begin{align}
\mathbb{E}[X]
& = kp, \\
\var(X)
& = \sum_{i=1}^k\sum_{j=1}^k \cov(X_i,X_j) = \sum_{i=1}^k \var(X_i) = kp(1-p).
\end{align}
The geometric distribution $\Geom(p)$ describes the probability of success on the $k$th trial after $k-1$ failures with success probability $p$. Given that a biased coin toss yields tails, the geometric distribution characterizes the number of tosses until the next head. A geometric random variable $X \sim \Geom(p)$ has probability distribution
\begin{equation}
P(X=k) = (1-p)^{k-1}p.
\end{equation}
An example is the probability that an unfair coin yields heads on the $k$th flip if the probability of heads is $p$. The expectation value
\begin{equation}
\mathbb{E}[X] = \sum_{k=1}^\infty P(X=k)k.
\end{equation}
can be computed by differentiating the following identity with respect to $p$,
\begin{align}
1
& = \sum_{k=1}^\infty (1-p)^{k-1}p \\
0
& = \sum_{k=1}^\infty \left[(1-p)^{k-1}-(k-1)p(1-p)^{k-2}\right] \\
& = 1/p - \mathbb{E}[X].
\end{align}
It can also be computed by representing it in terms of conditional expectations
\begin{align}
\mathbb{E}[X]
& = P(X=1)\mathbb{E}[X\,|\,X=1]+P(X>1)\mathbb{E}[X\,|\,X>1] \\
& = P(X=1)\mathbb{E}[X\,|\,X=1]+P(X>1)\left(\mathbb{E}[X-1\,|\,X-1>0]+1\right) \\
& = p + (1-p)\left(\mathbb{E}[X] + 1\right)
\end{align}
where we have used the memorylessness property $P(X-n=k\,|\,X>n)=P(X=k)$. Solving gives
\begin{equation}
\mathbb{E}[X] = 1/p.
\end{equation}
The variance can be computed using the trick
\begin{equation}
\var(X) = \mathbb{E}[X^2]-\mathbb{E}[X]^2 = \mathbb{E}[X(X-1)]+\mathbb{E}[X]-\mathbb{E}[X]^2
\end{equation}
We have
\begin{align}
\mathbb{E}[X(X-1)]
& = \sum_{k=1}^\infty k(k-1)(1-p)^{k-1}p \\
& = p(1-p)[2\cdot 1+3\cdot 2(1-p)+4\cdot 3(1-p)^2 + \cdots]\\
& = \frac{2p(1-p)}{[1-(1-p)]^3}
\end{align}
and thus
\begin{equation}
\var(X) = \frac{1-p}{p^2}.
\end{equation}
\begin{exmp}
Let $X$ represent the number of tosses until two successive heads appear in a row. We partition on first and second tosses and use the memorylessness property $\mathbb{E}[X\,|\,T] = \mathbb{E}[X] +1$, $\mathbb{E}[X\,|\,HT] = \mathbb{E}[X] +2$ to give
\begin{align}
\mathbb{E}[X]
& = \mathbb{E}[X\,|\,T]P(T) + \mathbb{E}[X\,|\,H]P(H) \\
& = \mathbb{E}[X\,|\,T]P(T) + \mathbb{E}[X\,|\,HT]P(HT) + \mathbb{E}[X\,|\,HH]P(HH) \\
& = (1+\mathbb{E}[X])(1-p)+(2+\mathbb{E}[X])p(1-p)+ 2p^2.
\end{align}
This implies
\begin{equation}
\mathbb{E}[X] = 1/p + 1/p^2.
\end{equation}
Alternatively let $X_n$ denote the number of flips required to obtain $n$ successive heads. We have
\begin{align}
\mathbb{E}[X_2]
& = \sum_{k=1}^\infty \mathbb{E}[X_2\,|\,X_1=k]P(X_1=k)
\end{align}
Now
\begin{align}
\mathbb{E}[X_2\,|\,X_1=k]
& = p(k+1) + (1-p)(k+1 + \mathbb{E}[X_2]) \\
& = 1+k+ (1-p)\mathbb{E}[X_2]
\end{align}
and thus
\begin{align}
\mathbb{E}[X_2]
& = \sum_{k=1}^\infty P(X_1=k)\left[1+k+ (1-p)\mathbb{E}[X_2]\right] \\
& = 1 + \mathbb{E}[X_1]+ (1-p)\mathbb{E}[X_2].
\end{align}
Setting $\mathbb{E}[X_1]=1/p$ and solving we obtain the result.
\end{exmp}
\subsubsection{Uniform}
The discrete uniform distribution on $[a,b]$ (where $a,b\in\mathbb{Z}$) is given by
\begin{equation}
P(X=k) = \begin{cases}
\frac{1}{b-a+1},& k = a,\ldots,b \\
0, & \mathrm{o.w.}
\end{cases}
\end{equation}
We have
\begin{equation}
\mathbb{E}[X] = \frac{1}{b-a+1}\sum_{k=a}^b k = \frac{a+b}{2}
\end{equation}
and
\begin{equation}
\var(X) = \frac{(a-b)(a-b-2)}{12}.
\end{equation}
\subsubsection{Multinomial}
Consider a sequence of $k$ independent and identically distributed random variables $(X_1,\ldots,X_k)$ with common probability distribution,
\begin{equation}
P(X_i = j) = p_j, \quad \quad j \in \{1,\ldots,n\}, \quad \quad \sum_{j=1}^n p_j = 1.
\end{equation}
The multinomial distribution is thus a generalization of the binomial distribution where each random variable can take on $n$ values. We have,
\begin{equation}
P(X_1=j_1,\ldots X_n = j_n) = \frac{k!}{j_1!\cdots j_n!}p_1^{j_1}\cdots p_n^{j_n}, \quad \quad \sum_{i=1}^n j_i = k.
\end{equation}
The factor
\begin{equation}
\frac{k!}{j_1!\cdots j_n!} = {k\choose{j_1}}{k-j_1\choose{j_2}}{k-j_1-j_2\choose{j_3}}\cdots{k-j_1-\cdots-j_{n-1}\choose{j_n}}
\end{equation}
is the number of ways to partition a $k$-element set into $n$ disjoint subsets, with $j_i$ elements in the $i$th subset.
\subsubsection{Hypergeometric}
The hypergeometric distribution describes the probability of picking $j$ red balls, after drawing $k$ times without replacement from an urn containing $N$ balls, of which $m$ are red,
\begin{equation}
P(X=j) = \frac{{m\choose{j}}{N-m\choose{k-j}}}{{N\choose{k}}},
\end{equation}
where ${m\choose{j}}$ is the number of ways of choosing $j$ balls from the $m$ red ones and ${N-m\choose{k-j}}$ is the number of ways of choosing $k-j$ balls from $N-m$ non-red ones. In the limit $N \to \infty$ holding $p\equiv m/N$ fixed the hypergeometric distribution approaches the binomial distribution; that is,
\begin{equation}
\lim_{N\to\infty}\frac{{m\choose{j}}{N-m\choose{k-j}}}{{N\choose{k}}} = {k\choose{j}} p^j (1-p)^{k-j}.
\end{equation}
\subsection{Continuous probability distributions}
\subsubsection{Normal}
A normal random variable $X \sim \mathcal{N}(\mu,\sigma^2)$ with mean $\mu$ and variance $\sigma^2$ has pdf
\begin{equation}
f_X(x) = \frac{1}{\sigma\sqrt{2\pi}}e^{-(x-\mu)^2/(2\sigma^2)}
\end{equation}
\subsubsection{Poisson}
The Poisson distribution can be thought of as the limit of the binomial distribution $\B(k,p)$ where $k\to\infty$ holding the expectation $\mathbb{E}[X] = kp = \lambda$ fixed,
\begin{align}
P(X=j)
& = \lim_{k\to\infty}{k\choose{j}}\left(\frac{\lambda}{k}\right)^j\left(1-\frac{\lambda}{k}\right)^{k-j} \\
& = \lim_{k\to\infty}\frac{k(k-1)\cdots(k-j+1)}{k^j}\frac{\lambda^j}{j!}\left(1-\frac{\lambda}{k}\right)^{k} \left(1-\frac{\lambda}{k}\right)^{-j} \\
& = \lim_{k\to\infty}\frac{(k^j + \cdots)}{k^j}\frac{\lambda^j}{j!}\left(1-\frac{\lambda}{k}\right)^{k} \left(1-\frac{\lambda}{k}\right)^{-j} \\
& = \frac{\lambda^je^{-\lambda}}{j!}.
\end{align}
The variance is also easily found as a limit of the binomial variance
\begin{equation}
\var(X) = \lim_{k\to\infty} kp(1-p) = \lim_{k\to\infty} \lambda\left(1-\frac{\lambda}{k}\right) = \lambda.
\end{equation}
If we relabel $\mathbb{E}[X] = \lambda \tau$ then $\lambda$ has the interpretation of the arrival rate, and the number of arrivals in an interval of time $\tau$ is given by
\begin{equation}
P(k,\tau) = \frac{(\lambda \tau)^ke^{-\lambda \tau}}{k!}.
\end{equation}
Notice for comparison if we take the limit of $\B(k,p)$ as $k\to\infty$ holding $p$ fixed we obtain a normal distribution by the central limit theorem.
\subsection{Sums of random variables}
If $X$ and $Y$ are independent random variables and $W = X+Y$ then
\begin{align}
P(W=w)
& = P(X+Y = w) \\
& = \sum_x P(X=x)P(Y=w-x).
\end{align}
For continuous distributions this formula becomes
\begin{equation}
f_W(w) = \int_{-\infty}^{\infty} dx\, f_X(x)f_Y(w-x).
\end{equation}
The moment generating function for a random variable $X$ is given by
\begin{equation}
M_X(t) = \mathbb{E}[e^{tX}].
\end{equation}
If $X$ and $Y$ are independent random variables and $W = X+Y$ then
\begin{equation}
M_W(t) = M_X(t)M_Y(t).
\end{equation}
The probability generating function is given by
\begin{equation}
g_X(s) = M_X(\log s) = \mathbb{E}[s^X].
\end{equation}
\begin{exmp}
Let $X$ and $Y$ be random variables that represent the rolls of two independent six-sided dice with faces $\{1,2,3,4,5,6\}$. We can obtain the distribution of $X+Y$ by computing its probability generating function
\begin{align}
g_{X+Y}(s)
& = g_X(s)g_Y(s) \\
& = g_X(s)^2 \\
& = \frac{s^2}{36}+\frac{s^3}{18}+\frac{s^4}{12}+\frac{s^5}{9}+\frac{5 s^6}{36}+\frac{s^7}{6}+\frac{5 s^8}{36}+\frac{s^9}{9}+\frac{s^{10}}{12}+\frac{s^{11}}{18}+\frac{s^{12}}{36}
\end{align}
Factoring this expression we obtain
\begin{align}
g_{X+Y}(s)
& = \frac{1}{36} s^2 (1+s)^2 \left(1-s+s^2\right)^2 \left(1+s+s^2\right)^2 \\
& = \frac{1}{4}(s^2 + 2s^3 + s^4)\frac{1}{9}(1+2 s^2+3 s^4+2 s^6+s^8) \\
& = g_{X_4}(s)g_{X_9}(s) \\
& = g_{X_4+X_9}(s).
\end{align}
This shows that the distribution of $X+Y$ is the same as the distribution of $X_4+X_9$ where $X_4$ and $X_9$ represent the rolls of two independent dice with faces $\{2,3,3,4\}$ and $\{0,2,2,4,4,4,6,6,8\}$, respectively.
\end{exmp}
\subsubsection{Generating arbitrarily distributed random variables}
Starting with a random variable $Y$ uniformly distributed on $[0,1]$ we can generate a random variable $X$ with arbitrary probability distribution function $f(x)$ as follows. Let
\begin{equation}
F(x) = \int_{-\infty}^x du \, f(u)
\end{equation}
be the cumulative distribution function for $X$. If $F(x)$ is continuous then the random variable $U \equiv F(X)$ is uniformly distributed on $[0,1]$ because
\begin{equation}
P(U \leq u) = P(F(X) \leq u) = P(X \leq F^{-1}(u)) = F\left(F^{-1}(u)\right) = u.
\end{equation}
The random variable $U$ is called the probability integral transform of $X$.
We therefore define $X$ by the equation $F(X) = U$ or
\begin{equation}
X = F^{-1}(U).
\end{equation}
In the case of the exponential distribution $f(x) = \lambda e^{-\lambda x}$ we have $F(x) = 1-e^{-\lambda x}$ and thus $X = -\frac{1}{\lambda}\log(1-Y)$.
\subsection{Correlation}
Suppose we want to simulate $n$ random variables with a given covariance matrix $\Sigma\in \mathbb{R}^{n \times n}$. This means we want a column vector of random variables $\vec{X} = (X_1,\ldots,X_n)^{\rm T}$ such that,
\begin{equation}
\Sigma = \mathbb{E} \big[(\vec{X}-\mathbb{E}[\vec{X}])(\vec{X}-\mathbb{E}[\vec{X}])^{\rm T} \big].
\end{equation}
Since the covariance matrix is positive semi-definite it can be diagonalized as $\Sigma = ADA^{\rm T}$ where $D$ is a diagonal matrix of eigenvalues and $A$ is an orthogonal matrix whose columns are the eigenvectors. If we take the columns of $A$ to be orthonormal then the eigenvalues in $D$ are the variances in the basis defined by the orthonormal column vectors; that is,
\begin{equation}
D = \diag(\sigma_1^2,\ldots,\sigma_n^2).
\end{equation}
We now construct an $n$-component column vector of $n$ independent random variables $\vec{Z} = (Z_1,\ldots,Z_n)^{\rm T}$ with $\var(Z_i) =\sigma_i^2$ and $\mathbb{E}\big[\vec{Z}\big] = \vec{0}$. Then $D$ is obviously the covariance matrix for $\vec{Z}$,
\begin{equation}
D = \mathbb{E} \big[ \vec{Z} \vec{Z}^{\rm T} \big].
\end{equation}
Left-multiplying this expression by $A$ and right-multiplying by $A^{\rm T}$ we obtain the identity,
\begin{equation}
\Sigma = \mathbb{E} \big[ A\vec{Z} (A\vec{Z})^{\rm T} \big].
\end{equation}
Thus we have found a vector of random variables $A\vec{Z}$ with vanishing mean and covariance matrix given by $\Sigma$. It follows that the vector of random variables satisfying the original requirement is given by,
\begin{equation}
\vec{X} = A\vec{Z} + \mathbb{E}\big[\vec{X}\big].
\end{equation}
In the special case where $\vec{Z}$ is a vector of independent Gaussian RVs, the resulting RVs $\vec{X}$ will also be Gaussian.
\subsubsection{Copulas}
Suppose that $(X_1,\ldots X_m)$ are continuous random variables with marginal distributions $F_1(x_1)$ up to $F_m(x_m)$ and joint distribution function $F(x_1,\ldots,x_m)$. Define the random variables $(U_1,\ldots,U_m)$ to be the associated probability integral transforms $U_1 = F_1(X_1), \ldots,U_m=F_m(X_m)$. The copula of $(X_1,\ldots X_m)$ is defined as the joint distribution for $(U_1,\ldots,U_m)$,
\begin{equation}
C(u_1,\ldots,u_m) = P(U_1 \leq u_1,\ldots,U_m \leq u_m).
\end{equation}
Notice that $0 \leq u_i \leq 1$ so $C$ is a probability distribution on $[0,1]^m$ and moreover the marginal distributions $P(U_i \leq u_i)$ are all uniform on $[0,1]$.
Using the definition of the probability integral transform this equation can also be written as,
\begin{align}
C(u_1,\ldots,u_m)
& = P(F_1(X_1) \leq u_1,\ldots,F_m(X_m) \leq u_m), \\
& = P\big(X_1 \leq F_1^{-1}(u_1),\ldots,X_m\leq F_m^{-1}(u_m)\big), \\
& = F\big(F_1^{-1}(u_1),\ldots,F_m^{-1}(u_m)\big).
\end{align}
It immediately follows that,
\begin{equation}
F(x_1,\ldots,x_m) = C(F_1(x_1),\ldots,F_m(x_m)).
\end{equation}
Taking draws $(X_1,\ldots,X_m)$ from $F$ is thus equivalent to taking draws $(U_1,\ldots,U_m)$ from $C$ and solving the $m$ sets of equations,
\begin{equation}
X_i = F_i^{-1}(U_i), \quad \quad 1 \leq i \leq m.
\end{equation}
\subsection{Law of large numbers}
\subsubsection{Weak law of large numbers}
%Markov/Chebyshev inequality:
%\begin{equation}
% P(|X-\mu| \geq c) \leq \frac{\sigma^2}{c^2}, \quad \quad P(|X-\mu| \geq k\sigma) \leq \frac{1}{k^2}
%\end{equation}
%\begin{proof}
%\begin{align}
% \mathbb{E}[(X-\mu)^2]
% & = \int_{-\infty}^{\infty} dx \, (x-\mu)^2 f_X(x) \\
% & \geq \left(\int_{-\infty}^{\mu-c} + \int_{\mu+c}^{\infty}\right)dx \, (x-\mu)^2 f_X(x) \\
% & \geq c^2P(X \leq \mu - c)+c^2P(X \geq \mu + c) \\
% & = c^2P(|X-\mu|\geq c).
%\end{align}
%\end{proof}
Let $X_1,\ldots, X_n$ be independent, identically distributed random variables with mean $\mu$ and variance $\sigma^2$. Define the sample mean to be the random variable
\begin{equation}
M_n = \frac{S_n}{n}, \quad \quad S_n = X_1 + \cdots + X_n
\end{equation}
where $\mathbb{E}[M_n] = \mu$ and $\var[M_n] = \sigma^2/n$. Applying the Chebyshev inequality to $M_n$ we obtain
\begin{equation}
P(|M_n - \mu| \geq \epsilon) \leq \frac{\sigma^2}{n\epsilon^2}.
\end{equation}
Taking $n \to \infty$ we find that for all $\epsilon$,
\begin{equation}
\lim_{n\to\infty} P(|M_n - \mu| \geq \epsilon) = 0.
\end{equation}
This implies that $M_n$ converges in probability to the true mean $\mu$.
\subsubsection{Central limit theorem}
Now consider the standardized sum which measures the number of standard deviations $S_n$ lies from its mean,
\begin{equation}
Z_n \equiv \frac{S_n - \mathbb{E}[S_n]}{\sigma_{S_n}} = \frac{S_n - n\mathbb{E}[X]}{\sqrt{n}\sigma} = \frac{S_n/n - \mu}{\sigma/\sqrt{n}}
\end{equation}
where $\mathbb{E}[Z_n]=0$ and $\var(Z_n) = 1$. In the limit as $n\to\infty$
\begin{equation}
P(Z_n \leq c) \to P(Z\leq c)
\end{equation}
where
\begin{equation}
f_Z(x) = \frac{1}{\sqrt{2\pi}}e^{-x^2/2}.
\end{equation}
\subsection{Stochastic processes}
\subsubsection{Markov process}
A discrete-time Markov chain is a stochastic process $\{ X_n \in S \; | \; 0 \leq n < \infty \}$ with the property that for all $n \geq 1$
\begin{equation}
P(X_n = j \; | \; X_{n-1} = i_{n-1},\ldots, X_0 = i_0) = P(X_n = j \; | \; X_{n-1} = i_{n-1}).
\end{equation}
A Markov chain can alternatively be characterized by the following equivalent condition that for all $n\geq 1$ and $m \geq 0$,
\begin{equation}
P(X_{n+m} = j \; | \; X_{n-1} = i_{n-1},\ldots, X_0 = i_0) = P(X_{n+m} = j \; | \; X_{n-1} = i_{n-1}).
\end{equation}
Let us show this for $m=1$ by defining the events $A = \{ X_{n+1} = j \}$, $B = \{ X_{n-1} = i_{n-1},\ldots, X_0 = i_0\}$ and $C_k = \{ X_n = k \}$. Then,
\begin{align}
P(A \; | \; B)
& = \frac{P(A \cap B)}{P(B)}, \\
& = \sum_{k \in S}\frac{P(A \cap B \cap C_k)}{P(B)}, \notag \\
& = \sum_{k \in S}\frac{P(A \cap B \cap C_k)}{P(B \cap C_k)}\frac{P(C_k \cap B)}{P(B)}, \notag \\
& = \sum_{k \in S} P(A \; | \; B, C_k) P(C_k \; | \; B).
\end{align}
If we now apply the Markov property we obtain,
\begin{equation}
P(A \; | \; B) = \sum_{k \in S} P(X_{n+1} = j \; | \; X_n = k, X_{n-1} = i_{n-1}) P(X_n = k \; | \; X_{n-1} = i_{n-1}),
\end{equation}
where we have left the event $\{ X_{n-1} = i_{n-1} \}$ in the conditioning for convenience. Using the definition of conditional probability we find that $P(X_n = k, X_{n-1} = i_{n-1})$ cancels and we are left with
\begin{align}
P(A \; | \; B)
& = \sum_{k \in S} \frac{P(X_{n+1} = j, X_n = k, X_{n-1} = i_{n-1})}{P(X_{n-1} = i_{n-1})}, \\
& = \frac{P(X_{n+1} = j, X_{n-1} = i_{n-1})}{P(X_{n-1} = i_{n-1})}, \\
& = P(X_{n+1} = j \; | \; X_{n-1} = i_{n-1}).
\end{align}
The $n$-step transition probabilities are defined by
\begin{equation}
p_{ij}(n) = P(X_n = j \; | \; X_0 = i),
\end{equation}
and the associated matrix is denoted $\mathbf{P}(n)$.
For time-homogeneous Markov chains the $n$-step transition probabilities satisfy,
\begin{equation}
p_{ij}(m+n) = \sum_{k \in S} p_{ik}(m)p_{kj}(n) \iff \mathbf{P}(m+n) = \mathbf{P}(m) \mathbf{P}(n)
\end{equation}
It follows that $ \mathbf{P}(n) = \mathbf{P}(1)^n$.
\subsubsection{Branching process}
Consider a branching process with offspring probabilities $p_0$, $p_1$ and $p_2$. Denote the probability of extinction by $p$. We condition the probability of extinction on each of the possible outcomes of the first branching event; namely,
\begin{align}
P(\mathrm{extinction})
& = P(\mathrm{extinction}|0)P(0) + P(\mathrm{extinction}|1)P(1) + P(\mathrm{extinction}|2)P(2)
\end{align}
Using independence we can express the extinction probability in terms of itself as
\begin{align}
p
& = p_0\cdot 1 + p_1p + p_2 p^2.
\end{align}
Solving the quadratic equation and using the fact that $p_0+p_1+p_2=1$ we find the roots $p=1$ and $p = p_0/p_2$; the extinction probability is the smallest non-negative root, $p = \min(1, p_0/p_2)$.
\subsubsection{Random walk}
Let $\{ X_n \; | \; 1 \leq n < \infty \}$ be a sequence of iid random variables with probability distribution $P(X_n = \pm 1) = 1/2$. Let $S_0$ be a constant and $S_n = S_0+ \sum_{k=1}^n X_k$ for $n \geq 1$. Denote by $\phi_k$ the probability that $S_n$ reaches $A$ before $-B$ given that $S_0 = k$. Then we obtain the recursion relation
\begin{equation}
\phi_k = \frac{1}{2}\phi_{k-1} + \frac{1}{2}\phi_{k+1}.
\end{equation}
The boundary conditions for this finite difference equation are $\phi_A = 1$ and $\phi_{-B} = 0$. The solution for $\phi_0$ is
\begin{equation}
\phi_0 = \frac{B}{A+B}.
\end{equation}
Notice that the recursion relation can be rearranged to give $\phi_{k-1}+\phi_{k+1} -2\phi_k = 0$ which is just the discretization of the differential equation $\phi'' = 0$.
\subsubsection{Martingales}
Let $\{ X_n \; | \; 1 \leq n \leq \infty \}$ be a sequence of random variables. A discrete-time Martingale is a sequence of random variables $M_n = f(X_1,\ldots, X_n)$ ($0 \leq n \leq \infty$) with the property that
\begin{equation}
\mathbb{E}[M_n|X_1,\ldots,X_{n-1}] = M_{n-1}
\end{equation}
or equivalently
\begin{equation}
\mathbb{E}[M_n-M_{n-1}|X_1,\ldots,X_{n-1}] = 0.
\end{equation}
A Martingale in continuous time is defined as a stochastic process satisfying $\mathbb{E}_t[X(T)] = X(t)$ for $T > t$. Note that this implies that the expected increments are zero
\begin{equation}
\mathbb{E}_t[X(T) - X(t)] = 0.
\end{equation}
Martingales have the property that their expectation value is constant; that is if $0 \leq s \leq t$ then $\mathbb{E}[X(t)] = \mathbb{E}[X(s)]$. This follows from the tower law of conditional expectations,
\begin{equation}
\mathbb{E}[X(t)] = \mathbb{E} \big[ \mathbb{E}[X(t) \; | \; \mathcal{F}_s] \big] = \mathbb{E}[X(s)].
\end{equation}
An important example of Martingale in continuous time is a Brownian motion. A Brownian motion is a continuous-time stochastic process which has continuous sample paths and satisfies
\begin{itemize}
\item
$w(0) = 0$
\item
The increments have distribution $w(T) - w(t) \sim \mathcal{N}(0,T-t)$ for $T>t$
\end{itemize}
Notice that the second criterion implies that the distribution of increments $w(T) - w(t)$ (for $T>t$) depends only on the length $T - t$ of the time interval and is hence independent of the previous history. A Brownian motion is a Martingale because
\begin{align}
\mathbb{E}_t[w(T)]
& = \mathbb{E}_t[w(T)-w(t)+w(t)] \\
& = \mathbb{E}_t[w(T)-w(t)] +\mathbb{E}_t[w(t)] \\
& = \mathbb{E}[w(T)-w(t)] + w(t) \\
& = w(t).
\end{align}
Moreover, since $\mathbb{E}[dw(t)] = 0$, we have $\mathbb{E}[dw(t)^2] = \var[dw(t)] = dt$.
The following Mathematica code generates and plots sample path for a Brownian motion
\begin{verbatim}
T = 10;
n = 1000;
dt = T/n;
dw = RandomVariate[NormalDistribution[0, Sqrt[dt]], n];
BrownianPath = Accumulate[Prepend[dw, 0]];
ListLinePlot[BrownianPath, Frame -> True]
\end{verbatim}
\begin{equation}
\includegraphics[width=70mm]{brownian}
\end{equation}
Another important example of a Martingale is the geometric Brownian motion which is a stochastic process given by $e^{-\sigma^2 t/2 + \sigma w(t)}$.
\subsubsection{Ito calculus}
A stochastic process $X$ is an Ito process if
\begin{equation}
dX(t) = \mu(t) dt + \sigma(t) dw(t)
\end{equation}
where $\mu(t)$ and $\sigma(t)$ are stochastic processes. An Ito process is called a diffusion if the drift and volatility depend only on the current state and time, that is $\mu(t) = \mu(X(t),t)$ and $\sigma(t) = \sigma(X(t),t)$. In the special case where $\mu$ and $\sigma$ are constants this equation can be integrated to
\begin{equation}
X(t) = X(0) + \mu t + \sigma w(t).
\end{equation}
Thus
\begin{equation}
X(T) - X(t) \sim \mathcal{N} \left(\mu (T-t), \sigma^2 (T-t)\right).
\end{equation}
Ito's lemma states that if $X(t)$ is an Ito process with drift $\mu(t)$ and volatility $\sigma(t)$ and $Y(t) = f(X(t),t)$ then
\begin{align}
dY(t)
& = f_X(X(t),t)dX(t) + f_t(X(t),t)dt + \frac{1}{2}f_{XX}(X(t),t)\sigma(t)^2 dt \\
& = \left[f_X(X(t),t)\mu(t) + f_t(X(t),t) + \frac{1}{2}f_{XX}(X(t),t)\sigma(t)^2 \right] dt + f_X(X(t),t) \sigma(t) dw(t) \notag.
\end{align}
The intuition for the second derivative term follows from considering the squared differential
\begin{equation}
dX(t)^2 = \mu(t)^2dt^2+2\mu(t)\sigma(t)dw(t)dt+\sigma(t)^2dw(t)^2.
\end{equation}
We then argue that the first and second terms are negligible compared to the third because of the identity $\mathbb{E}[dw(t)^2]=dt$.
Ito's lemma can be used to solve stochastic differential equations by changing variables. Let us rescale the drift and volatility,
\begin{equation}
dX(t) = X(t) \mu(t) dt + X(t) \sigma(t) dw(t)
\end{equation}
where $\mu(t)$ and $\sigma(t)$ now denote the relative drift and volatility, respectively. Consider the change of variables $Y(t) = f(X(t))= \log X(t)$. Then $\mu(t) \to X(t) \mu(t)$, $\sigma(t) \to X(t)\sigma(t)$, $f_X = 1/X$ and $f_{XX} = - 1/X^2$ so by Ito's lemma
\begin{equation}
dY(t) = \left[\mu(t) - \frac{1}{2}\sigma(t)^2\right]dt + \sigma(t) dw(t)\label{e:LogIto}
\end{equation}
so
\begin{equation}
\log X(t) = \log X(0) + \int_0^t \left[\mu(s) - \frac{1}{2}\sigma(s)^2\right]ds + \int_0^t \sigma(s) dw(s)
\end{equation}
Now assume that $\mu(t)$ and $\sigma(t)$ are time-dependent but not stochastic. Computing the expectation value and variance we find
\begin{align}
\mathbb{E}[\log X(t)]
& = \log X(0) + \int_0^t \left[\mu(s) - \frac{1}{2}\sigma(s)^2\right]ds \\
\var[\log X(t)]
& = \var\left[\int_0^t \sigma(s) dw(s)\right] \\
& = \int_0^t \sigma(s)^2\var[ dw(s)] \\
& = \int_0^t \sigma(s)^2 ds
\end{align}
and thus
\begin{equation}
\log X(t) \sim \mathcal{N} \left(\log X(0) + \int_0^t \left[\mu(s) - \frac{1}{2}\sigma(s)^2\right]ds,\int_0^t \sigma(s)^2 ds\right).
\end{equation}
Taking $\mu$ and $\sigma$ to be constants (geometric Brownian motion) we obtain
\begin{equation}
X(t) = X(0)e^{\left(\mu - \frac{1}{2}\sigma^2\right)t + \sigma w(t)}
\end{equation}
Hence
\begin{equation}
\log X(t) \sim \mathcal{N}\left(\log X(0) + \left(\mu - \frac{1}{2}\sigma^2\right)t,\sigma^2t\right)
\end{equation}
The following Mathematica code generates and plots a sample path for a geometric Brownian motion
\begin{verbatim}
(* Geometric Brownian motion: simulate Y = log X, then exponentiate.
   Y0 is the initial value of the log-process, so X(0) = Exp[Y0]. *)
T=10;n=1000;dt=T/n;mu=0.3;sigma=0.4;Y0=2;
dw=RandomVariate[NormalDistribution[0,Sqrt[dt]],n];
(* d(log X) = (mu - sigma^2/2) dt + sigma dw, by Ito's lemma *)
dY=(mu-sigma^2/2)dt+sigma dw;
YPath=Accumulate[Prepend[dY,Y0]];
XPath=Exp[YPath];
ListLinePlot[XPath,Frame->True]
\end{verbatim}
\begin{equation}
\includegraphics[width=70mm]{geometricbrownian}
\end{equation}
Suppose that $X$ is an Ito process under some probability measure $P$,
\begin{equation}
dX(t) = X(t)\mu(t) dt + X(t)\sigma(t) dw(t)
\end{equation}
where $w$ is a Brownian motion under $P$. Given another probability measure $Q$, Girsanov's theorem states that $X$ is an Ito process under $Q$ with the same volatility,
\begin{equation}
dX(t) = X(t) \hat{\mu}(t)dt + X(t)\sigma(t)d\hat{w}(t)
\end{equation}
where $\hat{w}$ is a Brownian motion under $Q$.
\begin{exmp}(Ornstein-Uhlenbeck process)
Consider the SDE
\begin{equation}
dr(t)
= \kappa(\theta(t)-r(t))dt+\sigma dw(t).
\end{equation}
We can rewrite this as
\begin{align}
dr(t) + \kappa r(t) dt
& = \kappa\theta(t)dt + \sigma dw(t) \\
e^{-\kappa t}d(e^{\kappa t}r(t))
& = \kappa\theta(t)dt + \sigma dw(t) \\
d(e^{\kappa t}r(t))
& = \kappa e^{\kappa t}\theta(t)dt + \sigma e^{\kappa t} dw(t)
\end{align}
Integrating both sides gives
\begin{equation}
r(t) = e^{-\kappa t}r(0) + \kappa \int_0^t e^{-\kappa(t-s)}\theta(s)ds + \sigma \int_{0}^t e^{-\kappa(t-s)} dw(s)
\end{equation}
It follows that
\begin{equation}
r(t) \sim \mathcal{N}\left(e^{-\kappa t}r(0) + \kappa \int_0^t e^{-\kappa(t-s)}\theta(s)ds,\frac{\sigma^2}{2\kappa}(1-e^{-2\kappa t})\right)
\end{equation}
If $\theta(s) = \theta$ then
\begin{equation}
r(t) = e^{-\kappa t}r(0) + \theta(1-e^{-\kappa t}) + \sigma \int_{0}^t e^{-\kappa(t-s)} dw(s)
\end{equation}
and the process is mean reverting to $\theta$.
\end{exmp}
\begin{exmp}
Consider the stochastic process
\begin{equation}
X(t) = \int_0^t ds \, w(s).
\end{equation}
This process is an Ito process with Brownian drift term $dX(t) = w(t)dt$. $X(t)$ is a normal random variable. Let us calculate its mean and variance. If we integrate by parts we find
\begin{equation}
X(t) = t \, w(t)-\int_0^t s \, dw(s) = \int_0^t(t-s)dw(s)
\end{equation}
where we have used $w(0)=0$. Thus $\mathbb{E}[X(t)]=0$ and $\var[X(t)] = \int_0^t(t-s)^2ds = t^3/3 $. We can also compute the variance of $X(t)$ using the fact that it has vanishing expectation value,
\begin{align}
\var[X(t)]
& = \mathbb{E}\left[X(t)^2\right] \\
& = \int_0^t ds_1\int_0^t ds_2 \, \mathbb{E}[w(s_1)w(s_2)] \\
& = \int_0^t ds_1\int_0^t ds_2 \min(s_1,s_2) \\
& = t^3 /3.
\end{align}
Here we have used the fact that the Brownian motion $w$ has independent increments so
\begin{equation}
\mathbb{E}[w(s_1)w(s_2)]
=
\begin{cases}
\mathbb{E}\left[w(s_1)^2\right]=s_1, & s_2 > s_1 \\
\mathbb{E}\left[w(s_2)^2\right]=s_2, & s_1 > s_2.
\end{cases}
\end{equation}
\end{exmp}
\subsubsection{Semimartingales}
A Poisson process $N(t)$ with intensity $\lambda$ is a continuous-time stochastic process such that
\begin{itemize}
\item $N(0) = 0$
\item $dN(t)$ is a Bernoulli random variable for all $t$
\item $\mathbb{E}_t[N(T) - N(t)] = \lambda(T-t)$ for all $T \geq t$.
\end{itemize}
A compound Poisson (jump) process with intensity $\lambda$ and jump size distribution $\mathcal{D}$ is given by
\begin{equation}
J(t) = \sum_{i=1}^{N(t)} J_i
\end{equation}
where $J_i$ are independent, identically distributed random variables under $\mathcal{D}$ which are independent of $N$.
A semimartingale is the sum of an Ito process and a jump process
\begin{equation}
dX(t) = \mu(t) dt + \sigma(t) dw(t) + dJ(t).
\end{equation}
The following Mathematica code generates a sample path for the process $dX(t) = dw(t) + dJ(t)$ where $dJ(t)$ is a jump process with intensity $\lambda = 1$ and jump size distribution $\mathcal{D} = \mathcal{N}(0,4)$.
\begin{verbatim}
(* Brownian motion plus compound Poisson jumps *)
T = 10; n = 2000; dt = T/n; lambda = 1; sigma = 2;
dw = RandomVariate[NormalDistribution[0, Sqrt[dt]], n];
(* In each step a jump occurs with probability lambda*dt;
   jump sizes are drawn from NormalDistribution[0, sigma] *)
dJ = Table[
If[RandomVariate[BernoulliDistribution[lambda dt]] == 0, 0,
RandomVariate[NormalDistribution[0, sigma]]], {i, 1, n}];
dX = dw + dJ;
(* Path starts at X(0) = 0; cumulative sum of the increments *)
BrownianJumpPath = Accumulate[Prepend[dX, 0]];
ListLinePlot[BrownianJumpPath, Frame -> True]
\end{verbatim}
\begin{equation}
\includegraphics[width=70mm]{brownianjump}
\end{equation}
Ito's lemma can be generalized to semimartingales. If $X(t)$ is a semimartingale and $Y(t) = f(X(t),t)$ then
\begin{align}
dY(t)
& = \left[f_X(X(t-),t)\mu(t) + f_t(X(t-),t) + \frac{1}{2}f_{XX}(X(t-),t)\sigma(t)^2 \right] dt + \notag \\
& \quad + f_X(X(t-),t) \sigma(t) dw(t) + f(X(t),t)-f(X(t-),t)
\end{align}
\subsection{Feynman-Kac formula}
Consider the following parabolic PDE ($x \in \mathbb{R}$ and $t < T$)
\begin{equation}
\frac{\partial u}{\partial t} + \frac{1}{2}\sigma(x,t)^2\frac{\partial^2 u}{\partial x^2} + \mu(x,t)\frac{\partial u}{\partial x} - V(x,t)u(x,t) = 0, \quad \quad u(x,T) = f(x).
\end{equation}
Now consider the diffusion process
\begin{equation}
dX(t) = \mu(X(t),t)dt + \sigma(X(t),t)dw(t)
\end{equation}
where $w$ is a Brownian motion under $Q$. The Feynman-Kac theorem relates the solution of the PDE to a conditional expectation value
\begin{equation}
u(x,t) = \mathbb{E}^Q \left(\left. e^{-\int_t^T ds \, V(X(s),s)}f(X(T)) \right| X(t) = x \right).
\end{equation}
The Feynman-Kac formula has a simple multi-dimensional generalization to PDEs dependent on $p$-dimensional spatial variables,
\begin{equation}
\frac{\partial u}{\partial t} + \sum_{i=1}^p \mu_i(x,t) \frac{\partial u}{\partial x_i} + \frac{1}{2}\sum_{i=1}^p\sum_{j=1}^p s_{ij}(x,t) \frac{\partial^2 u}{\partial x_i \partial x_j} - V(x,t)u(x,t) = 0,
\end{equation}
where $s_{ii}(x,t) \geq 0$ and $s_{ij}(x,t) = s_{ji}(x,t).$
The solution to this equation for $t < T$ subject to the terminal condition $u(x,T) = f(x)$ is given by the expectation
\begin{equation}
u(x,t) = \mathbb{E}^Q \left(\left. e^{-\int_t^T ds \, V(X(s),s)}f(X(T)) \right| X(t) = x \right)
\end{equation}
where $X_i$ are Ito processes given by
\begin{equation}
dX_i(t) = \mu_i(X(t),t) dt + \vec{\sigma}_i(X(t),t) \cdot d\vec{w}(t)
\end{equation}
where $\vec{w}$ is a $d$-dimensional Brownian motion and $s_{ij} = \vec{\sigma}_i \cdot \vec{\sigma}_j$.
\begin{itemize}
\item
In the special case where $\sigma(x,t) = \sigma$ and $\mu(x,t) = 0$ the PDE reduces to the backward heat equation
\item
The expectation value can be interpreted as an integration with respect to the Wiener measure
\item
The probabilistic interpretation of absence of mixed derivative terms is that the stochastic increments are independent
\end{itemize}
\section{Optimal control}
\subsection{Liquidation problem}
Consider the problem of selling $X$ shares of an asset during a time interval $[0,T]$. We assume that $n_j$ shares are sold during the interval $(t_{j-1},t_j]$ where $t_j = j \tau$, $\tau = T/N$ and $j \in \{ 1, \ldots, N \}$ such that $X = \sum_{j=1}^N n_j$. The price received $\hat{S}_j$ from the sale of $n_j$ shares during the interval $(t_{j-1},t_j]$ is assumed to be displaced from the prevailing market price $S_{j-1}$ by a temporary price impact which we model as a function dependent only on $n_j$,
\begin{equation}
\hat{S}_j = S_{j-1} - h(n_j).
\end{equation}
The price process is modeled as a discrete-time random process which suffers a permanent impact $- g(n_j)$ dependent on the number of trades executed during that interval,
\begin{equation}
S_{j} = S_{j-1} + \sigma \tau^{1/2} z_{j} - g(n_{j}),
\end{equation}
where $(z_1,\ldots,z_N)$ is a vector of independent and identically distributed $\mathcal{N}(0,1)$ random variables.
The recursive formula for the price process can be used to express it in terms of today's spot price $S_0$ as follows,
\begin{equation}
S_k = S_0 + \sum_{j=1}^k \big[ \sigma \tau^{1/2} z_j - g(n_j) \big].
\end{equation}
Hence the total revenue generated from the trading strategy is given by,
\begin{align}
R
& = \sum_{k=1}^N \hat{S}_k n_k, \\
& = S_0 \sum_{k=1}^N n_k + \sigma \tau^{1/2} \sum_{k=1}^N \sum_{j=1}^{k-1} z_j n_k - \sum_{k=1}^N\sum_{j=1}^{k-1} g(n_j)n_k - \sum_{k=1}^N h(n_k)n_k, \\
& = S_0 X+ \sigma \tau^{1/2} \sum_{k=1}^N z_k x_k - \sum_{k=1}^N g(n_k)x_k - \sum_{k=1}^N h(n_k)n_k,
\end{align}
where we have defined the inventory process $x_k$ which satisfies $x_0 = X$ and $x_N = 0$,
\begin{equation}
x_k = X-\sum_{j=1}^k n_j.
\end{equation}
The expected value and variance of the total cost of trading $C = S_0 X - R$ is given by
\begin{align}
\mathbb{E}[C]
& = \sum_{k=1}^N \big[g(n_k) x_k + h(n_k) n_k \big], \\
& = \sum_{k=1}^N \big[g(x_{k-1}-x_k) x_k + h(x_{k-1}-x_k) (x_{k-1}-x_k) \big], \\
\var (C)
& = \sigma^2 \tau \sum_{k=1}^N x_k^2.
\end{align}
Assuming linear price impact functions $g(x) = c x$ and $h(x) = (\eta/\tau) x$ we obtain,
\begin{equation}
C(x)
= \frac{cX^2}{2} + \sum_{k=1}^N \tau \, \frac{1}{2}m\left(\frac{x_{k-1}-x_k}{\tau} \right)^2 - \sum_{k=1}^N \sigma \tau^{1/2} z_k x_k ,
\end{equation}
where $m = 2\eta - c\tau$. Let us define a mean-variance objective function as follows,
\begin{align}
S[x_1,\ldots,x_{N-1}]
& = \mathbb{E}[C] + \lambda \var(C) - \frac{cX^2}{2}, \\
& = \sum_{k=1}^N \tau \left[ \frac{1}{2}m\left(\frac{x_{k-1}-x_k}{\tau} \right)^2 + \frac{1}{2}\omega^2 x_k^2\right],
\end{align}
where we have defined $\omega^2 = 2 \lambda \sigma^2$. Optimizing the objective function by setting $\partial S / \partial x_i = 0$ we obtain,
\begin{equation}
\frac{x_{i+1}-2x_i + x_{i-1}}{\tau^2} = \omega^2 x_i.
\end{equation}
In the limit $\tau \to 0$ the objective function takes the form of an action functional for a particle of mass $m$ in an inverted harmonic potential $V(x) = -\frac{1}{2}\omega^2 x^2$,
\begin{equation}
S[x] = \int_0^T dt \left[\frac{1}{2}m \dot{x}^2 +\frac{1}{2}\omega^2 x^2 \right].
\end{equation}
The solution of the corresponding Euler-Lagrange equation $m\ddot{x} = \omega^2 x$ subject to the boundary conditions $x(0) = X$ and $x(T) = 0$ is thus,
\begin{equation}
x(t) = X\frac{\sinh\big[\frac{\omega}{\sqrt{m}}(T-t)\big]}{\sinh\big[\frac{\omega}{\sqrt{m}}T\big]}.
\end{equation}
Setting $t = k \tau$ we obtain the approximate solution of the finite-difference equation found by Almgren and Chriss \cite{almgren}.
\subsubsection{Dynamic strategies}
In this section we review \cite{sepin} which considers the liquidation problem with temporary and permanent price impact functions of the form,
\begin{align}
\hat{S}_j
& = S_{j-1} - \eta_j n_j, \\
S_j
& = S_{j-1} + \tau^{1/2} \sigma_j z_j - c n_j
\end{align}
where $\{ \eta_j \}_{j = 0}^N$ and $\{ \sigma_j \}_{j = 0}^N$ are Markov chains with transition probabilities,
\begin{equation}
p_{k-1}^{(v,w)} = \mathbb{Q}[(\sigma_k,\eta_k) = w \; | \; (\sigma_{k-1},\eta_{k-1})=v], \quad k \in \{1,\ldots,N \}.
\end{equation}
The total cost of trading is now given by
\begin{equation}
C(x) = \frac{cX^2}{2} + \sum_{k=1}^N \left[ \left(\eta_k - \frac{c}{2}\right)(x_{k-1}-x_k)^2 - \sigma_k \tau^{1/2} z_k x_k \right].
\end{equation}
In contrast to Almgren and Chriss, the risk-aversion parameter is taken to be zero so the optimization problem is,
\begin{equation}
x^\ast = \argmin_{x \in \mathcal{A}} \mathbb{E}\big[ Q(x) \; | \; (\sigma_0,\eta_0) = v \big],
\end{equation}
where for convenience we have defined $Q(x) = C(x) - cX^2/2$ and $\mathcal{A}$ denotes the set of admissible strategies $(x_0,\ldots,x_N)$ with the boundary conditions $x_0 = X$ and $x_N = 0$.
Let us define
\begin{equation}
J_n^{(v)}(z) = \min_{x \in \mathcal{A}_n(z)} \mathbb{E}\big[ Q_n(x) \; | \; (\sigma_n,\eta_n) = v\big] ,
\end{equation}
where $\mathcal{A}_n(z)$ denotes the set of admissible substrategies $(x_n,\ldots,x_N)$ satisfying the boundary conditions $x_n = z$ and $x_N = 0$, and we have defined
\begin{equation}
Q_n(x) = \sum_{i=n+1}^N (x_{i-1}-x_i)^2\tilde{\eta}_i, \quad \quad \tilde{\eta}_i = \eta_i - \frac{c}{2}.
\end{equation}
The inductive hypothesis is,
\begin{equation}
J_{n}^{(v)}(x_n) = x_{n}^2 a_{n}^{(v)}, \quad n \in \{ 0 ,\ldots , N-1 \}.
\end{equation}
Let us check this in the case $n = N-1$. Using the fact that $x_N = 0$ we obtain $Q_{N-1}(x) = (x_{N-1}^2)\tilde{\eta}_N$ and thus
\begin{align}
J^{(v)}_{N-1}(x_{N-1})
& = \min_{\mathcal{A}_{N-1}(x_{N-1})} (x_{N-1})^2 \mathbb{E}[\tilde{\eta}_N \; | \; (\sigma_{N-1},\eta_{N-1})=v], \\
& = (x_{N-1})^2 \mathbb{E}[\tilde{\eta}_N \; | \; (\sigma_{N-1},\eta_{N-1})=v].
\end{align}
We therefore obtain,
\begin{equation}
a^{(v)}_{N-1} = \mathbb{E}[\tilde{\eta}_N \; | \; (\sigma_{N-1},\eta_{N-1})=v].
\end{equation}
Now,
\begin{equation}
Q_{n-1}(x) = Q_n + (x_{n-1} - x_n)^2\tilde{\eta}_{n}
\end{equation}
so,
\begin{align}
J^{(v)}_{n-1}(x_{n-1})
& = \min_{\mathcal{A}_{n-1}(x_{n-1})} \mathbb{E} [Q_{n-1} \; | \; (\sigma_{n-1},\eta_{n-1}) = v], \notag \\
& = \min_{\mathcal{A}_{n-1}(x_{n-1})} \big\{\mathbb{E} [Q_{n} \; | \; (\sigma_{n-1},\eta_{n-1}) = v]+ (x_{n-1} -x_{n})^2 \mathbb{E} [\tilde{\eta}_n \; | \; (\sigma_{n-1},\eta_{n-1}) = v]\big\}, \notag \\
& = \min_{x_n} \Big\{\min_{\mathcal{A}_{n}(x)}\mathbb{E} [Q_{n} \; | \; (\sigma_{n-1},\eta_{n-1}) = v]+ (x_{n-1} -x_{n})^2 \mathbb{E} [\tilde{\eta}_n \; | \; (\sigma_{n-1},\eta_{n-1}) = v]\Big\}. \end{align}
Using the tower rule for conditional expectations we obtain,
\begin{align}
\min_{\mathcal{A}_{n}(x)}\mathbb{E} [Q_{n} \; | \; (\sigma_{n-1},\eta_{n-1}) = v]
& = \min_{\mathcal{A}_{n}(x_n)} \mathbb{E} \big[ \mathbb{E} [Q_{n} \; | \; (\sigma_n,\eta_n)] \; | \; (\sigma_{n-1},\eta_{n-1}) = v \big ], \notag \\
& = \min_{\mathcal{A}_{n}(x_n)} \sum_{w} \mathbb{E} [Q_{n} \; | \; (\sigma_{n},\eta_{n}) = w]\mathbb{Q}[(\sigma_n,\eta_n) = w \; | \; (\sigma_{n-1},\eta_{n-1})=v], \notag \\
& = \sum_{w} \Big[\min_{\mathcal{A}_{n}(x_n)}\mathbb{E} [Q_{n} \; | \; (\sigma_{n},\eta_{n}) = w] \Big] \mathbb{Q}[(\sigma_n,\eta_n) = w \; | \; (\sigma_{n-1},\eta_{n-1})=v], \notag \\
& = x_n^2 \sum_{w} p_{n-1}^{(v,w)} a_n^{(w)},
\end{align}
where we have used the inductive hypothesis $\min_{\mathcal{A}_{n}(x_n)} \mathbb{E} [Q_{n} \; | \; (\sigma_{n},\eta_{n}) = w] = x_n^2 a_n^{(w)}$. Substituting back we obtain,
\begin{align}
J^{(v)}_{n-1}(x_{n-1})
& = \min_{x_n} \bigg\{ x_n^2\sum_{w}p_{n-1}^{(v,w)} a_n^{(w)}+ (x_{n-1} -x_{n})^2 \mathbb{E} [\tilde{\eta}_n \; | \; (\sigma_{n-1},\eta_{n-1}) = v]\bigg\}.
\end{align}
So $x^\ast$ satisfies the recursion,
\begin{equation}
x_n^\ast = x_{n-1}^\ast \frac{\mathbb{E} [\tilde{\eta}_n \; | \; (\sigma_{n-1},\eta_{n-1}) = v]}{\mathbb{E} [\tilde{\eta}_n \; | \; (\sigma_{n-1},\eta_{n-1}) = v]+\sum_{w}p_{n-1}^{(v,w)} a_n^{(w)}}.
\end{equation}
Substituting back into the value function we see that the inductive hypothesis is satisfied by the optimal solution.
\subsection{Infinite horizon}
In this section we consider optimal trading of a single asset with an infinite time horizon \cite{sepin2}. Let $S_n$ denote the price of an asset at time $t_n$ and let $r$ denote the risk-free rate. Suppose the excess returns $r_n = S_n - (1+r)S_{n-1}$ from time $t_{n-1}$ to time $t_n$ are described in terms of a $K$-dimensional vector of stochastic factors $\vec{f}_n$ by the following linear hypothesis,
\begin{align}
r_n
& = \vec{\theta} \cdot \vec{f}_{n} + u_n,
\end{align}
where $u_n$ is a random variable with mean zero and variance $\sigma^2$. The factors are assumed to evolve according to a mean-reverting process,
\begin{align}
\vec{f}_n
& = \vec{f}_{n-1} + \kappa(\vec{m}_n - \vec{f}_{n-1}) + \vec{\xi}_n,
\end{align}
where $\vec{\xi}_n$ is a sequence of iid mean zero random vectors with covariance matrix $\Omega$, $\vec{m}_n$ is a vector-valued stochastic process representing the mean-reversion values and $\kappa \in \mathbb{R}^{K\times K}$ is a matrix which describes the speed of mean reversion of the stochastic factor vector $\vec{f}_n$. Let $x_n$ denote the amount of holdings in the asset at time $t_n$. The cost of trading $x_{n}-x_{n-1}$ shares over the interval $(t_{n-1},t_n]$ is given by,
\begin{equation}
C_n = (x_{n}-x_{n-1}) \hat{S}_n,
\end{equation}
where $\hat{S}_n$ is the transacted price which differs from the spot price $S_{n-1}$ by a term proportional to the number of trades executed $x_n-x_{n-1}$, multiplied by a stochastic liquidity $\eta_n$,
\begin{equation}
\hat{S}_n = S_{n-1} + \eta_n (x_n - x_{n-1}).
\end{equation}
The stochastic process $(\vec{m}_n,\eta_n)$ is taken to be a time-homogeneous Markov chain with transition probabilities,
\begin{equation}
p^{(v,w)} = \mathbb{Q}[(\vec{m}_k,\eta_k) = w \; | \; (\vec{m}_{k-1},\eta_{k-1})=v].
\end{equation}
The optimal trading strategy is such that it maximizes the expected returns minus transaction costs. Following Markowitz, it is instructive to penalize the risk of the position by introducing a risk-aversion coefficient $\gamma$. The required strategy as a function of the initial state $v$ is thus,
\begin{equation}
x^\ast = \argmax_{x \in \mathcal{A}} \mathbb{E}\big[ Q(x) \; | \; (\vec{m}_0,\eta_0) = v\big],
\end{equation}
where
\begin{equation}
Q(x) = \sum_{k\geq 1} r_k x_k - \eta_k(x_k-x_{k-1})^2 - \frac{1}{2} x_k^2 \gamma \sigma^2.
\end{equation}
and $\mathcal{A}$ denotes the set of strategies with $x_0 = X$ and $f_0 = f$. Let us define,
\begin{equation}
J_{n}^{(v)}(z,f) = \max_{x \in \mathcal{A}_n(z,f)} \mathbb{E}\big[ Q_n(x) \; | \; (\vec{m}_n,\eta_n) = v\big], \quad \quad n \in \{0,1,\ldots,\infty \},
\end{equation}
where
\begin{equation}
Q_n(x) = \sum_{k\geq n+1} r_k x_k - \eta_k(x_k-x_{k-1})^2 - \frac{1}{2} x_k^2 \gamma \sigma^2.
\end{equation}
We are interested in the case $n = 0$ but since the problem is time-homogeneous we choose a time-homogeneous ansatz. Following \cite{garleanu}, we choose the following quadratic ansatz,
\begin{equation}
J^{(v)}(z,\vec{f}) = - \frac{1}{2} a_v z^2 + b_v z + \vec{f} \cdot A_v \vec{f} + \vec{c}_v \cdot \vec{f} + \vec{d}_v \cdot \vec{f} z + e_v.
\end{equation}
Proceeding inductively we find,
\begin{align}
J_{n-1}^{(v)}(x_{n-1},\vec{f}_{n-1})
& = \max_{x \in \mathcal{A}_{n-1}(x_{n-1},f_{n-1})} \mathbb{E}\big[ Q_{n-1}(x) \; | \; (\vec{m}_{n-1},\eta_{n-1}) = v\big], \\
& = \max_{x \in \mathcal{A}_{n-1}(x_{n-1},f_{n-1})} \Big\{ \mathbb{E}\big[ Q_{n}(x) \; | \; (\vec{m}_{n-1},\eta_{n-1}) = v\big] + \notag \\
& \quad + \mathbb{E}\big[ r_n \; | \; (\vec{m}_{n-1},\eta_{n-1}) = v\big] x_n - \frac{1}{2} x_n^2 \gamma\sigma^2 - (x_n - x_{n-1})^2 \mathbb{E}\big[ \eta_n \; | \; (\vec{m}_{n-1},\eta_{n-1}) = v\big] \Big\} \notag.
\end{align}
Using the tower rule for conditional expectations we obtain,
\begin{align}
\mathbb{E}\big[ Q_{n}(x) \; | \; (\vec{m}_{n-1},\eta_{n-1}) = v\big]
& = \mathbb{E} \big[ \mathbb{E} [Q_{n} \; | \; (\vec{m}_n,\eta_n) ] \; | \; (\vec{m}_{n-1},\eta_{n-1}) = v \big ], \\
& = \sum_w p^{(v,w)} \mathbb{E} [Q_{n} \; | \; (\vec{m}_n,\eta_n) = w].
\end{align}
Now recall that in our optimization problem $\vec{f}_{n-1}$ is known. On the other hand, $\vec{f}_n$ is uncertain because it depends on the random variables $\vec{\xi}_n$. Therefore we condition on the value of $\vec{\xi}_n$ as follows,
\begin{equation}
\mathbb{E}\big[ Q_{n}(x) \; | \; (\vec{m}_{n-1},\eta_{n-1}) = v\big]
= \sum_w p^{(v,w)} \int d\vec{\xi} \, \rho(\vec{\xi})\mathbb{E} \big[ Q_{n} \; | \; (\vec{m}_n,\eta_n) = w,\;\vec{\xi}_n = \vec{\xi} \big].
\end{equation}
Since $\vec{\xi}_n$ is known in the above expectation, and using the fact that
\begin{align}
\mathbb{E}[u_n \; | \; (\vec{m}_{n-1},\eta_{n-1}) = v]
& = 0, \\
\mathbb{E}[\vec{\xi}_n \; | \; (\vec{m}_{n-1},\eta_{n-1}) = v]
& = 0,
\end{align}
we obtain,
\begin{align}
J_{n-1}^{(v)}(x_{n-1},f_{n-1})
& = \max_{x_n} \Big\{ \sum_w p^{(v,w)} \int d\vec{\xi} \, \rho(\vec{\xi}) \left.J^{(w)}(x_n,\vec{f}_n)\right|_{\vec{f}_n = \vec{f}_{n-1}+\kappa (\vec{m}_{(w)}-\vec{f}_{n-1}) + \vec{\xi}} + \\
& \quad + \vec\theta \cdot \Big[(1-\kappa)\vec{f}_{n-1} + \sum_w p^{(v,w)}\kappa \vec{m}_{(w)} \Big]x_n - \frac{1}{2} x_n^2 \gamma\sigma^2 - (x_n - x_{n-1})^2 \sum_w p^{(v,w)}\eta_{(w)} \Big\}.\notag
\end{align}
Integrating over the random vector $\vec{\xi}$ we obtain,
\begin{align}
& \int d\vec{\xi} \, \rho(\vec{\xi}) \left.J^{(w)}(x_n,\vec{f}_n)\right|_{\vec{f}_n = \vec{f}_{n-1}+\kappa (\vec{m}_{(w)}-\vec{f}_{n-1}) + \vec{\xi}}
= -\frac{1}{2}a_w x_n^2 + b_w x_n + \vec{d}_w \cdot \big[ \vec{f}_{n-1}+\kappa (\vec{m}_{(w)}-\vec{f}_{n-1}) \big] x_n \notag \\
& + \big[ \vec{f}_{n-1}+\kappa (\vec{m}_{(w)}-\vec{f}_{n-1}) \big] \cdot A_w \big[ \vec{f}_{n-1}+\kappa (\vec{m}_{(w)}-\vec{f}_{n-1}) \big] + \tr(A_w \Omega) + \vec{c}_w \cdot \big[ \vec{f}_{n-1}+\kappa (\vec{m}_{(w)}-\vec{f}_{n-1}) \big] + e_w. \notag
\end{align}
Substituting back and collecting terms in $x_n$ gives,
\begin{align}
J_{n-1}^{(v)}(x_{n-1},f_{n-1})
& = \max_{x_n} \sum_w p^{(v,w)}\Big\{ -\frac{1}{2}(\gamma\sigma^2 + a_w + 2\eta_{(w)})x_n^2 - x_{n-1}^2 \eta_{(w)} + \notag \\
& \quad + x_n\Big(\vec\theta \cdot \Big[(1-\kappa)\vec{f}_{n-1} +\kappa \vec{m}_{(w)} \Big]+ 2x_{n-1}\eta_{(w)} + b_w + \vec{d}_w \cdot \big[(1-\kappa)\vec{f}_{n-1}+\kappa \vec{m}_{(w)}\big]\Big) + \notag \\
& \quad + \big[ (1-\kappa)\vec{f}_{n-1}+\kappa \vec{m}_{(w)} \big] \cdot A_w \big[ (1-\kappa)\vec{f}_{n-1}+\kappa \vec{m}_{(w)} \big] + \tr(A_w \Omega) + \notag \\