\section*{Numerical Est. Techniques}
\textbf{Setting}: Estimate $\hat{f}\in\mathcal{F}$ with minimal prediction error.
\subsection*{K-Fold Cross Validation}
Initialisation (split training set):\\
$\mathcal{Z}=\mathcal{Z}_1\cup\mathcal{Z}_2\cup\cdots\cup\mathcal{Z}_K$, $\mathcal{Z}_\mu\cap\mathcal{Z}_\nu=\emptyset$ for $\mu\neq\nu$,\\
with fold map $\kappa:\{1,\cdots,n\}\rightarrow\{1,\cdots,K\}$, i.e.\ $z_i\in\mathcal{Z}_{\kappa(i)}$.\\
$|\mathcal{Z}_k|\approx\frac{n}{K}$ samples per fold, so each fit uses $\approx n\frac{K-1}{K}$ samples.\\
Learning:\\
$\hat{f}^{-\nu}(x){=}
\argmin_{f\in\mathcal{F}}\frac{1}{|\mathcal{Z}\setminus\mathcal{Z}_\nu|}\sum_{\kappa(i)\neq\nu}(y_i-f(x_i))^2$\\
Validation:\\
$\hat{R}^{cv} = \frac{1}{n}\sum_{i\leq n}(y_i-\hat{f}^{-\kappa(i)}(x_i))^2$\\
Underfits (overestimates the risk) because each $\hat{f}^{-\nu}$ is trained on a smaller dataset.\\
\textbf{Leave-one-out:} $K=n$ (nearly unbiased, but the variance can be large since the $n$ training sets are highly correlated).
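A minimal NumPy sketch of the scheme above (the names \texttt{fit}/\texttt{predict} are hypothetical stand-ins for the ERM over $\mathcal{F}$):
\begin{verbatim}
import numpy as np

def kfold_cv_risk(X, y, fit, predict, K=5, seed=0):
    # fit(X, y) -> model; predict(model, X) -> y_hat
    n = len(y)
    rng = np.random.default_rng(seed)
    # fold map kappa: {1..n} -> {1..K}, balanced
    kappa = rng.permutation(np.arange(n) % K)
    pred = np.empty(n)
    for nu in range(K):
        tr, va = kappa != nu, kappa == nu
        model = fit(X[tr], y[tr])        # f^{-nu}
        pred[va] = predict(model, X[va])
    # R^cv = (1/n) sum_i (y_i - f^{-kappa(i)}(x_i))^2
    return np.mean((y - pred) ** 2)

# Example ERM, least squares (illustrative choice):
# fit = lambda X, y: np.linalg.lstsq(X, y, rcond=None)[0]
# predict = lambda w, X: X @ w
\end{verbatim}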
\subsection*{Bootstrapping}
Bootstrap samples: $\mathcal{Z}^*=\{\mathcal{Z}^{*1},\cdots,\mathcal{Z}^{*B}\}$, each of the same size as the original, drawn with replacement.
The chance that a given sample appears in a bootstrap sample is\\
$1-(1-\frac{1}{n})^n\stackrel{n\to\infty}{\longrightarrow} 1-\frac{1}{e}\approx 0.632$. So evaluating the ERM on all of $\mathcal{Z}$ scores mostly memorized training points ($\approx$63\% of them): over-confident!\\
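A quick numerical check of this limit (a sketch; the sizes \texttt{n}, \texttt{B} are arbitrary):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(0)
n, B = 1000, 2000
# fraction of bootstrap draws containing sample 0
hits = sum(0 in rng.integers(0, n, size=n)
           for _ in range(B)) / B
print(hits, 1 - (1 - 1/n)**n, 1 - np.exp(-1))
# all approx. 0.632
\end{verbatim}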
\textbf{Leave-one-out}: compensates by scoring each sample only with classifiers whose bootstrap sample never contained it. E.g.\ for classification (cf.\ cross-validation):\\
$\hat{\mathcal{R}}(S(\mathcal{Z}))=\frac{1}{B}\sum_{b=1}^B\frac{1}{\lvert\mathcal{Z}\setminus\mathcal{Z}^{*b}\rvert}\sum_{z_i\not\in\mathcal{Z}^{*b}}\mathbb{I}_{c^{*b}(x_i)\neq y_i}$\\
with $c^{*b}$ the classifier fitted on $\mathcal{Z}^{*b}$.
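A sketch of this leave-one-out (out-of-bag) error, again with hypothetical \texttt{fit}/\texttt{predict}:
\begin{verbatim}
import numpy as np

def loo_bootstrap_error(X, y, fit, predict,
                        B=200, seed=0):
    n = len(y)
    rng = np.random.default_rng(seed)
    errs = []
    for _ in range(B):
        idx = rng.integers(0, n, size=n)  # Z^{*b}
        oob = np.setdiff1d(np.arange(n), idx)
        if oob.size == 0:  # no out-of-bag points
            continue
        c = fit(X[idx], y[idx])           # c^{*b}
        errs.append(np.mean(
            predict(c, X[oob]) != y[oob]))
    return np.mean(errs)
\end{verbatim}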
\subsection*{Jackknife}
Estimates the bias of an estimator $\hat{S}_n$.\\
$\hat{S}^{JK}=\hat{S}_n-\mathrm{bias}^{JK}$ is the JK estimator.\\
$\mathrm{bias}^{JK}{=}(n{-}1)(\tilde{S}_n{-}\hat{S}_n)$\\
$\tilde{S}_n{=}\frac{1}{n}\sum_{i=1}^n\hat{S}_{n{-}1}^{-i}$ avg.\ of the $n$ LOO estimators.\\
The debiased estimator can have large variance!\\
\textbf{Bootstrap debiased}\\ $\bar{S}{=}2\hat{S}{-}\frac{1}{B}\sum_b\hat{S}^*(b)$
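A sketch of the jackknife correction. The plug-in variance is a classic sanity check: its $-\sigma^2/n$ bias is removed exactly, recovering the unbiased sample variance.
\begin{verbatim}
import numpy as np

def jackknife_debias(z, estimator):
    n = len(z)
    S_n = estimator(z)
    # avg. of the n leave-one-out estimators
    S_loo = np.mean([estimator(np.delete(z, i))
                     for i in range(n)])
    bias = (n - 1) * (S_loo - S_n)
    return S_n - bias                  # S^JK

z = np.random.default_rng(0).normal(size=50)
print(jackknife_debias(z, lambda v: v.var()),
      z.var(ddof=1))    # identical values
\end{verbatim}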
\section*{Model selection}
\textbf{Bayesian Information Criterion\\ (BIC)}: $-2\log\mathbb{P}(\mathcal{X}|\theta_k)+k\log(n)$. Penalizes model size $k$ more strongly as $n$ grows. Tendency to underfit.\\
\textbf{Akaike Information Criterion\\ (AIC)}: $-2\log\mathbb{P}(\mathcal{X}|\theta_k)+2k$. Tendency to overfit.\\
\textbf{Takeuchi Information Criterion\\ (TIC)}: $-2\log\mathbb{P}(\mathcal{X}|\theta_k)+2\text{Tr}(\mathcal{I}\mathcal{J}^{-1})$, with $\mathcal{I}$ the outer-product and $\mathcal{J}$ the Hessian form of the Fisher information. Under a well-specified model $\mathcal{I}=\mathcal{J}$, so $\text{Tr}(\mathcal{I}\mathcal{J}^{-1})=k$ and TIC reduces to AIC.
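A sketch comparing AIC and BIC on polynomial fits with a Gaussian log-likelihood (data and degrees are illustrative; here $k$ counts only the polynomial coefficients):
\begin{verbatim}
import numpy as np

def gauss_loglik(y, y_hat):
    # maximized Gaussian log-lik. (MLE variance)
    s2 = np.mean((y - y_hat) ** 2)
    return -len(y)/2 * (np.log(2*np.pi*s2) + 1)

def aic(ll, k):    return -2*ll + 2*k
def bic(ll, k, n): return -2*ll + k*np.log(n)

rng = np.random.default_rng(0)
x = np.linspace(0, 1, 200)
y = np.sin(2*np.pi*x) + .3*rng.normal(size=200)
for k in (2, 4, 8, 16):     # k coefficients
    y_hat = np.polyval(np.polyfit(x, y, k-1), x)
    ll = gauss_loglik(y, y_hat)
    print(k, aic(ll, k), bic(ll, k, len(y)))
# BIC's log(n) penalty favors smaller k
\end{verbatim}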