From 797141e75b261d00f596e260563b3e947e216d40 Mon Sep 17 00:00:00 2001
From: Nicholas Mancuso
Date: Fri, 7 Feb 2025 11:40:41 -0800
Subject: [PATCH] Created using Colab

---
 Lab_3_Optimization_PtI.ipynb | 53 +++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/Lab_3_Optimization_PtI.ipynb b/Lab_3_Optimization_PtI.ipynb
index fb15533..8a5feb4 100644
--- a/Lab_3_Optimization_PtI.ipynb
+++ b/Lab_3_Optimization_PtI.ipynb
@@ -4,7 +4,7 @@
   "metadata": {
     "colab": {
       "provenance": [],
-      "authorship_tag": "ABX9TyPXPoElyGI0ZrWueAPTQfIg",
+      "authorship_tag": "ABX9TyN+9Dh/qsCaBlszqM1dDDQa",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -39,7 +39,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
+      "execution_count": null,
       "metadata": {
         "id": "VAK5-ADoNq_I"
       },
@@ -125,7 +125,7 @@
         "id": "wWoIiwnVwn6O",
         "outputId": "400bde50-5cf3-4de8-8640-7bd73379f6b1"
       },
-      "execution_count": 5,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
@@ -162,7 +162,7 @@
         "id": "5Z8awgX6GXXv",
         "outputId": "67b111ee-2e05-415a-a097-c278e622ea9d"
       },
-      "execution_count": 7,
+      "execution_count": null,
      "outputs": [
         {
           "output_type": "stream",
@@ -229,7 +229,7 @@
         "id": "y3kQz53WA8YS",
         "outputId": "08a31e4d-46c3-480c-b52d-d19730b40eac"
      },
-      "execution_count": 8,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
@@ -286,7 +286,7 @@
         "id": "-amDd6YTLuNI",
         "outputId": "5c17897c-4ba8-42e9-8831-424244458156"
       },
-      "execution_count": 9,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
@@ -304,7 +304,30 @@
       "cell_type": "markdown",
       "source": [
         "# Gradient descent\n",
-        "TBD: Add notes for gradient descent"
+        "[Gradient descent](https://en.wikipedia.org/wiki/Gradient_descent) seeks to iteratively optimize a function $f(\beta)$ by taking steps along the direction of steepest descent,\n",
+        "$$ \hat{\beta} = \beta_t - \rho_t \nabla f(\beta_t),$$\n",
+        "where that direction is the negative of the [gradient](https://en.wikipedia.org/wiki/Gradient) of $f$.\n",
+        "\n",
+        "A helpful way to recast gradient descent is that we seek to perform a series of _local_ optimizations,\n",
+        "\n",
+        "$$\hat{\beta} = \arg\min_\beta \nabla f(\beta_t)^T \beta + \frac{1}{2\rho_t}\|\beta - \beta_t\|_2^2.$$\n",
+        "\n",
+        "To see how these are equivalent, let's solve the local problem, writing the squared norm in inner-product notation,\n",
+        "$$m(\beta) = \nabla f(\beta_t)^T \beta + \frac{1}{2\rho_t} (\beta - \beta_t)^T(\beta - \beta_t).$$\n",
+        "Now, using calculus again, and setting $\nabla m(\beta) = 0$ at the minimizer,\n",
+        "$$\begin{align*}\n",
+        "\nabla m(\beta) &= \nabla [ \nabla f(\beta_t)^T \beta + \frac{1}{2\rho_t} (\beta - \beta_t)^T(\beta - \beta_t)] \\\n",
+        "&= \nabla [\nabla f(\beta_t)^T \beta] + \frac{1}{2\rho_t} \nabla [(\beta - \beta_t)^T(\beta - \beta_t)] \\\n",
+        "&= \nabla f(\beta_t) + \frac{1}{\rho_t}(\beta - \beta_t) \Rightarrow \\\n",
+        "\hat{\beta} &= \beta_t - \rho_t \nabla f(\beta_t).\n",
+        "\end{align*}\n",
+        "$$\n",
+        "\n",
+        "Neat! However, notice that the original local objective can be thought of as minimizing the directional derivative, but with a distance penalty, where that distance is defined by the geometry of the parameter space.\n",
+        "\n",
+        "$$\hat{\beta} = \arg\min_\beta \nabla f(\beta_t)^T \beta + \frac{1}{2\rho_t}\text{dist}(\beta, \beta_t).$$\n",
+        "\n",
+        "When the natural geometry is $\mathbb{R}^p$, $\text{dist}(\beta, \beta_t) = \|\beta - \beta_t\|_2^2$; however, there are many geometries that can describe the natural parameter space (a topic for a future class 😉)"
       ],
       "metadata": {
         "id": "I5mFAyAINs-B"
       }
     },
@@ -339,12 +362,10 @@
         "y, X, beta = sim_linear_reg(sim_key, N, P)\n",
         "\n",
         "def linreg_loss(beta_hat, y, X):\n",
-        " pred = X @ beta_hat\n",
-        " return 0.5 * jnp.sum((y - pred)**2)\n",
+        " pass\n",
         "\n",
         "def gradient(beta_hat, y, X):\n",
-        " pred = X @ beta_hat\n",
-        " return -X.T @ (y - pred)\n",
+        " pass\n",
         "\n",
         "step_size = 1 / N\n",
         "diff = 10.\n",
@@ -354,12 +375,15 @@
         "# while delta in loss is large, continue\n",
         "print(f\"true beta = {beta}\")\n",
         "while jnp.fabs(diff) > 1e-3:\n",
+        "\n",
         " # take a step in the direction of the gradient using step_size\n",
         " beta_hat = beta_hat - step_size * gradient(beta_hat, y, X)\n",
+        "\n",
         " # update our current loss and compute delta\n",
         " cur_loss = linreg_loss(beta_hat, y, X)\n",
         " diff = last_loss - cur_loss\n",
         " last_loss = cur_loss\n",
+        "\n",
         " # wave to the crowd\n",
         " print(f\"Loss[{idx}]({beta_hat}) = {last_loss}\")\n",
         " idx += 1\n",
@@ -375,7 +399,7 @@
         "id": "g6LpP-pxNy8y",
         "outputId": "f8bf784f-e72a-4cc9-d58e-61a1abd328db"
       },
-      "execution_count": 35,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
@@ -411,11 +435,14 @@
         "print(f\"true beta = {beta}\")\n",
         "while jnp.fabs(diff) > 1e-3:\n",
         " # take a step in the direction of the gradient using step_size\n",
+        "\n",
         " beta_hat = beta_hat - step_size * jax.grad(linreg_loss)(beta_hat, y, X)\n",
+        "\n",
         " # update our current loss and compute delta\n",
         " cur_loss = linreg_loss(beta_hat, y, X)\n",
         " diff = last_loss - cur_loss\n",
         " last_loss = cur_loss\n",
+        "\n",
         " # wave to the crowd\n",
         " print(f\"Loss[{idx}]({beta_hat}) = {last_loss}\")\n",
         " idx += 1\n",
@@ -431,7 +458,7 @@
         "id": "pyZh3Msjuncp",
         "outputId": "d84627f8-b200-4f56-dbea-97d4fa45e621"
       },
-      "execution_count": 33,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
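
Possible solution sketch (not part of the patch): the hunks above replace the bodies of `linreg_loss` and `gradient` with `pass`, presumably leaving them as an exercise, while the removed lines show one working answer. Below is a minimal, self-contained JAX sketch of the same gradient-descent loop. The inline data simulation, the values of `N` and `P`, the PRNG seed, and the zero initialization of `beta_hat` are stand-ins for the notebook's `sim_linear_reg` helper and setup cells, which do not appear in this diff; the loss, hand-coded gradient, and step size mirror the removed and context lines.

```python
import jax
import jax.numpy as jnp


def sim_linear_reg(key, n, p):
    # Stand-in for the notebook's sim_linear_reg helper (signature assumed):
    # simulate a linear model y = X @ beta + noise.
    k_x, k_b, k_e = jax.random.split(key, 3)
    X = jax.random.normal(k_x, (n, p))
    beta = jax.random.normal(k_b, (p,))
    y = X @ beta + jax.random.normal(k_e, (n,))
    return y, X, beta


def linreg_loss(beta_hat, y, X):
    # least-squares loss, as in the removed solution lines
    pred = X @ beta_hat
    return 0.5 * jnp.sum((y - pred) ** 2)


def gradient(beta_hat, y, X):
    # hand-derived gradient of linreg_loss with respect to beta_hat
    pred = X @ beta_hat
    return -X.T @ (y - pred)


N, P = 100, 3  # assumed sizes; the notebook defines its own N and P
y, X, beta = sim_linear_reg(jax.random.PRNGKey(0), N, P)

step_size = 1 / N
beta_hat = jnp.zeros(P)
last_loss, diff, idx = jnp.inf, 10.0, 0

# while the change in loss is large, keep taking gradient steps
while jnp.fabs(diff) > 1e-3:
    beta_hat = beta_hat - step_size * gradient(beta_hat, y, X)
    # equivalently, let JAX differentiate the loss:
    # beta_hat = beta_hat - step_size * jax.grad(linreg_loss)(beta_hat, y, X)
    cur_loss = linreg_loss(beta_hat, y, X)
    diff = last_loss - cur_loss
    last_loss = cur_loss
    idx += 1

print(f"true beta = {beta}")
print(f"estimate after {idx} steps = {beta_hat}")
```

The commented-out `jax.grad` line corresponds to the notebook's second loop, which swaps the hand-coded `gradient` for automatic differentiation of the same loss.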