From 797141e75b261d00f596e260563b3e947e216d40 Mon Sep 17 00:00:00 2001
From: Nicholas Mancuso
Date: Fri, 7 Feb 2025 11:40:41 -0800
Subject: [PATCH] Created using Colab

---
 Lab_3_Optimization_PtI.ipynb | 53 +++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/Lab_3_Optimization_PtI.ipynb b/Lab_3_Optimization_PtI.ipynb
index fb15533..8a5feb4 100644
--- a/Lab_3_Optimization_PtI.ipynb
+++ b/Lab_3_Optimization_PtI.ipynb
@@ -4,7 +4,7 @@
   "metadata": {
     "colab": {
       "provenance": [],
-      "authorship_tag": "ABX9TyPXPoElyGI0ZrWueAPTQfIg",
+      "authorship_tag": "ABX9TyN+9Dh/qsCaBlszqM1dDDQa",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -39,7 +39,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
+      "execution_count": null,
       "metadata": {
         "id": "VAK5-ADoNq_I"
       },
@@ -125,7 +125,7 @@
         "id": "wWoIiwnVwn6O",
         "outputId": "400bde50-5cf3-4de8-8640-7bd73379f6b1"
       },
-      "execution_count": 5,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
@@ -162,7 +162,7 @@
         "id": "5Z8awgX6GXXv",
         "outputId": "67b111ee-2e05-415a-a097-c278e622ea9d"
       },
-      "execution_count": 7,
+      "execution_count": null,
      "outputs": [
         {
           "output_type": "stream",
@@ -229,7 +229,7 @@
         "id": "y3kQz53WA8YS",
         "outputId": "08a31e4d-46c3-480c-b52d-d19730b40eac"
      },
-      "execution_count": 8,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
@@ -286,7 +286,7 @@
         "id": "-amDd6YTLuNI",
         "outputId": "5c17897c-4ba8-42e9-8831-424244458156"
       },
-      "execution_count": 9,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
@@ -304,7 +304,30 @@
       "cell_type": "markdown",
       "source": [
         "# Gradient descent\n",
-        "TBD: Add notes for gradient descent"
+        "[Gradient descent](https://en.wikipedia.org/wiki/Gradient_descent) seeks to iteratively optimize a function $f(\beta)$ by taking steps along the direction of steepest descent,\n",
+        "$$ \hat{\beta} = \beta_t - \rho_t \nabla f(\beta_t),$$\n",
+        "where that direction is the negative of the [gradient](https://en.wikipedia.org/wiki/Gradient) of $f$.\n",
+        "\n",
+        "A helpful way to recast gradient descent is that we seek to perform a series of _local_ optimizations,\n",
+        "\n",
+        "$$\hat{\beta} = \arg\min_\beta \nabla f(\beta_t)^T \beta + \frac{1}{2\rho_t}\|\beta - \beta_t\|_2^2.$$\n",
+        "\n",
+        "To see how these are equivalent, let's solve the local problem, writing the squared norm in inner-product notation,\n",
+        "$$m(\beta) = \nabla f(\beta_t)^T \beta + \frac{1}{2\rho_t} (\beta - \beta_t)^T(\beta - \beta_t).$$\n",
+        "Now, using calculus again, and setting $\nabla m(\beta) = 0$ at the minimizer,\n",
+        "$$\begin{align*}\n",
+        "\nabla m(\beta) &= \nabla [ \nabla f(\beta_t)^T \beta + \frac{1}{2\rho_t} (\beta - \beta_t)^T(\beta - \beta_t)] \\\n",
+        "&= \nabla [\nabla f(\beta_t)^T \beta] + \frac{1}{2\rho_t} \nabla [(\beta - \beta_t)^T(\beta - \beta_t)] \\\n",
+        "&= \nabla f(\beta_t) + \frac{1}{\rho_t}(\beta - \beta_t) \Rightarrow \\\n",
+        "\hat{\beta} &= \beta_t - \rho_t \nabla f(\beta_t).\n",
+        "\end{align*}\n",
+        "$$\n",
+        "\n",
+        "Neat! However, notice that the original local objective can be thought of as minimizing the directional derivative, but with a distance penalty, where that distance is defined by the geometry of the parameter space.\n",
+        "\n",
+        "$$\hat{\beta} = \arg\min_\beta \nabla f(\beta_t)^T \beta + \frac{1}{2\rho_t}\text{dist}(\beta, \beta_t).$$\n",
+        "\n",
+        "When the natural geometry is $\mathbb{R}^p$, $\text{dist}(\beta, \beta_t) = \|\beta - \beta_t\|_2^2$; however, there are many geometries that can describe the natural parameter space (a topic for a future class 😉)"
       ],
       "metadata": {
         "id": "I5mFAyAINs-B"
       }
     },
@@ -339,12 +362,10 @@
         "y, X, beta = sim_linear_reg(sim_key, N, P)\n",
         "\n",
         "def linreg_loss(beta_hat, y, X):\n",
-        " pred = X @ beta_hat\n",
-        " return 0.5 * jnp.sum((y - pred)**2)\n",
+        " pass\n",
         "\n",
         "def gradient(beta_hat, y, X):\n",
-        " pred = X @ beta_hat\n",
-        " return -X.T @ (y - pred)\n",
+        " pass\n",
         "\n",
         "step_size = 1 / N\n",
         "diff = 10.\n",
@@ -354,12 +375,15 @@
         "# while delta in loss is large, continue\n",
         "print(f\"true beta = {beta}\")\n",
         "while jnp.fabs(diff) > 1e-3:\n",
+        "\n",
         " # take a step in the direction of the gradient using step_size\n",
         " beta_hat = beta_hat - step_size * gradient(beta_hat, y, X)\n",
+        "\n",
         " # update our current loss and compute delta\n",
         " cur_loss = linreg_loss(beta_hat, y, X)\n",
         " diff = last_loss - cur_loss\n",
         " last_loss = cur_loss\n",
+        "\n",
         " # wave to the crowd\n",
         " print(f\"Loss[{idx}]({beta_hat}) = {last_loss}\")\n",
         " idx += 1\n",
@@ -375,7 +399,7 @@
         "id": "g6LpP-pxNy8y",
         "outputId": "f8bf784f-e72a-4cc9-d58e-61a1abd328db"
       },
-      "execution_count": 35,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
@@ -411,11 +435,14 @@
         "print(f\"true beta = {beta}\")\n",
         "while jnp.fabs(diff) > 1e-3:\n",
         " # take a step in the direction of the gradient using step_size\n",
+        "\n",
         " beta_hat = beta_hat - step_size * jax.grad(linreg_loss)(beta_hat, y, X)\n",
+        "\n",
         " # update our current loss and compute delta\n",
         " cur_loss = linreg_loss(beta_hat, y, X)\n",
         " diff = last_loss - cur_loss\n",
         " last_loss = cur_loss\n",
+        "\n",
         " # wave to the crowd\n",
         " print(f\"Loss[{idx}]({beta_hat}) = {last_loss}\")\n",
         " idx += 1\n",
@@ -431,7 +458,7 @@
         "id": "pyZh3Msjuncp",
         "outputId": "d84627f8-b200-4f56-dbea-97d4fa45e621"
       },
-      "execution_count": 33,
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
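
Possible solution sketch (not part of the patch): the hunks above replace the bodies of `linreg_loss` and `gradient` with `pass`, presumably leaving them as an exercise, while the removed lines show one working answer. Below is a minimal, self-contained JAX sketch of the same gradient-descent loop. The inline data simulation, the values of `N` and `P`, the PRNG seed, and the zero initialization of `beta_hat` are stand-ins for the notebook's `sim_linear_reg` helper and setup cells, which do not appear in this diff; the loss, hand-coded gradient, and step size mirror the removed and context lines.

```python
import jax
import jax.numpy as jnp


def sim_linear_reg(key, n, p):
    # Stand-in for the notebook's sim_linear_reg helper (signature assumed):
    # simulate a linear model y = X @ beta + noise.
    k_x, k_b, k_e = jax.random.split(key, 3)
    X = jax.random.normal(k_x, (n, p))
    beta = jax.random.normal(k_b, (p,))
    y = X @ beta + jax.random.normal(k_e, (n,))
    return y, X, beta


def linreg_loss(beta_hat, y, X):
    # least-squares loss, as in the removed solution lines
    pred = X @ beta_hat
    return 0.5 * jnp.sum((y - pred) ** 2)


def gradient(beta_hat, y, X):
    # hand-derived gradient of linreg_loss with respect to beta_hat
    pred = X @ beta_hat
    return -X.T @ (y - pred)


N, P = 100, 3  # assumed sizes; the notebook defines its own N and P
y, X, beta = sim_linear_reg(jax.random.PRNGKey(0), N, P)

step_size = 1 / N
beta_hat = jnp.zeros(P)
last_loss, diff, idx = jnp.inf, 10.0, 0

# while the change in loss is large, keep taking gradient steps
while jnp.fabs(diff) > 1e-3:
    beta_hat = beta_hat - step_size * gradient(beta_hat, y, X)
    # equivalently, let JAX differentiate the loss:
    # beta_hat = beta_hat - step_size * jax.grad(linreg_loss)(beta_hat, y, X)
    cur_loss = linreg_loss(beta_hat, y, X)
    diff = last_loss - cur_loss
    last_loss = cur_loss
    idx += 1

print(f"true beta = {beta}")
print(f"estimate after {idx} steps = {beta_hat}")
```

The commented-out `jax.grad` line corresponds to the notebook's second loop, which swaps the hand-coded `gradient` for automatic differentiation of the same loss.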