diff --git a/conflictfree/length_model.py b/conflictfree/length_model.py index c21ed5c..905d189 100644 --- a/conflictfree/length_model.py +++ b/conflictfree/length_model.py @@ -70,6 +70,7 @@ def rescale_length( class ProjectionLength(LengthModel): """ Rescale the length of the target vector based on the projection of the gradients on the target vector: + $$ |\mathbf{g}_c|=\sum_{i=1}^m|\mathbf{g}_i|\mathcal{S}_c(\mathbf{g}_i,\mathbf{g}_c) $$ @@ -160,6 +161,7 @@ class TrackMinimum(_FlexibleTrackProjectionLength): """ Rescale the length of the target vector based on the projection of the gradients on the target vector. All the gradients will be rescaled to the same length as the minimum gradient before projection, i.e., the minimum gradient will be the same length as the target vector. + $$ |\mathbf{g}_c|=\sum_{i=1}^m|\mathbf{g}_{min}|\mathcal{S}_c(\mathbf{g}_i,\mathbf{g}_c) $$ @@ -176,6 +178,7 @@ class TrackMaximum(_FlexibleTrackProjectionLength): """ Rescale the length of the target vector based on the projection of the gradients on the target vector. All the gradients will be rescaled to the same length as the maximum gradient before projection, i.e., the maximum gradient will be the same length as the target vector. + $$ |\mathbf{g}_c|=\sum_{i=1}^m|\mathbf{g}_{max}|\mathcal{S}_c(\mathbf{g}_i,\mathbf{g}_c) $$ @@ -192,14 +195,18 @@ class TrackHarmonicAverage(_FlexibleTrackProjectionLength): """ Rescale the length of the target vector based on the projection of the gradients on the target vector. All the gradients will be rescaled to the harmonic average of the lengths of all gradients before projection, i.e., the minimum gradient will be the same length as the target vector. + $$ |\mathbf{g}_c|=\sum_{i=1}^m\overline{|\mathbf{g}|}_{harm}\mathcal{S}_c(\mathbf{g}_i,\mathbf{g}_c) $$ + where + $$ \overline{|\mathbf{g}|}_{harm}=\frac{m}{\sum_{i=1}^m \frac{1}{|\mathbf{g}_i|}} $$ - The harmonic average is used to avoid the influence of the large gradients. 
+ + The harmonic average can be used to avoid the influence of the large gradients. """ def __init__(self): @@ -213,10 +220,13 @@ class TrackArithmeticAverage(_FlexibleTrackProjectionLength): """ Rescale the length of the target vector based on the projection of the gradients on the target vector. All the gradients will be rescaled to the arithmetic average of the lengths of all gradients before projection, i.e., the minimum gradient will be the same length as the target vector. + $$ |\mathbf{g}_c|=\sum_{i=1}^m\overline{|\mathbf{g}|}_{arith}\mathcal{S}_c(\mathbf{g}_i,\mathbf{g}_c) $$ + where + $$ \overline{|\mathbf{g}|}_{arith}=\frac{1}{m}\sum_{i=1}^m |\mathbf{g}_i| $$ @@ -233,14 +243,18 @@ class TrackGeometricAverage(_FlexibleTrackProjectionLength): """ Rescale the length of the target vector based on the projection of the gradients on the target vector. All the gradients will be rescaled to the geometric average of the lengths of all gradients before projection, i.e., the minimum gradient will be the same length as the target vector. + $$ |\mathbf{g}_c|=\sum_{i=1}^m\overline{|\mathbf{g}|}_{geom}\mathcal{S}_c(\mathbf{g}_i,\mathbf{g}_c) $$ + where + $$ \overline{|\mathbf{g}|}_{geom}=\left(\prod_{i=1}^m |\mathbf{g}_i|\right)^{\frac{1}{m}} $$ - The geometric average is used to avoid the influence of the large gradients. + + The geometric average can be used to avoid the influence of the large gradients. """ def __init__(self): @@ -255,6 +269,7 @@ class TrackSpecific(_FlexibleTrackProjectionLength): Rescale the length of the target vector based on the projection of the gradients on the target vector. All the gradients will be rescaled to the same length as the specific gradient before projection. E.g., if the track_id is 2, then all the gradients will be rescaled to the same length as the third gradient before projection. 
+ $$ |\mathbf{g}_c|=\sum_{i=1}^m\overline{|\mathbf{g}|}_{track_id}\mathcal{S}_c(\mathbf{g}_i,\mathbf{g}_c) $$ diff --git a/docs/api/length_model.md b/docs/api/length_model.md index e0f8ca1..755ded1 100644 --- a/docs/api/length_model.md +++ b/docs/api/length_model.md @@ -3,6 +3,12 @@ The `ProjectionLength` class is the default length model for the ConFIG algorith ## Length Model ::: conflictfree.length_model.ProjectionLength +::: conflictfree.length_model.TrackMinimum +::: conflictfree.length_model.TrackMaximum +::: conflictfree.length_model.TrackHarmonicAverage +::: conflictfree.length_model.TrackArithmeticAverage +::: conflictfree.length_model.TrackGeometricAverage +::: conflictfree.length_model.TrackSpecific ## Base Class of Length Model ::: conflictfree.length_model.LengthModel \ No newline at end of file diff --git a/docs/assets/troubleshooting/difweightmodel.png b/docs/assets/troubleshooting/difweightmodel.png new file mode 100644 index 0000000..e43090c Binary files /dev/null and b/docs/assets/troubleshooting/difweightmodel.png differ diff --git a/docs/assets/troubleshooting/difweightmodel.svg b/docs/assets/troubleshooting/difweightmodel.svg new file mode 100644 index 0000000..02e60c6 --- /dev/null +++ b/docs/assets/troubleshooting/difweightmodel.svg @@ -0,0 +1,1291 @@ + + + + diff --git a/docs/examples/mtl_toy.ipynb b/docs/examples/mtl_toy.ipynb index b1d4a97..9594f87 100644 --- a/docs/examples/mtl_toy.ipynb +++ b/docs/examples/mtl_toy.ipynb @@ -9,7 +9,6 @@ "Here, we would like to show a classic and interesting toy example of multi-task learning (MTL). 
\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tum-pbs/ConFIG/blob/main/docs/examples/mtl_toy.ipynb)\n", - "[![Open Locally](../assets/download.svg)](https://github.com/tum-pbs/ConFIG/blob/main/docs/examples/mtl_toy.ipynb)\n", "\n", "In this example, there are two tasks represented by two loss functions, which are" ] @@ -446,7 +445,7 @@ ], "metadata": { "kernelspec": { - "display_name": "deeplearning", + "display_name": "config", "language": "python", "name": "python3" }, diff --git a/docs/examples/pinn_burgers.ipynb b/docs/examples/pinn_burgers.ipynb index 505e6af..064ddfe 100644 --- a/docs/examples/pinn_burgers.ipynb +++ b/docs/examples/pinn_burgers.ipynb @@ -10,7 +10,6 @@ "In this example, we would like to show you another example of how to use ConFIG method to train a physics informed neural network (PINN) for solving a PDE. \n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tum-pbs/ConFIG/blob/main/docs/examples/pinn_burgers.ipynb)\n", - "[![Open Locally](../assets/download.svg)](https://github.com/tum-pbs/ConFIG/blob/main/docs/examples/pinn_burgers.ipynb)\n", "\n", "In this example, we will solve the 1D Burgers' equation:\n", "\n", diff --git a/docs/requirements.txt b/docs/requirements.txt index f4af116..02b2734 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,4 +3,5 @@ black mkdocs-material mkdocstrings mkdocstrings-python -mknotebooks +#mknotebooks +mkdocs-jupyter diff --git a/docs/start/troubleshooting.ipynb b/docs/start/troubleshooting.ipynb new file mode 100644 index 0000000..8fc9743 --- /dev/null +++ b/docs/start/troubleshooting.ipynb @@ -0,0 +1,179 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Troubleshooting\n", + "\n", + "Since we released the ConFIG method, we have received a lot of feedback from our users. 
Among many interesting discussions, we have found many useful tricks that may be helpful for the community. So, if you don't get a good result using ConFIG, please check the following list:\n", + "\n", + "## Are you using different weight models?\n", + "\n", + "Introducing new direction weight models may raise some issues. For example, if you set your weights to $[1,2]$ for a two-loss scenario, the following two scenarios will both satisfy your weight condition:\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "\n", + "In both situations, $\\mathbf{g}_c$ is a conflict-free direction, as $\\mathbf{g}_c$ has a positive dot product with both $\\mathbf{g}_1$ and $\\mathbf{g}_2$. However, the direction in figure b) might not be the optimal one. This situation will not occur when you are using the default equal weight model. Thus, we would recommend using the default weighting configuration as much as possible. \n", + "\n", + "## Are you using momentum-based optimizers?\n", + "\n", + "Momentum-based optimizers (here, we only refer to optimizers that involve both the first and second moments, e.g., Adam) might face some issues when you are using the default length model. In the default length model, the magnitude of the update gradient is calculated based on the sum of the projection length of each loss-specific direction, i.e.,\n", + "\n", + "$$\n", + "|\\mathbf{g}_{\\text{ConFIG}}|=\\sum_{i=1}^m \\mathbf{g}_i^\\top\\mathbf{g}_u=\\sum_{i=1}^m |\\mathbf{g}_i|\\mathcal{S}_c(\\mathbf{g}_i,\\mathbf{g}_u).\n", + "$$\n", + "\n", + "This means that the final magnitude of the update gradient relies on the \"angle\" between each $\\mathbf{g}_i$ and the update direction $\\mathbf{g}_u$, and on the magnitude of each $\\mathbf{g}_i$. So, if one of your loss-specific gradients has a much larger magnitude than the other gradients, then it will dominate the sum and overshadow the contributions of the other gradients.\n", + "\n", + "If you are using a momentum-based optimizer, the absolute value of $\\mathbf{g}_{\\text{ConFIG}}$ actually doesn't matter too much as momentum-based optimizers will adjust the learning rate (length of the update gradient) according to how the gradient changes. If the gradient changes rapidly, then the learning rate will be very small. 
Thus, if you have a very large loss-specific gradient, the momentum-based optimizers will just change the learning rate according to how the magnitude of this largest gradient changes and ignore the contribution from other gradients (of course, the learning rate also depends on how the angle between loss-specific gradients, $\\mathcal{S}_c(\\mathbf{g}_i,\\mathbf{g}_u)$, changes).\n", + "\n", + "In our `conflictfree` package, we provide several [length models](../../api/length_model/) which can help you to decide which gradients' magnitude you want to track to adjust the learning rate in momentum-based optimizers. Here, we can use a simple example to illustrate the differences between these length models:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "