Merge pull request #143 from ecmwf/develop

Release 0.3.0
ecmwf · Nov 14, 2024 · 64915e6 · 64915e6
2 parents ed56f9d + 76d3ef6
commit 64915e6
Show file tree

Hide file tree

Showing 101 changed files with 6,601 additions and 1,393 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -35,7 +35,7 @@ jobs:
   # Run CI including downstream packages on self-hosted runners
   downstream-ci:
     name: downstream-ci
-    if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }}
+    if: ${{ !contains(github.repository, 'private') && (!github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci') }}
     uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci.yml@main
     with:
       anemoi-training: ecmwf/anemoi-training@${{ github.event.pull_request.head.sha || github.sha }}
@@ -45,7 +45,7 @@ jobs:
    # Build downstream packages on HPC
   downstream-ci-hpc:
     name: downstream-ci-hpc
-    if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }}
+    if: ${{ !contains(github.repository, 'private') && (!github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci') }}
     uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci-hpc.yml@main
     with:
       anemoi-training: ecmwf/anemoi-training@${{ github.event.pull_request.head.sha || github.sha }}

diff --git a/.github/workflows/push-to-private.yml b/.github/workflows/push-to-private.yml
@@ -0,0 +1,33 @@
+name: Push to private repository
+
+on:
+  push:
+    branches:
+      - develop
+
+jobs:
+  push_changes:
+    if: ${{ !contains(github.repository, 'private') }}
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout source repository
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 0
+        fetch-tags: true
+
+    - name: Set up Git configuration
+      run: |
+        git config user.name "github-actions[bot]"
+        git config user.email "github-actions[bot]@users.noreply.github.com"
+
+    - name: Setup SSH key
+      uses: webfactory/ssh-agent@v0.5.0
+      with:
+        ssh-private-key: ${{ secrets.KEY_TO_PRIVATE }}
+
+    - name: Push changes to private repository
+      run: |
+        git remote add private git@github.com:${{ github.repository }}-private.git
+        git push --set-upstream private develop
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -10,11 +10,13 @@ on:
 
 jobs:
   quality:
+    if: ${{ !contains(github.repository, 'private') }}
     uses: ecmwf-actions/reusable-workflows/.github/workflows/qa-precommit-run.yml@v2
     with:
       skip-hooks: "no-commit-to-branch"
 
   checks:
+    if: ${{ !contains(github.repository, 'private') }}
     strategy:
       matrix:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
@@ -23,6 +25,7 @@ jobs:
       python-version: ${{ matrix.python-version }}
 
   deploy:
+    if: ${{ !contains(github.repository, 'private') }}
     needs: [checks, quality]
     uses: ecmwf-actions/reusable-workflows/.github/workflows/cd-pypi.yml@v2
     secrets: inherit
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -27,7 +27,7 @@ repos:
     -   id: python-check-blanket-noqa # Check for # noqa: all
     -   id: python-no-log-warn # Check for log.warn
 - repo: https://github.com/psf/black-pre-commit-mirror
-  rev: 24.8.0
+  rev: 24.10.0
   hooks:
   - id: black
     args: [--line-length=120]
@@ -40,16 +40,15 @@ repos:
     - --force-single-line-imports
     - --profile black
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.6.9
+  rev: v0.7.2
   hooks:
   - id: ruff
-    # Next line if for documenation cod snippets
-    exclude: '^[^_].*_\.py$'
     args:
     - --line-length=120
     - --fix
     - --exit-non-zero-on-fix
     - --preview
+    - --exclude=docs/**/*_.py
 - repo: https://github.com/sphinx-contrib/sphinx-lint
   rev: v1.0.0
   hooks:
@@ -60,13 +59,8 @@ repos:
   hooks:
   - id: rstfmt
     exclude: 'cli/.*' # Because we use argparse
-- repo: https://github.com/b8raoult/pre-commit-docconvert
-  rev: "0.1.5"
-  hooks:
-  - id: docconvert
-    args: ["numpy"]
 - repo: https://github.com/tox-dev/pyproject-fmt
-  rev: "2.2.4"
+  rev: "v2.5.0"
   hooks:
   - id: pyproject-fmt
 -   repo: https://github.com/jshwi/docsig # Check docstrings against function sig

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,12 +10,40 @@ Keep it human-readable, your future self will thank you!
 
 ## [Unreleased](https://github.com/ecmwf/anemoi-training/compare/0.2.2...HEAD)
 
+### Fixed
+- Rename loss_scaling to variable_loss_scaling [#138](https://github.com/ecmwf/anemoi-training/pull/138)
+- Refactored callbacks. [#60](https://github.com/ecmwf/anemoi-training/pull/60)
+    - Updated docs [#115](https://github.com/ecmwf/anemoi-training/pull/115)
+    - Fix enabling LearningRateMonitor [#119](https://github.com/ecmwf/anemoi-training/pull/119)
+- Refactored rollout [#87](https://github.com/ecmwf/anemoi-training/pull/87)
+    - Enable longer validation rollout than training
+- Expand iterables in logging [#91](https://github.com/ecmwf/anemoi-training/pull/91)
+    - Save entire config in mlflow
+### Added
+- Included more loss functions and allowed configuration [#70](https://github.com/ecmwf/anemoi-training/pull/70)
+   - Fix that applies the metric_ranges in the post-processed variable space [#116](https://github.com/ecmwf/anemoi-training/pull/116)
+- Allow updates to scalars [#137](https://github.com/ecmwf/anemoi-training/pull/137)
+    - Add without subsetting in ScaleTensor
+- Sub-hour datasets [#63](https://github.com/ecmwf/anemoi-training/pull/63)
+- Add synchronisation workflow [#92](https://github.com/ecmwf/anemoi-training/pull/92)
+- Feat: Anemoi Profiler compatible with mlflow and using Pytorch (Kineto) Profiler for memory report [#38](https://github.com/ecmwf/anemoi-training/pull/38)
+- New limited area config file added, limited_area.yaml. [#134](https://github.com/ecmwf/anemoi-training/pull/134/)
+- New stretched grid config added, stretched_grid.yaml [#133](https://github.com/ecmwf/anemoi-training/pull/133)
+
+### Changed
+- Renamed frequency keys in callbacks configuration. [#118](https://github.com/ecmwf/anemoi-training/pull/118)
+- Modified training configuration to support max_steps and tied lr iterations to max_steps by default [#67](https://github.com/ecmwf/anemoi-training/pull/67)
+- Merged node & edge trainable feature callbacks into one. [#135](https://github.com/ecmwf/anemoi-training/pull/135)
+
 ## [0.2.2 - Maintenance: pin python <3.13](https://github.com/ecmwf/anemoi-training/compare/0.2.1...0.2.2) - 2024-10-28
 
+
 ### Changed
 
 - Lock python version <3.13 [#107](https://github.com/ecmwf/anemoi-training/pull/107)
 
+
+
 ## [0.2.1 - Bugfix: resuming mlflow runs](https://github.com/ecmwf/anemoi-training/compare/0.2.0...0.2.1) - 2024-10-24
 
 ### Added
@@ -27,6 +55,10 @@ Keep it human-readable, your future self will thank you!
 
 ### Fixed
 
+- Fix pre-commit regex
+- Mlflow-sync to handle creation of new experiments in the remote server [#83](https://github.com/ecmwf/anemoi-training/pull/83)
+- Fix for multi-gpu when using mlflow due to refactoring of _get_mlflow_run_params function [#99](https://github.com/ecmwf/anemoi-training/pull/99)
+- ci: fix pyshtools install error [#100](https://github.com/ecmwf/anemoi-training/pull/100)
 - Mlflow-sync to handle creation of new experiments in the remote server [#83](https://github.com/ecmwf/anemoi-training/pull/83)
 - Fix for multi-gpu when using mlflow due to refactoring of _get_mlflow_run_params function [#99](https://github.com/ecmwf/anemoi-training/pull/99)
 - ci: fix pyshtools install error [#100](https://github.com/ecmwf/anemoi-training/pull/100)
@@ -51,6 +83,8 @@ Keep it human-readable, your future self will thank you!
 
 - Introduction of remapper to anemoi-models leads to changes in the data indices. Some preprocessors cannot be applied in-place anymore.
 
+- Variable Bounding as configurable model layers [#13](https://github.com/ecmwf/anemoi-models/issues/13)
+
 #### Functionality
 
 - Enable the callback for plotting a histogram for variables containing NaNs
@@ -62,6 +96,7 @@ Keep it human-readable, your future self will thank you!
 - Feature: `AnemoiMlflowClient`, an mlflow client with authentication support [#86](https://github.com/ecmwf/anemoi-training/pull/86)
 - Long Rollout Plots
 
+
 ### Fixed
 
 - Fix `TypeError` raised when trying to JSON serialise `datetime.timedelta` object - [#43](https://github.com/ecmwf/anemoi-training/pull/43)
@@ -76,6 +111,7 @@ Keep it human-readable, your future self will thank you!
 - Updated configuration examples in documentation and corrected links - [#46](https://github.com/ecmwf/anemoi-training/pull/46)
 - Remove credential prompt from mlflow login, replace with seed refresh token via web - [#78](https://github.com/ecmwf/anemoi-training/pull/78)
 - Update CODEOWNERS
+- Change how mlflow measures CPU Memory usage - [#94](https://github.com/ecmwf/anemoi-training/pull/94)
 
 ## [0.1.0 - Anemoi training - First release](https://github.com/ecmwf/anemoi-training/releases/tag/0.1.0) - 2024-08-16
 

diff --git a/docs/conf.py b/docs/conf.py
@@ -1,3 +1,12 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
@@ -13,10 +22,11 @@
 import datetime
 import os
 import sys
+from pathlib import Path
 
 read_the_docs_build = os.environ.get("READTHEDOCS", None) == "True"
 
-sys.path.insert(0, os.path.join(os.path.abspath(".."), "src"))
+sys.path.insert(0, Path("..").absolute() / "src")
 
 
 source_suffix = ".rst"
@@ -30,13 +40,12 @@
 
 project = "Anemoi Training"
 
-author = "ECMWF"
+author = "Anemoi contributors"
 
-year = datetime.datetime.now().year
+year = datetime.datetime.now(tz="UTC").year
 years = "2024" if year == 2024 else f"2024-{year}"
 
-copyright = f"{years}, ECMWF"
-
+copyright = f"{years}, Anemoi contributors"  # noqa: A001
 
 try:
     from anemoi.training._version import __version__
@@ -64,7 +73,7 @@
 ]
 
 # Add any paths that contain templates here, relative to this directory.
-# templates_path = ["_templates"]
+# templates_path = ["_templates"] # noqa: ERA001
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.

diff --git a/docs/images/profiler/anemoi_profiler_architecture.png b/docs/images/profiler/anemoi_profiler_architecture.png
diff --git a/docs/images/profiler/anemoi_profiler_benchmark_profiler.png b/docs/images/profiler/anemoi_profiler_benchmark_profiler.png
diff --git a/docs/images/profiler/anemoi_profiler_config.png b/docs/images/profiler/anemoi_profiler_config.png
diff --git a/docs/images/profiler/anemoi_profiler_high_level.png b/docs/images/profiler/anemoi_profiler_high_level.png
diff --git a/docs/images/profiler/anemoi_profiler_mlflow_integration.png b/docs/images/profiler/anemoi_profiler_mlflow_integration.png
diff --git a/docs/images/profiler/anemoi_profiler_mlflow_integration_2.png b/docs/images/profiler/anemoi_profiler_mlflow_integration_2.png
diff --git a/docs/images/profiler/anemoi_profiler_mlflow_integration_3.png b/docs/images/profiler/anemoi_profiler_mlflow_integration_3.png
diff --git a/docs/images/profiler/anemoi_profiler_speed_report.png b/docs/images/profiler/anemoi_profiler_speed_report.png
diff --git a/docs/images/profiler/anemoi_profiler_speedreport_diagram.png b/docs/images/profiler/anemoi_profiler_speedreport_diagram.png
diff --git a/docs/images/profiler/anemoi_profiler_training_rates.png b/docs/images/profiler/anemoi_profiler_training_rates.png
diff --git a/docs/images/profiler/anemoi_profiler_validation_rates.png b/docs/images/profiler/anemoi_profiler_validation_rates.png
diff --git a/docs/images/profiler/example_memory_report.png b/docs/images/profiler/example_memory_report.png
diff --git a/docs/images/profiler/example_memory_timeline.png b/docs/images/profiler/example_memory_timeline.png
diff --git a/docs/images/profiler/example_model_summary.png b/docs/images/profiler/example_model_summary.png
diff --git a/docs/images/profiler/example_model_summary_2.png b/docs/images/profiler/example_model_summary_2.png
diff --git a/docs/images/profiler/example_system_report.png b/docs/images/profiler/example_system_report.png
diff --git a/docs/images/profiler/example_time_report.png b/docs/images/profiler/example_time_report.png
diff --git a/docs/images/profiler/idle_time_breakdown.png b/docs/images/profiler/idle_time_breakdown.png
diff --git a/docs/images/profiler/kernel_breakdown_dfs.png b/docs/images/profiler/kernel_breakdown_dfs.png
diff --git a/docs/images/profiler/kernel_breakdown_plots.png b/docs/images/profiler/kernel_breakdown_plots.png
diff --git a/docs/images/profiler/memory_snapshot_diagram.png b/docs/images/profiler/memory_snapshot_diagram.png
diff --git a/docs/images/profiler/memory_snapshot_output.png b/docs/images/profiler/memory_snapshot_output.png
diff --git a/docs/images/profiler/temporal_breakdown.png b/docs/images/profiler/temporal_breakdown.png
diff --git a/docs/index.rst b/docs/index.rst
@@ -43,6 +43,7 @@ This package provides the *Anemoi* training functionality.
    user-guide/training
    user-guide/models
    user-guide/tracking
+   user-guide/benchmarking
    user-guide/distributed
    user-guide/debugging
 

diff --git a/docs/modules/diagnostics.rst b/docs/modules/diagnostics.rst
@@ -21,23 +21,94 @@ functionality to use both Weights & Biases and Tensorboard.
 
 The callbacks can also be used to evaluate forecasts over longer
 rollouts beyond the forecast time that the model is trained on. The
-number of rollout steps (or forecast iteration steps) is set using
-``config.eval.rollout = *num_of_rollout_steps*``.
-
-Note the user has the option to evaluate the callbacks asynchronously
-(using the following config option
-``config.diagnostics.plot.asynchronous``, which means that the model
-training doesn't stop whilst the callbacks are being evaluated).
-However, note that callbacks can still be slow, and therefore the
-plotting callbacks can be switched off by setting
-``config.diagnostics.plot.enabled`` to ``False`` or all the callbacks
-can be completely switched off by setting
-``config.diagnostics.eval.enabled`` to ``False``.
+number of rollout steps for verification (or forecast iteration steps)
+is set using ``config.dataloader.validation_rollout =
+*num_of_rollout_steps*``.
+
+Callbacks are configured in the config file under the
+``config.diagnostics`` key.
+
+Regular callbacks can be provided as a list of dictionaries underneath
+the ``config.diagnostics.callbacks`` key. Each dictionary must have a
+``_target_`` key, which is used by hydra to instantiate the callback;
+any other kwarg is passed to the callback's constructor.
+
+.. code:: yaml
+
+   callbacks:
+      - _target_: anemoi.training.diagnostics.callbacks.evaluation.RolloutEval
+        rollout: ${dataloader.validation_rollout}
+        frequency: 20
+
+Plotting callbacks are configured in a similar way, but they are
+specified underneath the ``config.diagnostics.plot.callbacks`` key.
+
+This is done to ensure separation and ease of configuration between
+experiments.
+
+``config.diagnostics.plot`` is a broader config file specifying the
+parameters to plot, as well as the plotting frequency and
+asynchronicity.
+
+Setting ``config.diagnostics.plot.asynchronous`` to ``True`` means that
+model training doesn't stop whilst the callbacks are being evaluated.
+
+.. code:: yaml
+
+   plot:
+      asynchronous: True # Whether to plot asynchronously
+      frequency: # Frequency of the plotting
+         batch: 750
+         epoch: 5
+
+      # Parameters to plot
+      parameters:
+         - z_500
+         - t_850
+         - u_850
+
+      # Sample index
+      sample_idx: 0
+
+      # Precipitation and related fields
+      precip_and_related_fields: [tp, cp]
+
+      callbacks:
+         - _target_: anemoi.training.diagnostics.callbacks.plot.PlotLoss
+           # group parameters by categories when visualizing contributions to the loss
+           # one-parameter groups are possible to highlight individual parameters
+           parameter_groups:
+              moisture: [tp, cp, tcw]
+              sfc_wind: [10u, 10v]
+         - _target_: anemoi.training.diagnostics.callbacks.plot.PlotSample
+           sample_idx: ${diagnostics.plot.sample_idx}
+           per_sample: 6
+           parameters: ${diagnostics.plot.parameters}
 
 Below is the documentation for the default callbacks provided, but it is
 also possible for users to add callbacks using the same structure:
 
-.. automodule:: anemoi.training.diagnostics.callbacks
+.. automodule:: anemoi.training.diagnostics.callbacks.checkpoint
+   :members:
+   :no-undoc-members:
+   :show-inheritance:
+
+.. automodule:: anemoi.training.diagnostics.callbacks.evaluation
+   :members:
+   :no-undoc-members:
+   :show-inheritance:
+
+.. automodule:: anemoi.training.diagnostics.callbacks.optimiser
+   :members:
+   :no-undoc-members:
+   :show-inheritance:
+
+.. automodule:: anemoi.training.diagnostics.callbacks.plot
+   :members:
+   :no-undoc-members:
+   :show-inheritance:
+
+.. automodule:: anemoi.training.diagnostics.callbacks.provenance
    :members:
    :no-undoc-members:
    :show-inheritance: