updated readme, new notebooks handling #2

Merged
merged 2 commits on Mar 28, 2024
updated readme, new notebooks handling
nspiller committed Mar 28, 2024
commit 04393aa8d026e5e7b6f9d34f4b54b96b11aa6f5e
115 changes: 115 additions & 0 deletions .github/workflows/sync_notebooks.yml
@@ -0,0 +1,115 @@
name: Sync between Jupyter notebooks and Python plain text files

on:
pull_request:
types: [opened, synchronize]
push:
branches:
- main

jobs:
sync_notebooks:
runs-on: ubuntu-latest
env:
NOTEBOOK_FOLDER: notebooks
SCRIPT_FOLDER: notebooks/scripts
permissions:
contents: write

steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ github.head_ref }}
fetch-depth: 0

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'

- name: Install Jupytext
run: |
pip install 'jupytext~=1.0'

- name: Update notebook or script
run: |
shopt -s nullglob

# Function to check which file was last modified
function check_last_modified {
notebook=$1
script=$2

# Get the hash of the last commit that modified each file
last_commit_notebook=$(git log -1 --format="%H" -- $notebook)
last_commit_script=$(git log -1 --format="%H" -- $script)

# if one of the files does not exist, return the other
if [ -z "$last_commit_notebook" ]; then
echo script

elif [ -z "$last_commit_script" ]; then
echo notebook

else

# compare commit hashes of files with last common commit hash
last_common_commit=$(git merge-base $last_commit_notebook $last_commit_script)

if [ "$last_commit_notebook" != "$last_common_commit" ]; then
echo notebook

elif [ "$last_commit_script" != "$last_common_commit" ]; then
echo script

else
echo both
fi
fi
}

# locations of notebooks and scripts folders
NOTEBOOK_FOLDER=${{ env.NOTEBOOK_FOLDER }}
SCRIPT_FOLDER=${{ env.SCRIPT_FOLDER }}
mkdir -p $NOTEBOOK_FOLDER $SCRIPT_FOLDER

# catch all .py and .ipynb files, so the sync works if
# (i) both files exist
# (ii) either of the two exist
for file in $NOTEBOOK_FOLDER/*.ipynb $SCRIPT_FOLDER/*.py; do

# construct names for file pair
base="${file##*/}"
base="${base%.py}"
base="${base%.ipynb}"
notebook=$NOTEBOOK_FOLDER/"$base.ipynb"
script=$SCRIPT_FOLDER/"$base.py"

# Check which file was last modified
last_modified=$(check_last_modified "$notebook" "$script")

# update or create the paired file
if [ "$last_modified" == "notebook" ]; then
echo "Notebook file $notebook is newest. Updating/creating script file."
jupytext --opt notebook_metadata_filter="-all" --opt cell_metadata_filter="-all" --to py:percent "$notebook" --output "$script"
git add "$script"

elif [ "$last_modified" == "script" ]; then
echo "Notebook file $notebook is oldest. Updating/creating notebook file."
jupytext --update --to ipynb "$script" --output "$notebook"
git add "$notebook"

elif [ "$last_modified" == "both" ]; then
echo "Both $notebook and $script were last modified in same commit. Assuming they are in sync."

fi
done

# Git setup
git config --global user.email "github-actions[bot]@users.noreply.github.com"
git config --global user.name "github-actions[bot]"

# commit and push changes
git commit -m "Sync notebooks and script files" || echo "No changes to commit"
git push
1 change: 0 additions & 1 deletion .gitignore
@@ -1,3 +1,2 @@
*.egg-info
__pycache__
notebooks
168 changes: 87 additions & 81 deletions README.md
@@ -1,44 +1,40 @@
Template file and folder structure for data pipelines
# About this repo
Template file and folder structure for data pipelines written in python and Jupyter notebooks

The file and code structure of this repo is inspired by the
## Acknowledgements
The repo is inspired by the
[The Good Research Code Handbook](https://goodresearch.dev/setup#)
by Patrick Mineault.
This handbook is a great resource for setting up coding projects in academic research.
by Patrick Mineault,
which is a fantastic resource for coding in academic research.

## Workflows and code
The workflows are stored as Jupyter notebooks in the [`notebooks`](./notebooks) directory.
The backend code is stored in the [`src`](./src) directory.
After following the installation instructions,
the code can be used via `from src import ...`,
as is done in [`example_workflow.ipynb`](./notebooks/example_workflow.ipynb).

# Installation
The code in the `src` directory is installed as a local, editable python package.
It is installed via the python package manager `pip` into a `conda` environment
and is then accessible via `from src import ...` in any python script.
The following setup will
- create a `conda` environment with the packages defined in the [`environment.yml`](./environment.yml) file
- install the code in the `src` directory as a local python module via `pip`

## Prerequisites
### Conda
Download and install [miniforge](https://github.com/conda-forge/miniforge)
(recommended if you do _not_ have Anaconda already installed).
This gives you a light-weight `conda` installation and access to the
### conda/mamba
Download and install [miniforge](https://github.com/conda-forge/miniforge),
which gives you a light-weight `conda` installation and access to the
faster `mamba` package manager.

If you already have [Anaconda](https://www.anaconda.com/download) installed,
you can just use that if you do not want to switch to miniforge.

How to work with `conda` environments can be found in the
This is recommended if you do _not_ have Anaconda already installed.
Otherwise, just use [Anaconda](https://www.anaconda.com/download).
How to work with `conda` environments is explained in the
[`mamba`](https://mamba.readthedocs.io/en/latest/user_guide/mamba.html#)
or
[`conda`](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html)
user guides.

### Git
Download and install [`git`](https://git-scm.com/downloads) for your operating system.

While it is not strictly necessary to use `git clone` to download the code,
it is recommended because you can easily get the latest updates via `git pull`
(see below).

## Dependencies
Dependencies on python packages are stored in the `environment.yml` file.
This file is used to create the `conda` environment needed to run the code in this project.
The packages are typically installed from
- the `conda-forge` channel
- `pypi` for packages not available on `conda-forge`
Download and install [`git`](https://git-scm.com/downloads) version control.
`git` is the recommended way to acquire and update the code, but you can also manually download this repo.

## Installing local code
Getting the code up and running on a new system is done via the following commands in the `conda` prompt (adapt the URL and name to your repo):
@@ -48,73 +44,83 @@ git clone https://github.com/nspiller/template_data_pipelines
cd template_data_pipelines

# create conda environment with necessary dependencies
conda env create -n template_data_pipelines -f environment.yml
conda env create -y -n template_data_pipelines -f environment.yml
conda activate template_data_pipelines

# install project code as a local python module
pip install -e .

# convert scripts to notebooks
python src/create_notebooks.py
```

If using `mamba`, simply replace `conda` with `mamba` in the above commands.

If `git` is not installed on the system, the code can also be downloaded as a zip file from the github website.

## Updating the local code
```
# update with latest changes
To update the local code with any changes made to the repo, run
```bash
git pull origin main

# convert scripts to notebooks
python src/create_notebooks.py
pip install -e .
```

Note that this will result in an error if you have modified any file other than those in the `notebooks` folder.
To revert any changes, use `git status` to see which files have been modified and then `git reset --hard` to revert the changes.
Then run the `git pull` command again.
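
For example, one possible sequence (destructive for any uncommitted local work) could look like this:

```bash
git status            # see which files have been modified locally
git reset --hard      # discard those modifications (cannot be undone)
git pull origin main  # fetch and merge the latest changes
pip install -e .      # reinstall the local package in editable mode
```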

If `git` is not installed on your system, you will have to set up a new environment for the updated code to avoid any conflicts.

# Working with script files
A convenient way to work with python code is via Jupyter notebooks,
which allow for interactive code execution and visualization along with
documentation in markdown format.
However,
Jupyter notebooks are saved as `JSON` files, which do not work well with git version control.
Therefore, no Jupyter notebooks, i.e. `.ipynb` files, are stored in this repo.
Instead, they are stored as `.py` python script files in the [`scripts/`](./scripts/) folder.
Note that the `.py` files do not contain any output cells.

These script files can be easily opened as Jupyter notebooks.
This is done via the [`jupytext`](https://jupytext.readthedocs.io/en/latest/index.html)
python module.
This way one has the convenience of working with Jupyter notebooks,
but avoids the problems of version control with `.ipynb` files.

## option 1: Opening script files using the Jupytext VSCode extension
If you are working with VSCode,
the most convenient way to open the `.py` files as Jupyter notebooks is the extension
[Jupytext for Notebooks (congyiwu)](https://marketplace.visualstudio.com/items?itemName=congyiwu.vscode-jupytext).
This extension adds the option "Open as Jupyter Notebook" to the `.py` files
(right click on the file in the explorer view).
The `.ipynb` file is never actually stored on disk,
but any changes made to the `.ipynb` file are directly written to the `.py` file.
The `.py` file can then be committed to version control.

## option 2: Manually converting between script and notebook files with the jupytext CLI
`jupytext` can be called from the command line to convert between `.ipynb` and `.py` files.
This [command line script](./src/create_notebooks.py) cycles through all `.py` files in the
`scripts` folder and creates the corresponding `.ipynb` files in the `notebooks` folder.
Note that this will result in an error if you have modified any files.
To revert any changes, use `git status` to see which files have been modified, then `git reset --hard` to revert the changes, and rerun the above commands.

# For developers
## Python dependencies
Necessary python packages including versions are declared in the [`environment.yml`](./environment.yml) file
and installed upon creating the `conda` environment.
The packages are typically installed from
- the `conda-forge` channel
- `pypi` for packages not available on `conda-forge`
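
If the environment already exists, one way to pick up changes made to `environment.yml` (using the environment name from the installation section above) is a sketch like:

```bash
# update the existing environment in place and drop packages
# that were removed from environment.yml
conda env update -n template_data_pipelines -f environment.yml --prune
```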

## Working with Jupyter notebooks
Jupyter notebooks are a user-friendly way to create and document workflows using a combination of markdown and code cells.
However, they are saved along with lots of metadata in `JSON` format,
which is not well suited for version control.

Here, each notebook is paired with a `.py` script file containing identical code/markdown data.
With this setup, changes to the workflows can be easily tracked in the `.py` files,
while the user can still work with the notebook files.
The conversion is done via the [Jupytext](https://jupytext.readthedocs.io/en/latest/) package, which provides multiple ways to convert between `.ipynb` and `.py` files.

```
conda activate template_data_pipelines
cd template_data_pipelines
python src/create_notebooks.py
```
```
└── notebooks
    ├── workflow1.ipynb
    ├── workflow2.ipynb
    └── scripts
        ├── workflow1.py
        └── workflow2.py
```
## Jupytext workflows
### Manual conversion
The simplest way to convert between `.ipynb` and `.py` files is to use the
[Jupytext CLI](https://jupytext.readthedocs.io/en/latest/using-cli.html).
Note that this process can be further streamlined using the
[Jupytext configuration file](https://jupytext.readthedocs.io/en/latest/config.html)
(see [`pyproject.toml`](./pyproject.toml)).
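
As a minimal sketch (the file names below are only examples taken from this repo's layout), converting a single pair by hand could look like:

```bash
# notebook -> paired script in py:percent format, dropping metadata
jupytext --opt notebook_metadata_filter="-all" --opt cell_metadata_filter="-all" \
    --to py:percent notebooks/example_workflow.ipynb --output notebooks/scripts/example_workflow.py

# script -> notebook, preserving outputs already stored in the .ipynb
jupytext --update --to ipynb notebooks/scripts/example_workflow.py --output notebooks/example_workflow.ipynb
```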

This repo also provides a command line tool as a wrapper around `jupytext` to convert between all `.ipynb` and `.py` files.
The script is called via
```
# compare timestamps and update older
python src/sync_notebooks.py
```

Note that this is designed for a one-way conversion to merge the latest
changes from the repo,
so any changes in the `.ipynb` file will be overwritten.
To merge the changes from the `.ipynb` file to the `.py` file, refer to the
[`jupytext`](https://jupytext.readthedocs.io/en/latest/index.html) documentation.

Note that it is the user's responsibility to ensure that the `.py` and `.ipynb` files are in sync before committing to version control.

### Automatic conversion
One can ensure that the `.py` and `.ipynb` files are always up to date using GitHub Actions.
The Actions file [`.github/workflows/sync_notebooks.yml`](./.github/workflows/sync_notebooks.yml) is triggered on every pull request and on pushes to the main branch.
It then syncs notebooks and script files, and adds potential changes as a new commit, either to the pull request or directly to the main branch.
Since the Actions bot is creating commits, it is important to enable write permissions for the bot in the repository settings.

Note that this approach compares notebook and script files based on their commit history, because git does not store file modification times.
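
Roughly, the comparison in the workflow above boils down to the following sketch (file names are just examples):

```bash
nb=notebooks/example_workflow.ipynb
py=notebooks/scripts/example_workflow.py

nb_commit=$(git log -1 --format="%H" -- "$nb")    # last commit touching the notebook
py_commit=$(git log -1 --format="%H" -- "$py")    # last commit touching the script
base=$(git merge-base "$nb_commit" "$py_commit")  # most recent common ancestor

if [ "$nb_commit" != "$base" ]; then
    echo "notebook changed more recently -> regenerate the script"
elif [ "$py_commit" != "$base" ]; then
    echo "script changed more recently -> regenerate the notebook"
else
    echo "both files last changed in the same commit -> assumed in sync"
fi
```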

### Jupytext extension for VSCode
When working with VSCode, the extension
[Jupytext for Notebooks (congyiwu)](https://marketplace.visualstudio.com/items?itemName=congyiwu.vscode-jupytext)
is recommended.
This allows you to open a `.py` file as a Jupyter notebook.
Changes to that notebook are directly written via Jupytext to the `.py` file
and the `.ipynb` file is never actually stored in the working directory.

11 changes: 5 additions & 6 deletions environment.yml
@@ -1,12 +1,11 @@
channels:
- conda-forge
dependencies:
- python=3.8
- python=3.9
- numpy
- scipy
- pandas
- matplotlib
- seaborn
- pandas
- jupyter
- ipykernel
- jupytext
- pip
- pip:
- jupytext~=1.0
Empty file removed notebooks/.gitkeep
Empty file.
87 changes: 87 additions & 0 deletions notebooks/example_workflow.ipynb
@@ -0,0 +1,87 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "7f0d30d7",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from src import computations, plotting"
]
},
{
"cell_type": "markdown",
"id": "f26c2ddd",
"metadata": {},
"source": [
"# Example 1\n",
"Here we can explain that we are\n",
"1. generating an array of random numbers using `numpy`\n",
"2. calculating the mean using our custom code in the `src` module"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ef311c7",
"metadata": {},
"outputs": [],
"source": [
"# Generate data\n",
"data = []\n",
"for _ in range(10000):\n",
" arr = np.random.rand(10)\n",
" value = computations.compute_mean(arr)\n",
" data.append(value)"
]
},
{
"cell_type": "markdown",
"id": "e534715c",
"metadata": {},
"source": [
"Next, we plot the data.\n",
"The plot is visible when we run the script file cell-by-cell or as a Jupyter notebook.\n",
"We could also run the entire script file top to bottom and save the plot to a file."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "781568e8",
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plotting.plot_distribution(data)\n",
"# fig.savefig(\"/some/other/folder/plot.png\")"
]
}
],
"metadata": {
"jupytext": {
"cell_metadata_filter": "-all",
"custom_cell_magics": "kql"
},
"kernelspec": {
"display_name": "template",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -0,0 +1,6 @@
[tool.jupytext.formats]
"notebooks/" = "ipynb"
"notebooks/scripts/" = "py:percent"
[tool.jupytext]
notebook_metadata_filter="-all"
cell_metadata_filter="-all"
57 changes: 0 additions & 57 deletions scripts/example_workflow.py

This file was deleted.

23 changes: 0 additions & 23 deletions scripts/more_complex_pipeline.py

This file was deleted.

6 changes: 5 additions & 1 deletion setup.py
@@ -1,3 +1,7 @@
from setuptools import find_packages, setup

setup(name="src", version="0.1.0", packages=find_packages())
setup(
name="src",
version="0.1.0",
packages=find_packages()
)
40 changes: 0 additions & 40 deletions src/create_notebooks.py

This file was deleted.

35 changes: 35 additions & 0 deletions src/sync_notebooks.py
@@ -0,0 +1,35 @@
import subprocess
from pathlib import Path


def sync(notebook_folder, script_folder):
    """Cycle through Jupyter notebooks and sync with paired Python scripts.

    Parameters
    ----------
    notebook_folder : path-like
        Path to folder where .ipynb files will be saved
    script_folder : path-like
        Path to folder where the paired .py files will be saved
    """
notebook_files = Path(notebook_folder).glob("*.ipynb")
script_files = Path(script_folder).glob("*.py")
files = list(notebook_files) + list(script_files)

for f in files:
command = f"jupytext --sync {f}"
print(f">> RUNNING {command}")
subprocess.run(command.split())

if __name__ == "__main__":

import argparse

parser = argparse.ArgumentParser(
description="Convert between Jupyter notebooks and Python scripts using jupytext.",
epilog="""
Requires pyproject.toml with jupytext configuration.
IMPORTANT: does not handle white spaces in file names
""",
)

args = parser.parse_args()
sync("notebooks", "notebooks/scripts")