Refactor for Dataflow runner (#3)
* test packaging repo for deployment

* dockerize main pipeline

* dockerize and deploy model server

* update pytest workflow to reflect refactored requirements

* only test tests/ dir

* containerize pipeline

* change is_local -> filesystem

* add make target for python venv setup

* add more makefile targets

* accept port from env var

* remove expose from model server docker file

* hardcode default cloud run port

* specify platform on make build + default to port in config

* make pipeline worker image

* fix Dockerfile for pipeline

* move stages to home in pipeline worker Dockerfile

* update datetime storing

* test limit on model server -> max batch duration = 60 sec

* higher throughput resources for faster dataflow runs

* add setup and teardown calls to init and deduplicating data

* update docs

* test push model server bucket

* rm double run of push-deploy

* no need to install for docker builds

* bump actions/checkout@4

* add similar job for pipeline worker

* add publish make target

* make dockerhub username public, so the final image location is visible in the logs

* refactor publish job to separate actions workflow

* update version to first official release
pmhalvor authored Oct 20, 2024
1 parent 68e83b4 commit c42cc6d
Showing 36 changed files with 825 additions and 105 deletions.
55 changes: 55 additions & 0 deletions .github/workflows/deploy.yml
@@ -0,0 +1,55 @@
name: Build & Push Images to Google Artifact Registry

on:
  push:
    branches:
      - main  # Runs on push to the main branch
  pull_request:
    branches:
      - main  # Runs on pull requests to the main branch


env:
  PROJECT_ID: ${{ secrets.PROJECT_ID }}
  SERVICE_ACCOUNT: ${{ secrets.SERVICE_ACCOUNT }}
  WORKLOAD_IDENTITY_PROVIDER: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}
  MODEL_REGISTERY: ${{ secrets.MODEL_REGISTERY }}

jobs:
  build-push-images:
    name: Build and Push Images
    permissions:
      contents: 'read'
      id-token: 'write'

    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Google Auth
        id: auth
        uses: 'google-github-actions/auth@v2'
        with:
          token_format: 'access_token'
          project_id: ${{ env.PROJECT_ID }}
          service_account: ${{ env.SERVICE_ACCOUNT }}
          workload_identity_provider: ${{ env.WORKLOAD_IDENTITY_PROVIDER }}

      - name: Docker Auth
        id: docker-auth
        uses: 'docker/login-action@v1'
        with:
          username: 'oauth2accesstoken'
          password: '${{ steps.auth.outputs.access_token }}'
          registry: '${{ env.MODEL_REGISTERY }}'

      - name: Build and Push Model Server
        run: make build-push-model-server
        env:
          MODEL_REGISTERY: ${{ env.MODEL_REGISTERY }}

      - name: Build and Push Pipeline Worker
        run: make build-push-pipeline-worker
        env:
          MODEL_REGISTERY: ${{ env.MODEL_REGISTERY }}
39 changes: 39 additions & 0 deletions .github/workflows/publish.yml
@@ -0,0 +1,39 @@
name: Publish Latest Images to Docker Hub

on:
  push:
    branches:
      - main  # Runs on push to the main branch


env:
  MODEL_REGISTERY: ${{ secrets.MODEL_REGISTERY }}
  DOCKERHUB_USERNAME: ${{ vars.DOCKERHUB_USERNAME }}
  DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}

jobs:
  publish-model-server:
    name: Publish Latest Images
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Docker Auth (Docker Hub)
        id: docker-auth-dockerhub
        uses: docker/login-action@v3
        with:
          username: ${{ env.DOCKERHUB_USERNAME }}
          password: ${{ env.DOCKERHUB_TOKEN }}

      - name: Publish Latest Model Server Image
        run: make publish-latest-model-server
        env:
          MODEL_REGISTERY: ${{ env.MODEL_REGISTERY }}
          PUBLIC_MODEL_REGISTERY: docker.io/${{ env.DOCKERHUB_USERNAME }}

      - name: Publish Latest Pipeline Worker Image
        run: make publish-latest-pipeline-worker
        env:
          MODEL_REGISTERY: ${{ env.MODEL_REGISTERY }}
          PUBLIC_MODEL_REGISTERY: docker.io/${{ env.DOCKERHUB_USERNAME }}
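
The `publish-latest-*` make targets invoked above live in the repository's Makefile, which is not shown in this diff. A plausible sketch of such a target, assuming the versioned image already exists in the private registry (the image name and `TAG` variable are assumptions):

```bash
# Pull the versioned image, retag it as latest, and push it to the public Docker Hub namespace.
docker pull "${MODEL_REGISTERY}/model-server:${TAG}"
docker tag "${MODEL_REGISTERY}/model-server:${TAG}" "${PUBLIC_MODEL_REGISTERY}/model-server:latest"
docker push "${PUBLIC_MODEL_REGISTERY}/model-server:latest"
```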
4 changes: 2 additions & 2 deletions .github/workflows/python-test.yml
@@ -28,8 +28,8 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+         if [ -f requirements/requirements.txt ]; then pip install -r requirements/requirements.txt; fi
      - name: Run tests with pytest
        run: |
-         python -m pytest
+         python -m pytest tests/
1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ plots/
model/
hide/
table/
env.sh

# Python basic ignores
# Byte-compiled / optimized / DLL files
16 changes: 16 additions & 0 deletions Dockerfile
@@ -0,0 +1,16 @@
FROM apache/beam_python3.11_sdk

COPY src/ /home/src/
COPY requirements/requirements.txt /home/requirements.txt
COPY data/geo /home/data/geo

WORKDIR /home

# Install HDF5 using apt
RUN apt-get update && apt-get install -y \
    libhdf5-dev \
    libsndfile1 \
    gcc
RUN pip install -r requirements.txt

ENTRYPOINT ["python3", "src/pipeline.py"]
12 changes: 12 additions & 0 deletions Dockerfile.model-server
@@ -0,0 +1,12 @@
FROM python:3.11-slim-bullseye

COPY src/model_server.py /home/src/model_server.py
COPY src/config.py /home/src/config.py
COPY src/config/ /home/src/config/
COPY requirements/model-requirements.txt /home/requirements.txt

WORKDIR /home

RUN pip install -r requirements.txt

CMD [ "python", "src/model_server.py" ]
14 changes: 14 additions & 0 deletions Dockerfile.pipeline-worker
@@ -0,0 +1,14 @@
FROM apache/beam_python3.11_sdk

COPY src/ /home
COPY requirements/requirements.txt /home/requirements.txt
COPY data/geo /home/data/geo

WORKDIR /home

# Install HDF5 using apt
RUN apt-get update && apt-get install -y \
    libhdf5-dev \
    libsndfile1 \
    gcc
RUN pip install -r requirements.txt
136 changes: 112 additions & 24 deletions README.md
Expand Up @@ -6,9 +6,57 @@ Derived from <a href="https://docs.mbari.org/pacific-sound/notebooks/humpbackwha
</sub>


## Pipeline description

Stages:
1. **Input**: When (and where*) to look for whale encounters on [HappyWhale](https://happywhale.com/).
2. **Geometry Search**: Query [open-oceans/happywhale](https://github.com/open-oceans/happywhale) to find potential whale encounters.

&rarr; Expected outputs: encounter ids, start and end times, and longitude and latitude.

3. **Retrieve Audio**: Download audio from MBARI's [Pacific Ocean Sound Recordings](https://registry.opendata.aws/pacific-sound/) around the time of the encounter.

&rarr; Expected outputs: audio array, start and end times, and encounter ids.

4. **Parse Audio**: Break audio into non-overlapping segments with flagged frequency detections.

&rarr; Expected outputs: cut audio array, detection intervals, and encounter ids.

5. **Classify Audio**: Use NOAA and Google's [humpback_whale model](https://tfhub.dev/google/humpback_whale/1) to classify the flagged segments.

&rarr; Expected outputs: resampled audio, classification score array, and encounter ids.

6. **Postprocess Labels**: Build clip-intervals for each encounter for playback snippets.

&rarr; Expected outputs: encounter ids, cut/resampled audio array, and aggregated classification score.

7. **Output**: Map the whale encounter ids to the playback snippets.

<!-- Light mode -->
[![](https://mermaid.ink/img/pako:eNpVkttOwkAQhl9lMleaFIJFTo0x4SBIItGoV1ouhnZKm2y7ZA9oJby7S1uJzNX-s98cMweMZMwYYCLkV5SSMvA-CwtwNv5MKEioFZHgIiYFy2JnjV5Dq3UPk6v6cyvkhmHBMmejSnhjUlF63SSoyGlD5lZnEbw6LNszjG2cyYab1FwtppV4aIKSTBhW8EJKX8Y8VNi8wTZWCM0lw1SQ1llSXrBzuGu3Hb1saCU30sDKzS1cw2rP6gyeki4azNAWXqQ2OyUj1hqeaMNCN-iiQh8__9rUKTxb4_az_j_TAk4KPcxZ5ZTFbs-HkydEk3LOIQbuGXNCVpgQw-LoULJGvpVFhIFRlj1U0m5TdFXchB7aXUyGZxltFeVn746KDykvNAYH_Mag2_HbN0P_ZtjvjUa3frff9bDEoHP08KeK6LRHtQ38nt8b3A4HHnKcGalW9WFU93H8BWH3qDQ?type=png)](https://mermaid.live/edit#pako:eNpVkttOwkAQhl9lMleaFIJFTo0x4SBIItGoV1ouhnZKm2y7ZA9oJby7S1uJzNX-s98cMweMZMwYYCLkV5SSMvA-CwtwNv5MKEioFZHgIiYFy2JnjV5Dq3UPk6v6cyvkhmHBMmejSnhjUlF63SSoyGlD5lZnEbw6LNszjG2cyYab1FwtppV4aIKSTBhW8EJKX8Y8VNi8wTZWCM0lw1SQ1llSXrBzuGu3Hb1saCU30sDKzS1cw2rP6gyeki4azNAWXqQ2OyUj1hqeaMNCN-iiQh8__9rUKTxb4_az_j_TAk4KPcxZ5ZTFbs-HkydEk3LOIQbuGXNCVpgQw-LoULJGvpVFhIFRlj1U0m5TdFXchB7aXUyGZxltFeVn746KDykvNAYH_Mag2_HbN0P_ZtjvjUa3frff9bDEoHP08KeK6LRHtQ38nt8b3A4HHnKcGalW9WFU93H8BWH3qDQ)

<!-- Dark mode -->
<!-- [![](https://mermaid.ink/img/pako:eNpVkttOwkAQhl9lMleaFALl3BgTzpJIJOKVlIttO6WN2y7ZA1oJ7-7SViNztf_sN8fMGUMREXoYc_EZJkxqeJv5OVgb72LmxawRMk55xCSs8qPRag-NxiNM7qrPAxcBwZJERloWsCUmw-S-TlCS05rMjEpDeLVYeiIYmygVNTepuEpMSzGvg-KUa5KwYVLdxsxLbFFjgeFcUUEw5UypNC5u2AU8NJuWXtW0FIHQsLZzc9uwPJH8A69JlzWm2QE2QumjFCEpBc8sIK5qdFmiT7vfNlUCL0bb_ez_z7SEq0IHM5IZSyO75_PV46NOKCMfPfu0q_3w0c8vlmNGi22Rh-hpachBKcwhQVvCjuegOUZM0yxlB8myP--R5e9C3Gj0zviFXqflNttDtz3s90ajrtvpdxws0GtdHPwuI1rNUWUDt-f2Bt3hwEGKUi3kurqK8jguPzO5pvE?type=png)](https://mermaid.live/edit#pako:eNpVkttOwkAQhl9lMleaFALl3BgTzpJIJOKVlIttO6WN2y7ZA1oJ7-7SViNztf_sN8fMGUMREXoYc_EZJkxqeJv5OVgb72LmxawRMk55xCSs8qPRag-NxiNM7qrPAxcBwZJERloWsCUmw-S-TlCS05rMjEpDeLVYeiIYmygVNTepuEpMSzGvg-KUa5KwYVLdxsxLbFFjgeFcUUEw5UypNC5u2AU8NJuWXtW0FIHQsLZzc9uwPJH8A69JlzWm2QE2QumjFCEpBc8sIK5qdFmiT7vfNlUCL0bb_ez_z7SEq0IHM5IZSyO75_PV46NOKCMfPfu0q_3w0c8vlmNGi22Rh-hpachBKcwhQVvCjuegOUZM0yxlB8myP--R5e9C3Gj0zviFXqflNttDtz3s90ajrtvpdxws0GtdHPwuI1rNUWUDt-f2Bt3hwEGKUi3kurqK8jguPzO5pvE) -->




<sub>
*Currently only supports encounters around the Monterey Bay Hydrophone (<a href="https://www.mbari.org/technology/monterey-accelerated-research-system-mars/">MARS</a>).
</sub>

<br>

## Getting started

### Install

Create a virtual environment and install the required packages.
We'll use conda for this, but you can use any package manager you prefer.

Since we're developing on an M1 machine, we need to set `CONDA_SUBDIR` to `osx-arm64`.
Adapt this step to the platform and virtual environment tooling you're using.

#### M1:
```bash
CONDA_SUBDIR=osx-arm64 conda create -n whale-speech python=3.11
```
@@ -24,53 +72,93 @@ conda activate whale-speech
```bash
pip install -r requirements.txt
```

### Google Cloud SDK
To run the pipeline on Google Cloud Dataflow, you'll need to install the Google Cloud SDK.
You can find the installation instructions [here](https://cloud.google.com/sdk/docs/install).

Make sure you authenticate with the account you are using and initialize the project you are working in.
```bash
gcloud auth login
gcloud init
```

For newly created projects, each of the services used will need to be enabled.
This can be easily done in the console, or via the command line.
For example:
```bash
gcloud services enable bigquery.googleapis.com
gcloud services enable dataflow.googleapis.com
gcloud services enable storage-api.googleapis.com
gcloud services enable run.googleapis.com
```

### Run locally
To run the pipeline and model server locally, you can use the `make` target `local-run`.

```bash
make local-run
```

This target starts by killing any model server that might still be running (needed when a previous pipeline run fails without tearing down the server, which would otherwise cause the next call to hang).
Then it starts the model server in the background and runs the pipeline.
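
A rough sketch of what this target might do under the hood (the exact commands and process names are assumptions, not the repository's Makefile):

```bash
# Tear down any model server left over from a failed run, then start fresh.
pkill -f model_server.py || true
python3 src/model_server.py &    # start the model server in the background
MODEL_SERVER_PID=$!
python3 src/pipeline.py          # run the pipeline against the local server
kill $MODEL_SERVER_PID           # tear the server down once the pipeline finishes
```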

### Build and push the model server
To build and push the model server to your model registry (stored as an environment variable), you can use the following `make` target.

```bash
make build-push-model-server
```
This target builds the model server image and pushes it to the registry specified in the `env.sh` file.
The tag is a combination of the version set in the makefile and the last git commit hash.
This helps keep track of what is included in the image, and allows for easy rollback if needed.
The target fails if there are any uncommitted changes in the git repository.

The `latest` tag is only added to images deployed via GHA.
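
A minimal sketch of how such a tag could be assembled and checked (the `VERSION` variable, image name, and exact commands are assumptions, not the actual Makefile):

```bash
# Refuse to build from a dirty working tree, then tag as <version>-<short commit hash>.
if [ -n "$(git status --porcelain)" ]; then
  echo "Uncommitted changes detected; aborting build." >&2
  exit 1
fi
TAG="${VERSION}-$(git rev-parse --short HEAD)"
docker build -f Dockerfile.model-server -t "${MODEL_REGISTERY}/model-server:${TAG}" .
docker push "${MODEL_REGISTERY}/model-server:${TAG}"
```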

### Run pipeline with Dataflow
To run the pipeline on Google Cloud Dataflow, you can use the following `make` target.

```bash
make run-dataflow
```
Logging in the terminal will tell you the status of the pipeline, and you can follow the progress in the [Dataflow console](https://console.cloud.google.com/dataflow/jobs).

In addition to providing the inference URL and the filesystem to store outputs on, the definition of the above target also shows how a user can pass additional arguments to the pipeline run and request different resources for it.

**Pipeline-specific parameters**
You can configure all the parameters set in the config files directly when running the pipeline.
The most important here are probably the start and end times for the initial search.

```bash
--start "2024-07-11" \
--end "2024-07-11" \
--offset 0 \
--margin 1800 \
--batch_duration 60
```

Note that when parameters with the same name appear under different config sections, only the value from the last section in the list is applied.
Also, since these argparse parameters are added automatically, boolean flags may behave unexpectedly (a value of true is always added).
<!-- TODO fix behavior of boolean in-line parameters -->

**Compute resources**
The default compute resources are quite small and slow. To speed things up, you can request more workers and a larger machine type. For more on Dataflow resources, check out [the docs](https://cloud.google.com/dataflow/docs/reference/pipeline-options#worker-level_options).
```bash
--worker_machine_type=n1-highmem-8 \
--disk_size_gb=100 \
--num_workers=8 \
--max_num_workers=8 \
```
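
Putting the two groups of flags together, a full invocation might look roughly like the sketch below (the project, region, and bucket values are placeholders, and the actual `make run-dataflow` target may differ; the resource flags are standard Beam/Dataflow pipeline options):

```bash
python3 src/pipeline.py \
  --runner DataflowRunner \
  --project "$PROJECT_ID" \
  --region us-central1 \
  --temp_location gs://<your-bucket>/temp \
  --worker_machine_type n1-highmem-8 \
  --disk_size_gb 100 \
  --num_workers 8 \
  --max_num_workers 8 \
  --start "2024-07-11" \
  --end "2024-07-11" \
  --offset 0 \
  --margin 1800 \
  --batch_duration 60
```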


Note that you may need to configure IAM permissions to allow Dataflow runners to access images in your Artifact Registry. Read more about that [here](https://cloud.google.com/dataflow/docs/concepts/security-and-permissions).
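
For example, granting the Dataflow worker service account read access to the registry could look like this (the service account name is a placeholder; `roles/artifactregistry.reader` is the standard read role):

```bash
gcloud projects add-iam-policy-binding "$PROJECT_ID" \
  --member="serviceAccount:<dataflow-worker-sa>@${PROJECT_ID}.iam.gserviceaccount.com" \
  --role="roles/artifactregistry.reader"
```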

<sub>
*Currently only supports encounters around the Monterey Bay Hydrophone (<a href="https://www.mbari.org/technology/monterey-accelerated-research-system-mars/">MARS</a>).
</sub>

## Resources
- [HappyWhale](https://happywhale.com/)
- [open-oceans/happywhale](https://github.com/open-oceans/happywhale)
- [MBARI's Pacific Ocean Sound Recordings](https://registry.opendata.aws/pacific-sound/)
- [NOAA and Google's humpback_whale model](https://tfhub.dev/google/humpback_whale/1)
- [Monterey Bay Hydrophone MARS](https://www.mbari.org/technology/monterey-accelerated-research-system-mars/)
- [Google Cloud Console](https://console.cloud.google.com/)