Skip to content

Commit

Permalink
Add GPU-enabled CI (#170)
Browse files Browse the repository at this point in the history
* Add GPU-enabled CI

* Update file reference

* Remove other CI

* Bump runner version

* Try allocating 10 GB

* Debug

* Debug

* Sync CUDA environment with DGL environment

* Debug Torch/CUDA interaction

* Try adding `pytorch-gpu`

* Debug

* tmp add print and trial test

* check dgl

* add torchdata package

* add other torch packages required

* Try bumping to newer DGL channel targeting PyTorch 2.1

* Add back `pytorch-gpu`?

* Revert "tmp add print and trial test"

This reverts commit 6c5f42c.

* Revert more temporary changes, fix coverage

* Syntax

* Debug

* Debug

* Fix

---------

Co-authored-by: Lily Wang <[email protected]>
  • Loading branch information
mattwthompson and lilyminium authored Jan 30, 2025
1 parent 2bc95a0 commit fbcf4f2
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 12 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/base-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,4 @@ jobs:
assert str(RDKIT_AVAILABLE).lower() == 'true'
- name: Run tests
run: |
python -m pytest -n 4 -v --cov=openff/nagl --cov-config=setup.cfg --cov-append --cov-report=xml --color=yes openff/nagl/
run: python -m pytest -n logical --cov=openff/nagl --cov-config=pyproject.toml --cov-append --cov-report=xml --color=yes openff/nagl/
2 changes: 1 addition & 1 deletion .github/workflows/dev-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,5 +106,5 @@ jobs:
- name: Run tests
run: |
python -m pytest -n 4 -v --cov=openff/nagl --cov-config=setup.cfg --cov-append --cov-report=xml --color=yes openff/nagl/
python -m pytest -n 4 -v --cov=openff/nagl --cov-config=pyproject.toml --cov-append --cov-report=xml --color=yes openff/nagl/
8 changes: 3 additions & 5 deletions .github/workflows/gh-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ jobs:
openeye-license-text: ${{ secrets.OE_LICENSE }}
openeye-license-file: ${{ env.OE_LICENSE }}


- name: Uninstall OpenEye
if: matrix.include-openeye == false
run: micromamba remove --force openeye-toolkits --yes || echo "openeye not installed"
Expand Down Expand Up @@ -98,16 +97,15 @@ jobs:
- name: Run tests
run: |
python -m pytest -v --cov=openff/nagl --cov-config=setup.cfg --cov-append --cov-report=xml --color=yes openff/nagl/
python -m pytest -v --cov=openff/nagl --cov-config=pyproject.toml --cov-append --cov-report=xml --color=yes openff/nagl/
- name: codecov
uses: codecov/codecov-action@v4
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: ./coverage.xml
files: ./coverage.xml
verbose: True
# name: codecov-${{ matrix.os }}-py${{ matrix.python-version }}


pylint_check:
runs-on: ubuntu-latest
Expand Down
98 changes: 98 additions & 0 deletions .github/workflows/gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Workflow with three chained jobs: start a cloud runner on AWS, run the test
# suite on a self-hosted runner, then stop the instances that were started.
name: GPU-enabled CI

# Triggered manually, or by pull requests that target `main`.
on:
  workflow_dispatch:
  pull_request:
    branches: [main]

# All `run` steps execute in a login shell (`bash -l`); presumably this is what
# makes the micromamba environment available to later steps -- TODO confirm.
defaults:
  run:
    shell: bash -l {0}

jobs:
  # Starts the cloud runner and publishes the action's `mapping` output so the
  # stop job can refer to it.
  start-aws-runner:
    runs-on: ubuntu-latest
    outputs:
      mapping: ${{ steps.aws-start.outputs.mapping }}
    permissions:
      contents: read
      # Presumably required for assuming the AWS role via OIDC -- TODO confirm.
      id-token: write
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::649715411074:role/gh-actions-runner-role

      - name: Create cloud runner
        id: aws-start
        # NOTE(review): the action reference below appears to have been
        # mangled by e-mail obfuscation when this page was captured; restore
        # the real `owner/repo@version` from the repository before use.
        uses: omsf-eco-infra/[email protected]
        env:
          GH_PAT: ${{ secrets.GH_PAT }}
        with:
          provider: aws
          action: start
          aws_region_name: us-east-1
          aws_image_id: ami-0d5079d9be06933e5
          aws_instance_type: g4dn.xlarge
          aws_home_dir: /home/ubuntu
          # Explicit root device size (presumably in GB). The original note
          # read "IAM default might be 5 GB?" -- likely the AMI default was
          # meant; confirm.
          aws_root_device_size: 125

  # Runs the checks and the test suite on the self-hosted runner.
  self-hosted-test:
    needs: [start-aws-runner]
    runs-on: self-hosted
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # Diagnostic output only; `|| true` keeps the step green if the
      # `docker version` command fails.
      - name: Print disk usage
        run: df -h

      - name: Print Docker details
        run: docker version || true

      - name: Setup Conda Environment
        uses: mamba-org/setup-micromamba@v2
        with:
          environment-file: devtools/conda-envs/test_cuda_env.yaml

      # `--no-deps`: pip installs only this package, not its dependencies.
      - name: Install Package
        run: python -m pip install . --no-deps

      # Sanity checks before the test run: the package imports, PyTorch
      # reports CUDA as available, and DGL imports.
      - name: Double-check local installation
        run: python -c "from openff.nagl import __version__; print(__version__)"

      - name: Check that PyTorch can see CUDA
        run: python -c "import torch; assert torch.cuda.is_available()"

      - name: Check we can see DGL
        run: python -c "import dgl; print(dgl.__version__)"

      # Folded scalar: the lines below are joined with single spaces into one
      # command line.
      - name: Run tests
        run: >-
          python -m pytest -n 4 -v
          --cov=openff/nagl
          --cov-config=pyproject.toml
          --cov-append
          --cov-report=xml
          --color=yes
          openff/nagl/

  # Stops the instances recorded in the start job's `mapping` output.
  stop-aws-runner:
    needs:
      - start-aws-runner
      - self-hosted-test
    # `always()` makes this job run even when a job it depends on failed or
    # was cancelled.
    if: ${{ always() }}
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::649715411074:role/gh-actions-runner-role

      - name: Stop instances
        # NOTE(review): same mangled action reference as in the start job;
        # restore the real `owner/repo@version` before use.
        uses: omsf-eco-infra/[email protected]
        env:
          GH_PAT: ${{ secrets.GH_PAT }}
        with:
          provider: aws
          action: stop
          aws_region_name: us-east-1
          instance_mapping: ${{ needs.start-aws-runner.outputs.mapping }}
12 changes: 8 additions & 4 deletions devtools/conda-envs/test_cuda_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name: openff-nagl-test-cuda
channels:
- openeye
- dglteam/label/cu117
- dglteam/label/th21_cu118
- pytorch
- nvidia
- conda-forge
Expand All @@ -29,10 +29,14 @@ dependencies:

# gcn
- cudatoolkit
- dgl ==1.1.2
- pytorch >=2.0
- dgl ~=2.1
- torchdata
- torchvision
- torchaudio
- pytorch ==2.1
- pytorch-gpu # is effectively a subpackage?
- pytorch-lightning
- pytorch-cuda ==11.7
- pytorch-cuda ==11.8

# parallelism
- dask-jobqueue
Expand Down

0 comments on commit fbcf4f2

Please sign in to comment.