[WIP] Add new GPU runner for E2E tests
Use previous GPU runner for unit tests

Signed-off-by: Nathan Weinberg <[email protected]>
nathan-weinberg committed Jul 16, 2024
1 parent ae6097f commit ab0af26
Showing 10 changed files with 300 additions and 85 deletions.
205 changes: 205 additions & 0 deletions .github/workflows/e2e-nvidia-a10g-x1.yml
@@ -0,0 +1,205 @@
# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA A10G x1)

on:
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'

jobs:
  start-runner:
    name: Start external EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-00c51d9c1374eda97
          ec2-instance-type: g5.2xlarge
          subnet-id: subnet-02d230cffd9385bd4
          security-group-id: sg-06300447c4a5fbef3
          iam-role-name: instructlab-ci-runner
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]
  e2e:
    name: E2E Test
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: Checkout instructlab/eval
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Checkout instructlab/instructlab
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          repository: "instructlab/instructlab"
          path: "instructlab"
          fetch-depth: 0

      - name: Determine if pr_or_branch is a PR number
        id: check_pr
        run: |
          nvidia-smi
          if [[ "${{ github.event.inputs.pr_or_branch }}" =~ ^[0-9]+$ ]]; then
            echo "is_pr=true" >> "$GITHUB_OUTPUT"
          else
            echo "is_pr=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Check if gh cli is installed
        id: gh_cli
        run: |
          if command -v gh &> /dev/null ; then
            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
          else
            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Install gh CLI
        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
        run: |
          sudo dnf install 'dnf-command(config-manager)' -y
          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
          sudo dnf install gh --repo gh-cli -y
      - name: Test gh CLI
        run: |
          gh --version
      - name: Set default repo
        run: |
          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Add comment to PR
        if: steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Fetch and checkout PR
        if: steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr checkout ${{ github.event.inputs.pr_or_branch }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Checkout branch
        if: steps.check_pr.outputs.is_pr == 'false'
        run: |
          git checkout ${{ github.event.inputs.pr_or_branch }}
      - name: Install Packages
        run: |
          cat /etc/os-release
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
      - name: Install ilab
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          python3.11 -m venv venv
          . venv/bin/activate
          nvidia-smi
          sed 's/\[.*\]//' requirements.txt > constraints.txt
          python3.11 -m pip cache remove llama_cpp_python
          CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3.11 -m pip install --force-reinstall --no-binary llama_cpp_python -c constraints.txt llama_cpp_python
          python3.11 -m pip install bitsandbytes
          # TODO This should be added to instructlab-training
          python3.11 -m pip install packaging wheel
          python3.11 -m pip install instructlab-training[cuda]
          # Install the local version of eval before installing the CLI so PR changes are included
          python3.11 -m pip install .
          python3.11 -m pip install instructlab
      - name: Run e2e test
        run: |
          nvidia-smi
          # This env variable helps on GPUs with less vRAM: it only allows CUDA to allocate small chunks of vRAM at a time, which usually helps avoid OOM errors.
          # This is not a good solution for production code, as setting environment variables for users isn't best practice. However, it is a helpful manual workaround.
          export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:128
          . venv/bin/activate
          # TODO: for some reason we need to reinstall DeepSpeed in order to get fused adam support
          # This means we need to manually rm and re-install a bunch of packages. Investigate why this is.
          python3.11 -m pip uninstall -y deepspeed
          python3.11 -m pip cache purge
          DS_BUILD_CPU_ADAM=1 BUILD_UTILS=1 python3.11 -m pip install deepspeed
          nvidia-smi
          python3.11 -m pip show nvidia-nccl-cu12
          cd instructlab
          ./scripts/basic-workflow-tests.sh -em
      - name: Add comment to PR if the workflow failed
        if: failure() && steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Add comment to PR if the workflow succeeded
        if: success() && steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  stop-runner:
    name: Stop external EC2 runner
    needs:
      - start-runner
      - e2e
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
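
Because this workflow is triggered only by workflow_dispatch, a run has to be launched by hand. A minimal sketch of dispatching it with the gh CLI, assuming the file lands at .github/workflows/e2e-nvidia-a10g-x1.yml (the PR number below is hypothetical):

    # Launch the E2E run against a PR number (digits) or a branch name;
    # pr_or_branch is the input defined in the workflow above.
    gh workflow run e2e-nvidia-a10g-x1.yml -f pr_or_branch=1234
    gh workflow run e2e-nvidia-a10g-x1.yml -f pr_or_branch=main
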
58 changes: 32 additions & 26 deletions .github/workflows/e2e.yml → .github/workflows/test.yml
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0

name: E2E test
name: Test

on:
  push:
@@ -11,7 +11,7 @@ on:
      - '**.py'
      - 'pyproject.toml'
      - 'requirements*.txt'
      - '.github/workflows/e2e.yml'
      - '.github/workflows/test.yml'
  pull_request:
    branches:
      - "main"
@@ -20,19 +20,15 @@ on:
      - '**.py'
      - 'pyproject.toml'
      - 'requirements*.txt'
      - '.github/workflows/e2e.yml'
      - '.github/workflows/test.yml'

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  e2e:
  unit:
    runs-on: ubuntu-gpu

    permissions:
      pull-requests: write

    steps:
      # No step-security/harden-runner since this is a self-hosted runner
      - name: Checkout instructlab/eval
@@ -41,14 +37,23 @@ jobs:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      # this is needed for branch tests
      - name: Checkout instructlab/taxonomy
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          repository: "instructlab/taxonomy"
          path: "taxonomy"
          fetch-depth: 0

      # this is needed for judge_answer tests
      - name: Checkout instructlab/instructlab
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          repository: "instructlab/instructlab"
          path: "instructlab"
          fetch-depth: 0

      - name: Install Packages
      - name: Install system packages
        run: |
          sudo apt-get install -y cuda-toolkit git cmake build-essential virtualenv
          nvidia-smi
@@ -67,34 +72,35 @@
        run: |
          pip cache remove llama_cpp_python
      - name: Cache huggingface
        uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
        with:
          path: ~/.cache/huggingface
          # config contains DEFAULT_MODEL
          key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }}
      - name: Install dependencies
        run: |
          python3.11 -m venv venv
          . venv/bin/activate
          python3.11 -m pip install .
          python3.11 -m pip install pytest
      - name: Install instructlab and instructlab-eval
      - name: Start inference server
        run: |
          export PATH="/home/runner/.local/bin:/usr/local/cuda/bin:$PATH"
          python3 -m venv venv
          . venv/bin/activate
          cd instructlab
          python3.11 -m venv cli_venv
          . cli_venv/bin/activate
          sed 's/\[.*\]//' requirements.txt > constraints.txt
          python3 -m pip cache remove llama_cpp_python
          python3.11 -m pip cache remove llama_cpp_python
          CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --no-binary llama_cpp_python -c constraints.txt llama_cpp_python
          # needed for --4-bit-quant option to ilab train
          python3 -m pip install bitsandbytes
          python3.11 -m pip install bitsandbytes
          # install instructlab
          python3 -m pip install .
          cd ..
          # Install instructlab-eval
          python3 -m pip install .
          python3.11 -m pip install .
          # start llama-cpp server
          ilab model download --repository instructlab/granite-7b-lab-GGUF --filename granite-7b-lab-Q4_K_M.gguf
          ilab model serve --model-path models/granite-7b-lab-Q4_K_M.gguf
      - name: Run e2e test
      - name: Run unit tests
        run: |
          export INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=5
          . venv/bin/activate
          ./instructlab/scripts/basic-workflow-tests.sh -cm
          python3.11 -m pytest
      - name: Remove llama-cpp-python from cache
        if: always()
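
Both workflows rebuild llama_cpp_python from source so the CUDA (cuBLAS) backend is compiled in, first evicting any cached wheel that would short-circuit the build. A minimal local sketch of the same pattern, assuming it runs from a checkout that has a requirements.txt:

    # Evict any cached wheel so CMAKE_ARGS actually takes effect,
    # then force a source build with the cuBLAS backend enabled.
    pip cache remove llama_cpp_python
    sed 's/\[.*\]//' requirements.txt > constraints.txt
    CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install --no-binary llama_cpp_python -c constraints.txt llama_cpp_python
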
1 change: 1 addition & 0 deletions .gitignore
@@ -53,6 +53,7 @@ coverage.xml
.hypothesis/
.pytest_cache/
cover/
eval_output/

# Translations
*.mo
1 change: 1 addition & 0 deletions README.md
@@ -1,6 +1,7 @@
# eval

![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main)
![Test](https://github.com/instructlab/eval/actions/workflows/test.yml/badge.svg?branch=main)
![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main)
![Release](https://img.shields.io/github/v/release/instructlab/eval)
![License](https://img.shields.io/github/license/instructlab/eval)
16 changes: 9 additions & 7 deletions tests/test_branch_gen_answers.py
@@ -1,10 +1,12 @@
# First Party
from instructlab.eval.mt_bench import MTBenchBranchEvaluator

mt_bench_branch = MTBenchBranchEvaluator(
    "instructlab/granite-7b-lab",
    "instructlab/granite-7b-lab",
    "../taxonomy",
    "main",
)
mt_bench_branch.gen_answers("http://localhost:8000/v1")


def test_branch_gen_answers():
    mt_bench_branch = MTBenchBranchEvaluator(
        "instructlab/granite-7b-lab",
        "instructlab/granite-7b-lab",
        "taxonomy",
        "main",
    )
    mt_bench_branch.gen_answers("http://localhost:8000/v1")
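
With the script refactored into a pytest-style function, this file is picked up by the plain `python3.11 -m pytest` run in the Test workflow. A hedged sketch of running it in isolation, assuming a taxonomy checkout in the working directory and an OpenAI-compatible server already listening on localhost:8000 (as the CI job arranges via ilab model serve):

    # The evaluator reads branches from ./taxonomy, matching the CI checkout path
    git clone https://github.com/instructlab/taxonomy
    # Run just this test; it sends requests to the local inference server
    python3.11 -m pytest tests/test_branch_gen_answers.py
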
