Commit

[WIP] Add GPU runner job
Signed-off-by: Nathan Weinberg <[email protected]>
nathan-weinberg committed Jul 16, 2024
1 parent ae6097f commit 1c17a00
Showing 8 changed files with 255 additions and 55 deletions.
197 changes: 197 additions & 0 deletions .github/workflows/e2e-nvidia-a10g-x1.yml
@@ -0,0 +1,197 @@
# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA A10G x1)

on:
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'

jobs:
  start-runner:
    name: Start external EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-00c51d9c1374eda97
          ec2-instance-type: g5.2xlarge
          subnet-id: subnet-02d230cffd9385bd4
          security-group-id: sg-06300447c4a5fbef3
          iam-role-name: instructlab-ci-runner
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]
  e2e:
    name: E2E Test
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: Checkout instructlab/eval
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Determine if pr_or_branch is a PR number
        id: check_pr
        run: |
          nvidia-smi
          if [[ "${{ github.event.inputs.pr_or_branch }}" =~ ^[0-9]+$ ]]; then
            echo "is_pr=true" >> "$GITHUB_OUTPUT"
          else
            echo "is_pr=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Check if gh cli is installed
        id: gh_cli
        run: |
          if command -v gh &> /dev/null ; then
            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
          else
            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Install gh CLI
        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
        run: |
          sudo dnf install 'dnf-command(config-manager)' -y
          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
          sudo dnf install gh --repo gh-cli -y
      - name: test gh CLI
        run: |
          gh --version
      - name: set default repo
        run: |
          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Add comment to PR
        if: steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Fetch and checkout PR
        if: steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr checkout ${{ github.event.inputs.pr_or_branch }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Checkout branch
        if: steps.check_pr.outputs.is_pr == 'false'
        run: |
          git checkout ${{ github.event.inputs.pr_or_branch }}
      - name: Install Packages
        run: |
          cat /etc/os-release
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
      - name: Install ilab
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          python3.11 -m venv venv
          . venv/bin/activate
          nvidia-smi
          sed 's/\[.*\]//' requirements.txt > constraints.txt
          python3.11 -m pip cache remove llama_cpp_python
          CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3.11 -m pip install --force-reinstall --no-binary llama_cpp_python -c constraints.txt llama_cpp_python
          python3.11 -m pip install bitsandbytes
          # TODO This should be added to instructlab-training
          python3.11 -m pip install packaging wheel
          python3.11 -m pip install instructlab-training[cuda]
          # Install the local version of eval before installing the CLI so PR changes are included
          python3.11 -m pip install .
          python3.11 -m pip install instructlab
      - name: Run e2e test
        run: |
          nvidia-smi
          # This env variable is used on GPUs with less vRAM. It only allows cuda to alloc small chunks of vRAM at a time, usually helping to avoid OOM errors.
          # This is not a good solution for production code, as setting ENV variables for users isn't best practice. However, it is a helpful manual workaround.
          export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:128
          . venv/bin/activate
          # TODO: for some reason we need to reinstall DS in order to get fused adam support
          # This means we need to manually rm and re-install a bunch of packages. Investigate why this is.
          python3.11 -m pip uninstall -y deepspeed
          python3.11 -m pip cache purge
          DS_BUILD_CPU_ADAM=1 BUILD_UTILS=1 python3.11 -m pip install deepspeed
          nvidia-smi
          python3.11 -m pip show nvidia-nccl-cu12
          ./scripts/basic-workflow-tests.sh -cemf
      - name: Add comment to PR if the workflow failed
        if: failure() && steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Add comment to PR if the workflow succeeded
        if: success() && steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  stop-runner:
    name: Stop external EC2 runner
    needs:
      - start-runner
      - e2e
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
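
The job above is gated on workflow_dispatch, so it only runs when started by hand. A minimal sketch of kicking it off with the gh CLI, assuming the workflow file has already landed on the default branch; the PR number 123 is purely illustrative:

  # Start the E2E run against a pull request number (or a branch name).
  gh workflow run e2e-nvidia-a10g-x1.yml -f pr_or_branch=123
  # Check on the queued run.
  gh run list --workflow=e2e-nvidia-a10g-x1.yml
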
1 change: 1 addition & 0 deletions .gitignore
@@ -53,6 +53,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+eval_output/
 
 # Translations
 *.mo
16 changes: 9 additions & 7 deletions tests/test_branch_gen_answers.py
@@ -1,10 +1,12 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator
 
-mt_bench_branch = MTBenchBranchEvaluator(
-    "instructlab/granite-7b-lab",
-    "instructlab/granite-7b-lab",
-    "../taxonomy",
-    "main",
-)
-mt_bench_branch.gen_answers("http://localhost:8000/v1")
+
+def test_branch_gen_answers():
+    mt_bench_branch = MTBenchBranchEvaluator(
+        "instructlab/granite-7b-lab",
+        "instructlab/granite-7b-lab",
+        "../taxonomy",
+        "main",
+    )
+    mt_bench_branch.gen_answers("http://localhost:8000/v1")
40 changes: 21 additions & 19 deletions tests/test_branch_judge_answers.py
@@ -4,24 +4,26 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator
 
-mt_bench_branch = MTBenchBranchEvaluator(
-    "instructlab/granite-7b-lab",
-    "instructlab/granite-7b-lab",
-    "../taxonomy",
-    "main",
-)
-qa_pairs, error_rate = mt_bench_branch.judge_answers("http://localhost:8000/v1")
-print(f"Error Rate: {error_rate}")
-print(f"QA Pair 0:")
-pprint.pprint(qa_pairs[0])
+
+def test_branch_judge_answers():
+    mt_bench_branch = MTBenchBranchEvaluator(
+        "instructlab/granite-7b-lab",
+        "instructlab/granite-7b-lab",
+        "../taxonomy",
+        "main",
+    )
+    qa_pairs, error_rate = mt_bench_branch.judge_answers("http://localhost:8000/v1")
+    print(f"Error Rate: {error_rate}")
+    print(f"QA Pair 0:")
+    pprint.pprint(qa_pairs[0])
 
-print(f"qa_pairs length: {len(qa_pairs)}")
+    print(f"qa_pairs length: {len(qa_pairs)}")
 
-for qa_pair in qa_pairs:
-    question_id = qa_pair.get("question_id")
-    assert question_id is not None
-    assert qa_pair.get("score") is not None
-    assert qa_pair.get("category") is not None
-    assert qa_pair.get("question") is not None
-    assert qa_pair.get("answer") is not None
-    assert qa_pair.get("qna_file") is not None
+    for qa_pair in qa_pairs:
+        question_id = qa_pair.get("question_id")
+        assert question_id is not None
+        assert qa_pair.get("score") is not None
+        assert qa_pair.get("category") is not None
+        assert qa_pair.get("question") is not None
+        assert qa_pair.get("answer") is not None
+        assert qa_pair.get("qna_file") is not None
8 changes: 6 additions & 2 deletions tests/test_gen_answers.py
@@ -1,5 +1,9 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchEvaluator
 
-mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab")
-mt_bench.gen_answers("http://localhost:8000/v1")
+
+def test_gen_answers():
+    mt_bench = MTBenchEvaluator(
+        "instructlab/granite-7b-lab", "instructlab/granite-7b-lab"
+    )
+    mt_bench.gen_answers("http://localhost:8000/v1")
38 changes: 21 additions & 17 deletions tests/test_judge_answers.py
@@ -4,23 +4,27 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchEvaluator
 
-mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab")
-overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers(
-    "http://localhost:8000/v1"
-)
+
+def test_judge_answers():
+    mt_bench = MTBenchEvaluator(
+        "instructlab/granite-7b-lab", "instructlab/granite-7b-lab"
+    )
+    overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers(
+        "http://localhost:8000/v1"
+    )
 
-print(f"Overall Score: {overall_score}")
-print(f"Turn 1 Score: {turn_scores[0]}")
-print(f"Turn 2 Score: {turn_scores[1]}")
-print(f"Error Rate: {error_rate}")
-print(f"QA Pair 0:")
-pprint.pprint(qa_pairs[0])
+    print(f"Overall Score: {overall_score}")
+    print(f"Turn 1 Score: {turn_scores[0]}")
+    print(f"Turn 2 Score: {turn_scores[1]}")
+    print(f"Error Rate: {error_rate}")
+    print(f"QA Pair 0:")
+    pprint.pprint(qa_pairs[0])
 
-print(f"qa_pairs length: {len(qa_pairs)}")
+    print(f"qa_pairs length: {len(qa_pairs)}")
 
-for qa_pair in qa_pairs:
-    assert qa_pair.get("question_id") is not None
-    assert qa_pair.get("score") is not None
-    assert qa_pair.get("category") is not None
-    assert qa_pair.get("question") is not None
-    assert qa_pair.get("answer") is not None
+    for qa_pair in qa_pairs:
+        assert qa_pair.get("question_id") is not None
+        assert qa_pair.get("score") is not None
+        assert qa_pair.get("category") is not None
+        assert qa_pair.get("question") is not None
+        assert qa_pair.get("answer") is not None
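
With the module-level scripts now wrapped in test_* functions, the MT-Bench checks run under pytest instead of executing at import time. A rough sketch of invoking them locally, assuming an OpenAI-compatible server is already listening on http://localhost:8000/v1 (serving the model is outside this diff) and that pytest is installed in the same virtualenv:

  # Install the local eval package plus pytest, then run the converted tests.
  python3.11 -m pip install . pytest
  # -s keeps the print() output from the tests visible.
  python3.11 -m pytest -s tests/test_gen_answers.py tests/test_judge_answers.py
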
5 changes: 0 additions & 5 deletions tests/test_mmlu.py
@@ -3,7 +3,6 @@
 
 
 def test_minimal_mmlu():
-    print("===> Executing 'test_minimal_mmlu'...")
     try:
         model_path = "instructlab/granite-7b-lab"
         tasks = ["mmlu_anatomy", "mmlu_astronomy"]
@@ -15,7 +14,3 @@ def test_minimal_mmlu():
         print(f"'test_minimal_mmlu' failed: {exc}")
         return False
     return True
-
-
-if __name__ == "__main__":
-    assert test_minimal_mmlu() == True
5 changes: 0 additions & 5 deletions tests/test_mmlubranch.py
@@ -6,7 +6,6 @@
 
 
 def test_mmlu_branch():
-    print("===> Executing 'test_mmlu_branch'...")
     try:
         model_path = "instructlab/granite-7b-lab"
         sdg_path = f"{os.path.dirname(os.path.realpath(__file__))}/testdata/sdg"
@@ -21,7 +20,3 @@ def test_mmlu_branch():
         print(f"'test_mmlu_branch' failed: {exc}")
         return False
     return True
-
-
-if __name__ == "__main__":
-    assert test_mmlu_branch() == True
