diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
new file mode 100644
index 0000000..d3b6b22
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -0,0 +1,205 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA A10G x1)
+
+on:
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        description: 'pull request number or branch name'
+        required: true
+        default: 'main'
+
+jobs:
+  start-runner:
+    name: Start external EC2 runner
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ami-00c51d9c1374eda97
+          ec2-instance-type: g5.2xlarge
+          subnet-id: subnet-02d230cffd9385bd4
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
+            ]
+
+  e2e:
+    name: E2E Test
+    needs: start-runner
+    runs-on: ${{ needs.start-runner.outputs.label }}
+
+    permissions:
+      pull-requests: write
+
+    steps:
+      - name: Checkout instructlab/eval
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          fetch-depth: 0
+
+      - name: Determine if pr_or_branch is a PR number
+        id: check_pr
+        run: |
+          nvidia-smi
+          if [[ "${{ github.event.inputs.pr_or_branch }}" =~ ^[0-9]+$ ]]; then
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pr=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Check if gh cli is installed
+        id: gh_cli
+        run: |
+          if command -v gh &> /dev/null ; then
+            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Install gh CLI
+        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
+        run: |
+          sudo dnf install 'dnf-command(config-manager)' -y
+          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
+          sudo dnf install gh --repo gh-cli -y
+
+      - name: test gh CLI
+        run: |
+          gh --version
+
+      - name: set default repo
+        run: |
+          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Fetch and checkout PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr checkout ${{ github.event.inputs.pr_or_branch }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Checkout branch
+        if: steps.check_pr.outputs.is_pr == 'false'
+        run: |
+          git checkout ${{ github.event.inputs.pr_or_branch }}
+
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Install ilab
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          python3.11 -m venv venv
+          . venv/bin/activate
+          nvidia-smi
+          sed 's/\[.*\]//' requirements.txt > constraints.txt
+          python3.11 -m pip cache remove llama_cpp_python
+          CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3.11 -m pip install --force-reinstall --no-binary llama_cpp_python -c constraints.txt llama_cpp_python
+          python3.11 -m pip install bitsandbytes
+
+          # TODO This should be added to instructlab-training
+          python3.11 -m pip install packaging wheel
+
+          python3.11 -m pip install instructlab-training[cuda]
+
+          # Install the local version of eval before installing the CLI so PR changes are included
+          python3.11 -m pip install .
+
+          python3.11 -m pip install instructlab
+
+      - name: Run e2e test
+        run: |
+          nvidia-smi
+          # This env variable is used on GPUs with less vRAM. It only allows cuda to alloc small chunks of vRAM at a time, usually helping to avoid OOM errors.
+          # This is not a good solution for production code, as setting ENV variables for users isn't best practice. However, it is a helpful manual workaround.
+          export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:128
+          . venv/bin/activate
+          # TODO: for some reason we need to reinstall DS in order to get fused adam support
+          # This means we need to manually rm and re-install a bunch of packages. Investigate why this is.
+          python3.11 -m pip uninstall -y deepspeed
+
+          python3.11 -m pip cache purge
+
+          DS_BUILD_CPU_ADAM=1 BUILD_UTILS=1 python3.11 -m pip install deepspeed
+
+          nvidia-smi
+
+          python3.11 -m pip show nvidia-nccl-cu12
+
+          cd instructlab
+          ./scripts/basic-workflow-tests.sh -em
+
+      - name: Add comment to PR if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  stop-runner:
+    name: Stop external EC2 runner
+    needs:
+      - start-runner
+      - e2e
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
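A note on the PYTORCH_CUDA_ALLOC_CONF export in the "Run e2e test" step above: PyTorch reads that variable when the CUDA caching allocator is first initialized, so it must be set before anything touches the GPU. A minimal Python sketch of the same workaround, illustrative only and not part of this patch:

# Sketch: mirrors the allocator tuning used in the workflow above.
# The variable must be set before torch initializes CUDA.
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
    "garbage_collection_threshold:0.6,max_split_size_mb:128"
)

import torch  # imported after setting the variable, deliberately

if torch.cuda.is_available():
    x = torch.empty(4096, 4096, device="cuda")  # first allocation initializes the allocator
    print(f"allocated: {torch.cuda.memory_allocated() / 2**20:.1f} MiB")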
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..b815e03
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: Test
+
+on:
+  push:
+    branches:
+      - "main"
+      - "release-**"
+    paths:
+      - '**.py'
+      - 'pyproject.toml'
+      - 'requirements*.txt'
+      - '.github/workflows/test.yml'
+  pull_request:
+    branches:
+      - "main"
+      - "release-**"
+    paths:
+      - '**.py'
+      - 'pyproject.toml'
+      - 'requirements*.txt'
+      - '.github/workflows/test.yml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit:
+    runs-on: ubuntu-gpu
+    steps:
+      # No step-security/harden-runner since this is a self-hosted runner
+      - name: Checkout instructlab/eval
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      # this is needed for branch tests
+      - name: Checkout instructlab/taxonomy
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          repository: "instructlab/taxonomy"
+          path: "taxonomy"
+          fetch-depth: 0
+
+      # this is needed for judge_answer tests
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          fetch-depth: 0
+
+      - name: Install system packages
+        run: |
+          sudo apt-get install -y cuda-toolkit git cmake build-essential virtualenv
+          nvidia-smi
+          sudo ls -l /dev/nvidia*
+
+      - name: Setup Python 3.11
+        uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
+        with:
+          python-version: 3.11
+          cache: pip
+          cache-dependency-path: |
+            **/pyproject.toml
+            **/requirements*.txt
+
+      - name: Remove llama-cpp-python from cache
+        run: |
+          pip cache remove llama_cpp_python
+
+      - name: Start inference server
+        run: |
+          export PATH="/home/runner/.local/bin:/usr/local/cuda/bin:$PATH"
+          cd instructlab
+          python3.11 -m venv cli_venv
+          . cli_venv/bin/activate
+          sed 's/\[.*\]//' requirements.txt > constraints.txt
+          python3.11 -m pip cache remove llama_cpp_python
+          CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --no-binary llama_cpp_python -c constraints.txt llama_cpp_python
+          # needed for --4-bit-quant option to ilab train
+          python3.11 -m pip install bitsandbytes
+          # install instructlab
+          python3.11 -m pip install .
+          # start llama-cpp server
+          ilab model download --repository instructlab/granite-7b-lab-GGUF --filename granite-7b-lab-Q4_K_M.gguf
+          ilab model serve --model-path /home/runner/.local/share/instructlab/models/granite-7b-lab-Q4_K_M.gguf
+
+      - name: Install dependencies
+        run: |
+          python3.11 -m venv venv
+          . venv/bin/activate
+          python3.11 -m pip install .
+          python3.11 -m pip install pytest
+
+      - name: Run unit tests
+        run: |
+          export INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=5
+          . venv/bin/activate
+          python3.11 -m pytest
+
+      - name: Remove llama-cpp-python from cache
+        if: always()
+        run: |
+          pip cache remove llama_cpp_python
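The unit tests below talk to the llama-cpp server this workflow starts at http://localhost:8000/v1. A quick smoke check of that endpoint can fail faster than a hung pytest run; a sketch assuming the openai client package is available (the key is a placeholder, since the local server does not check it):

# Sketch: sanity-check the OpenAI-compatible endpoint the tests target.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="placeholder")
models = client.models.list()  # raises quickly if the server is not up
print([m.id for m in models.data])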
diff --git a/.gitignore b/.gitignore
index 164fd34..ed506ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,6 +53,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+eval_output/
 
 # Translations
 *.mo
diff --git a/README.md b/README.md
index 55af0e2..56e0bfe 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # eval
 
 ![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main)
+![Test](https://github.com/instructlab/eval/actions/workflows/test.yml/badge.svg?branch=main)
 ![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main)
 ![Release](https://img.shields.io/github/v/release/instructlab/eval)
 ![License](https://img.shields.io/github/license/instructlab/eval)
diff --git a/tests/test_branch_gen_answers.py b/tests/test_branch_gen_answers.py
index 04f85ac..8b447d8 100755
--- a/tests/test_branch_gen_answers.py
+++ b/tests/test_branch_gen_answers.py
@@ -1,10 +1,12 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator
 
-mt_bench_branch = MTBenchBranchEvaluator(
-    "instructlab/granite-7b-lab",
-    "instructlab/granite-7b-lab",
-    "../taxonomy",
-    "main",
-)
-mt_bench_branch.gen_answers("http://localhost:8000/v1")
+
+def test_branch_gen_answers():
+    mt_bench_branch = MTBenchBranchEvaluator(
+        "instructlab/granite-7b-lab",
+        "instructlab/granite-7b-lab",
+        "taxonomy",
+        "main",
+    )
+    mt_bench_branch.gen_answers("http://localhost:8000/v1")
diff --git a/tests/test_branch_judge_answers.py b/tests/test_branch_judge_answers.py
index 5b2e566..8ec0b31 100755
--- a/tests/test_branch_judge_answers.py
+++ b/tests/test_branch_judge_answers.py
@@ -4,24 +4,26 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator
 
-mt_bench_branch = MTBenchBranchEvaluator(
-    "instructlab/granite-7b-lab",
-    "instructlab/granite-7b-lab",
-    "../taxonomy",
-    "main",
-)
-qa_pairs, error_rate = mt_bench_branch.judge_answers("http://localhost:8000/v1")
-print(f"Error Rate: {error_rate}")
-print(f"QA Pair 0:")
-pprint.pprint(qa_pairs[0])
-print(f"qa_pairs length: {len(qa_pairs)}")
+
+def test_branch_judge_answers():
+    mt_bench_branch = MTBenchBranchEvaluator(
+        "instructlab/granite-7b-lab",
+        "instructlab/granite-7b-lab",
+        "taxonomy",
+        "main",
+    )
+    qa_pairs, error_rate = mt_bench_branch.judge_answers("http://localhost:8000/v1")
+    print(f"Error Rate: {error_rate}")
+    print(f"QA Pair 0:")
+    pprint.pprint(qa_pairs[0])
 
-for qa_pair in qa_pairs:
-    question_id = qa_pair.get("question_id")
-    assert question_id is not None
-    assert qa_pair.get("score") is not None
-    assert qa_pair.get("category") is not None
-    assert qa_pair.get("question") is not None
-    assert qa_pair.get("answer") is not None
-    assert qa_pair.get("qna_file") is not None
+    print(f"qa_pairs length: {len(qa_pairs)}")
+
+    for qa_pair in qa_pairs:
+        question_id = qa_pair.get("question_id")
+        assert question_id is not None
+        assert qa_pair.get("score") is not None
+        assert qa_pair.get("category") is not None
+        assert qa_pair.get("question") is not None
+        assert qa_pair.get("answer") is not None
+        assert qa_pair.get("qna_file") is not None
diff --git a/tests/test_gen_answers.py b/tests/test_gen_answers.py
index 3eca8d3..1c40c96 100755
--- a/tests/test_gen_answers.py
+++ b/tests/test_gen_answers.py
@@ -1,5 +1,9 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchEvaluator
 
-mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab")
-mt_bench.gen_answers("http://localhost:8000/v1")
+
+def test_gen_answers():
+    mt_bench = MTBenchEvaluator(
+        "instructlab/granite-7b-lab", "instructlab/granite-7b-lab"
+    )
+    mt_bench.gen_answers("http://localhost:8000/v1")
diff --git a/tests/test_judge_answers.py b/tests/test_judge_answers.py
index 97f22a7..62333d7 100755
--- a/tests/test_judge_answers.py
+++ b/tests/test_judge_answers.py
@@ -4,23 +4,27 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchEvaluator
 
-mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab")
-overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers(
-    "http://localhost:8000/v1"
-)
-print(f"Overall Score: {overall_score}")
-print(f"Turn 1 Score: {turn_scores[0]}")
-print(f"Turn 2 Score: {turn_scores[1]}")
-print(f"Error Rate: {error_rate}")
-print(f"QA Pair 0:")
-pprint.pprint(qa_pairs[0])
+
+def test_judge_answers():
+    mt_bench = MTBenchEvaluator(
+        "instructlab/granite-7b-lab", "instructlab/granite-7b-lab"
+    )
+    overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers(
+        "http://localhost:8000/v1"
+    )
 
-print(f"qa_pairs length: {len(qa_pairs)}")
+    print(f"Overall Score: {overall_score}")
+    print(f"Turn 1 Score: {turn_scores[0]}")
+    print(f"Turn 2 Score: {turn_scores[1]}")
+    print(f"Error Rate: {error_rate}")
+    print(f"QA Pair 0:")
+    pprint.pprint(qa_pairs[0])
 
-for qa_pair in qa_pairs:
-    assert qa_pair.get("question_id") is not None
-    assert qa_pair.get("score") is not None
-    assert qa_pair.get("category") is not None
-    assert qa_pair.get("question") is not None
-    assert qa_pair.get("answer") is not None
+    print(f"qa_pairs length: {len(qa_pairs)}")
+
+    for qa_pair in qa_pairs:
+        assert qa_pair.get("question_id") is not None
+        assert qa_pair.get("score") is not None
+        assert qa_pair.get("category") is not None
+        assert qa_pair.get("question") is not None
+        assert qa_pair.get("answer") is not None
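The converted MT-Bench tests above all assert the same set of keys on each qa_pair. If those checks grow, they could be factored into a shared helper; a hypothetical sketch, not part of this patch (qna_file is only present for the branch evaluator's qa_pairs):

# Hypothetical helper mirroring the keys asserted in the tests above.
REQUIRED_KEYS = ("question_id", "score", "category", "question", "answer")


def assert_qa_pair_complete(qa_pair: dict, extra_keys: tuple = ()) -> None:
    for key in REQUIRED_KEYS + extra_keys:
        assert qa_pair.get(key) is not None, f"qa_pair missing {key!r}"

The branch tests would then call assert_qa_pair_complete(qa_pair, ("qna_file",)).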
print(f"'test_mmlu_branch' failed: {exc}") - return False - return True - - -if __name__ == "__main__": - assert test_mmlu_branch() == True + assert overall_score is not None + assert individual_scores is not None