diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
new file mode 100644
index 0000000..d3b6b22
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -0,0 +1,205 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA A10G x1)
+
+on:
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        description: 'pull request number or branch name'
+        required: true
+        default: 'main'
+
+jobs:
+  start-runner:
+    name: Start external EC2 runner
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ami-00c51d9c1374eda97
+          ec2-instance-type: g5.2xlarge
+          subnet-id: subnet-02d230cffd9385bd4
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
+            ]
+
+  e2e:
+    name: E2E Test
+    needs: start-runner
+    runs-on: ${{ needs.start-runner.outputs.label }}
+
+    permissions:
+      pull-requests: write
+
+    steps:
+      - name: Checkout instructlab/eval
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          fetch-depth: 0
+
+      - name: Determine if pr_or_branch is a PR number
+        id: check_pr
+        run: |
+          nvidia-smi
+          if [[ "${{ github.event.inputs.pr_or_branch }}" =~ ^[0-9]+$ ]]; then
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pr=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Check if gh cli is installed
+        id: gh_cli
+        run: |
+          if command -v gh &> /dev/null ; then
+            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Install gh CLI
+        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
+        run: |
+          sudo dnf install 'dnf-command(config-manager)' -y
+          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
+          sudo dnf install gh --repo gh-cli -y
+
+      - name: test gh CLI
+        run: |
+          gh --version
+
+      - name: set default repo
+        run: |
+          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Fetch and checkout PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr checkout ${{ github.event.inputs.pr_or_branch }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Checkout branch
+        if: steps.check_pr.outputs.is_pr == 'false'
+        run: |
+          git checkout ${{ github.event.inputs.pr_or_branch }}
+
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Install ilab
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          python3.11 -m venv venv
+          . venv/bin/activate
+          nvidia-smi
+          sed 's/\[.*\]//' requirements.txt > constraints.txt
+          python3.11 -m pip cache remove llama_cpp_python
+          CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3.11 -m pip install --force-reinstall --no-binary llama_cpp_python -c constraints.txt llama_cpp_python
+          python3.11 -m pip install bitsandbytes
+
+          # TODO This should be added to instructlab-training
+          python3.11 -m pip install packaging wheel
+
+          python3.11 -m pip install instructlab-training[cuda]
+
+          # Install the local version of eval before installing the CLI so PR changes are included
+          python3.11 -m pip install .
+
+          python3.11 -m pip install instructlab
+
+      - name: Run e2e test
+        run: |
+          nvidia-smi
+          # This env variable is used on GPUs with less vRAM. It only allows cuda to alloc small chunks of vRAM at a time, usually helping to avoid OOM errors.
+          # This is not a good solution for production code, as setting ENV variables for users isn't best practice. However, it is a helpful manual workaround.
+          export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:128
+          . venv/bin/activate
+          # TODO: for some reason we need to reinstall DS in order to get fused adam support
+          # This means we need to manually rm and re-install a bunch of packages. Investigate why this is.
+          python3.11 -m pip uninstall -y deepspeed
+
+          python3.11 -m pip cache purge
+
+          DS_BUILD_CPU_ADAM=1 BUILD_UTILS=1 python3.11 -m pip install deepspeed
+
+          nvidia-smi
+
+          python3.11 -m pip show nvidia-nccl-cu12
+
+          cd instructlab
+          ./scripts/basic-workflow-tests.sh -em
+
+      - name: Add comment to PR if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  stop-runner:
+    name: Stop external EC2 runner
+    needs:
+      - start-runner
+      - e2e
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
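A note on the PYTORCH_CUDA_ALLOC_CONF export in the "Run e2e test" step above: PyTorch reads that variable when the CUDA caching allocator is first initialized, so it must be set before anything touches the GPU. A minimal Python sketch of the same workaround, illustrative only and not part of this patch:

# Sketch: mirrors the allocator tuning used in the workflow above.
# The variable must be set before torch initializes CUDA.
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
    "garbage_collection_threshold:0.6,max_split_size_mb:128"
)

import torch  # imported after setting the variable, deliberately

if torch.cuda.is_available():
    x = torch.empty(4096, 4096, device="cuda")  # first allocation initializes the allocator
    print(f"allocated: {torch.cuda.memory_allocated() / 2**20:.1f} MiB")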
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..b815e03
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: Test
+
+on:
+  push:
+    branches:
+      - "main"
+      - "release-**"
+    paths:
+      - '**.py'
+      - 'pyproject.toml'
+      - 'requirements*.txt'
+      - '.github/workflows/test.yml'
+  pull_request:
+    branches:
+      - "main"
+      - "release-**"
+    paths:
+      - '**.py'
+      - 'pyproject.toml'
+      - 'requirements*.txt'
+      - '.github/workflows/test.yml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit:
+    runs-on: ubuntu-gpu
+    steps:
+      # No step-security/harden-runner since this is a self-hosted runner
+      - name: Checkout instructlab/eval
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      # this is needed for branch tests
+      - name: Checkout instructlab/taxonomy
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          repository: "instructlab/taxonomy"
+          path: "taxonomy"
+          fetch-depth: 0
+
+      # this is needed for judge_answer tests
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          fetch-depth: 0
+
+      - name: Install system packages
+        run: |
+          sudo apt-get install -y cuda-toolkit git cmake build-essential virtualenv
+          nvidia-smi
+          sudo ls -l /dev/nvidia*
+
+      - name: Setup Python 3.11
+        uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
+        with:
+          python-version: 3.11
+          cache: pip
+          cache-dependency-path: |
+            **/pyproject.toml
+            **/requirements*.txt
+
+      - name: Remove llama-cpp-python from cache
+        run: |
+          pip cache remove llama_cpp_python
+
+      - name: Start inference server
+        run: |
+          export PATH="/home/runner/.local/bin:/usr/local/cuda/bin:$PATH"
+          cd instructlab
+          python3.11 -m venv cli_venv
+          . cli_venv/bin/activate
+          sed 's/\[.*\]//' requirements.txt > constraints.txt
+          python3.11 -m pip cache remove llama_cpp_python
+          CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --no-binary llama_cpp_python -c constraints.txt llama_cpp_python
+          # needed for --4-bit-quant option to ilab train
+          python3.11 -m pip install bitsandbytes
+          # install instructlab
+          python3.11 -m pip install .
+          # start llama-cpp server
+          ilab model download --repository instructlab/granite-7b-lab-GGUF --filename granite-7b-lab-Q4_K_M.gguf
+          ilab model serve --model-path /home/runner/.local/share/instructlab/models/granite-7b-lab-Q4_K_M.gguf
+
+      - name: Install dependencies
+        run: |
+          python3.11 -m venv venv
+          . venv/bin/activate
+          python3.11 -m pip install .
+          python3.11 -m pip install pytest
+
+      - name: Run unit tests
+        run: |
+          export INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=5
+          . venv/bin/activate
+          python3.11 -m pytest
+
+      - name: Remove llama-cpp-python from cache
+        if: always()
+        run: |
+          pip cache remove llama_cpp_python
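The unit tests below talk to the llama-cpp server this workflow starts at http://localhost:8000/v1. A quick smoke check of that endpoint can fail faster than a hung pytest run; a sketch assuming the openai client package is available (the key is a placeholder, since the local server does not check it):

# Sketch: sanity-check the OpenAI-compatible endpoint the tests target.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="placeholder")
models = client.models.list()  # raises quickly if the server is not up
print([m.id for m in models.data])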
diff --git a/.gitignore b/.gitignore
index 164fd34..ed506ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,6 +53,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+eval_output/
 
 # Translations
 *.mo
diff --git a/README.md b/README.md
index 55af0e2..56e0bfe 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # eval
 
 ![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main)
+![Test](https://github.com/instructlab/eval/actions/workflows/test.yml/badge.svg?branch=main)
 ![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main)
 ![Release](https://img.shields.io/github/v/release/instructlab/eval)
 ![License](https://img.shields.io/github/license/instructlab/eval)
diff --git a/tests/test_branch_gen_answers.py b/tests/test_branch_gen_answers.py
index 04f85ac..8b447d8 100755
--- a/tests/test_branch_gen_answers.py
+++ b/tests/test_branch_gen_answers.py
@@ -1,10 +1,12 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator
 
-mt_bench_branch = MTBenchBranchEvaluator(
-    "instructlab/granite-7b-lab",
-    "instructlab/granite-7b-lab",
-    "../taxonomy",
-    "main",
-)
-mt_bench_branch.gen_answers("http://localhost:8000/v1")
+
+def test_branch_gen_answers():
+    mt_bench_branch = MTBenchBranchEvaluator(
+        "instructlab/granite-7b-lab",
+        "instructlab/granite-7b-lab",
+        "taxonomy",
+        "main",
+    )
+    mt_bench_branch.gen_answers("http://localhost:8000/v1")
diff --git a/tests/test_branch_judge_answers.py b/tests/test_branch_judge_answers.py
index 5b2e566..8ec0b31 100755
--- a/tests/test_branch_judge_answers.py
+++ b/tests/test_branch_judge_answers.py
@@ -4,24 +4,26 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator
 
-mt_bench_branch = MTBenchBranchEvaluator(
-    "instructlab/granite-7b-lab",
-    "instructlab/granite-7b-lab",
-    "../taxonomy",
-    "main",
-)
-qa_pairs, error_rate = mt_bench_branch.judge_answers("http://localhost:8000/v1")
-print(f"Error Rate: {error_rate}")
-print(f"QA Pair 0:")
-pprint.pprint(qa_pairs[0])
-print(f"qa_pairs length: {len(qa_pairs)}")
+
+def test_branch_judge_answers():
+    mt_bench_branch = MTBenchBranchEvaluator(
+        "instructlab/granite-7b-lab",
+        "instructlab/granite-7b-lab",
+        "taxonomy",
+        "main",
+    )
+    qa_pairs, error_rate = mt_bench_branch.judge_answers("http://localhost:8000/v1")
+    print(f"Error Rate: {error_rate}")
+    print(f"QA Pair 0:")
+    pprint.pprint(qa_pairs[0])
 
-for qa_pair in qa_pairs:
-    question_id = qa_pair.get("question_id")
-    assert question_id is not None
-    assert qa_pair.get("score") is not None
-    assert qa_pair.get("category") is not None
-    assert qa_pair.get("question") is not None
-    assert qa_pair.get("answer") is not None
-    assert qa_pair.get("qna_file") is not None
+    print(f"qa_pairs length: {len(qa_pairs)}")
+
+    for qa_pair in qa_pairs:
+        question_id = qa_pair.get("question_id")
+        assert question_id is not None
+        assert qa_pair.get("score") is not None
+        assert qa_pair.get("category") is not None
+        assert qa_pair.get("question") is not None
+        assert qa_pair.get("answer") is not None
+        assert qa_pair.get("qna_file") is not None
diff --git a/tests/test_gen_answers.py b/tests/test_gen_answers.py
index 3eca8d3..1c40c96 100755
--- a/tests/test_gen_answers.py
+++ b/tests/test_gen_answers.py
@@ -1,5 +1,9 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchEvaluator
 
-mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab")
-mt_bench.gen_answers("http://localhost:8000/v1")
+
+def test_gen_answers():
+    mt_bench = MTBenchEvaluator(
+        "instructlab/granite-7b-lab", "instructlab/granite-7b-lab"
+    )
+    mt_bench.gen_answers("http://localhost:8000/v1")
diff --git a/tests/test_judge_answers.py b/tests/test_judge_answers.py
index 97f22a7..62333d7 100755
--- a/tests/test_judge_answers.py
+++ b/tests/test_judge_answers.py
@@ -4,23 +4,27 @@
 # First Party
 from instructlab.eval.mt_bench import MTBenchEvaluator
 
-mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab")
-overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers(
-    "http://localhost:8000/v1"
-)
-print(f"Overall Score: {overall_score}")
-print(f"Turn 1 Score: {turn_scores[0]}")
-print(f"Turn 2 Score: {turn_scores[1]}")
-print(f"Error Rate: {error_rate}")
-print(f"QA Pair 0:")
-pprint.pprint(qa_pairs[0])
+
+def test_judge_answers():
+    mt_bench = MTBenchEvaluator(
+        "instructlab/granite-7b-lab", "instructlab/granite-7b-lab"
+    )
+    overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers(
+        "http://localhost:8000/v1"
+    )
 
-print(f"qa_pairs length: {len(qa_pairs)}")
+    print(f"Overall Score: {overall_score}")
+    print(f"Turn 1 Score: {turn_scores[0]}")
+    print(f"Turn 2 Score: {turn_scores[1]}")
+    print(f"Error Rate: {error_rate}")
+    print(f"QA Pair 0:")
+    pprint.pprint(qa_pairs[0])
 
-for qa_pair in qa_pairs:
-    assert qa_pair.get("question_id") is not None
-    assert qa_pair.get("score") is not None
-    assert qa_pair.get("category") is not None
-    assert qa_pair.get("question") is not None
-    assert qa_pair.get("answer") is not None
+    print(f"qa_pairs length: {len(qa_pairs)}")
+
+    for qa_pair in qa_pairs:
+        assert qa_pair.get("question_id") is not None
+        assert qa_pair.get("score") is not None
+        assert qa_pair.get("category") is not None
+        assert qa_pair.get("question") is not None
+        assert qa_pair.get("answer") is not None
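The converted MT-Bench tests above all assert the same set of keys on each qa_pair. If those checks grow, they could be factored into a shared helper; a hypothetical sketch, not part of this patch (qna_file is only present for the branch evaluator's qa_pairs):

# Hypothetical helper mirroring the keys asserted in the tests above.
REQUIRED_KEYS = ("question_id", "score", "category", "question", "answer")


def assert_qa_pair_complete(qa_pair: dict, extra_keys: tuple = ()) -> None:
    for key in REQUIRED_KEYS + extra_keys:
        assert qa_pair.get(key) is not None, f"qa_pair missing {key!r}"

The branch tests would then call assert_qa_pair_complete(qa_pair, ("qna_file",)).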
print(f"'test_mmlu_branch' failed: {exc}") - return False - return True - - -if __name__ == "__main__": - assert test_mmlu_branch() == True + assert overall_score is not None + assert individual_scores is not None