From 9ea4ee6ad6153864c1830b3a6e23491d81c6dc73 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Wed, 23 Oct 2024 15:35:37 -0400
Subject: [PATCH] ci: add large-size E2E CI job

this commit adds a new workflow to the Eval repo
it will run a nightly cron job to test the current
'main' branch of Eval against the current 'main'
branch of the CLI (instructlab)

Signed-off-by: Nathan Weinberg
---
 .github/workflows/e2e-nvidia-l40s-x4.yml | 249 +++++++++++++++++++++++
 README.md                                |   4 +-
 2 files changed, 252 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/e2e-nvidia-l40s-x4.yml

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
new file mode 100644
index 0000000..aa25701
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -0,0 +1,249 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Starts an EC2 runner, runs the instructlab E2E script with the eval library
+# from this repository installed, reports the result (PR comment or Slack),
+# and stops the runner afterwards.
+
+name: E2E (NVIDIA L40S x4)
+
+on:
+  schedule:
+    - cron: '0 16 * * *' # Runs at 4PM UTC every day
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        description: 'pull request number or branch name'
+        required: true
+        default: 'main'
+
+jobs:
+  start-large-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ami-01a89eee1adde309c
+          ec2-instance-type: g6e.12xlarge
+          subnet-id: subnet-024298cefa3bedd61
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+
+  e2e-large-test:
+    needs:
+      - start-large-ec2-runner
+    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
+
+    permissions:
+      pull-requests: write
+
+    steps:
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Checkout instructlab/eval
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          repository: "instructlab/eval"
+          path: "eval"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Determine if pr_or_branch is a PR number
+        id: check_pr
+        env:
+          # Default to 'main' if not set (scheduled runs have no inputs).
+          # Passed through the environment instead of being interpolated into
+          # the script, so the input is never parsed as shell code.
+          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
+        run: |
+          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pr=false" >> "$GITHUB_OUTPUT"
+          fi
+          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
+
+      - name: Check if gh cli is installed
+        id: gh_cli
+        run: |
+          if command -v gh &> /dev/null ; then
+            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Install gh CLI
+        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
+        run: |
+          sudo dnf install 'dnf-command(config-manager)' -y
+          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
+          sudo dnf install gh --repo gh-cli -y
+
+      - name: test gh CLI
+        run: |
+          gh --version
+
+      - name: set default repo
+        working-directory: ./eval
+        run: |
+          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./eval
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Fetch and checkout PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./eval
+        run: |
+          gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Checkout branch
+        if: steps.check_pr.outputs.is_pr == 'false'
+        working-directory: ./eval
+        env:
+          # Branch names are free-form input; keep them out of the script text.
+          BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
+        run: |
+          git checkout "$BRANCH"
+
+      - name: Install ilab
+        working-directory: ./instructlab
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          nvidia-smi
+          python3.11 -m pip cache remove llama_cpp_python
+
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
+
+          # https://github.com/instructlab/instructlab/issues/1821
+          # install with Torch and build dependencies installed
+          python3.11 -m pip install packaging wheel setuptools-scm
+          python3.11 -m pip install ".[cuda]" -r requirements-vllm-cuda.txt
+
+      - name: Update instructlab-eval library
+        working-directory: ./eval
+        run: |
+          . ../instructlab/venv/bin/activate
+          pip install .
+          pip install ".[cuda]"
+
+      - name: Check disk
+        run: |
+          df -h
+
+      - name: Run e2e test
+        working-directory: ./instructlab
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          . venv/bin/activate
+          ./scripts/e2e-ci.sh -l
+
+      - name: Add comment to PR if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./eval
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./eval
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Post job results to Slack if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'false'
+        id: slack-report-failure
+        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+          channel-id: 'e2e-ci-results'
+          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+
+      - name: Post job results to Slack if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'false'
+        id: slack-report-success
+        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+          channel-id: 'e2e-ci-results'
+          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+
+  stop-large-ec2-runner:
+    needs:
+      - start-large-ec2-runner
+      - e2e-large-test
+    runs-on: ubuntu-latest
+    # always() so the instance is stopped even when the test job fails.
+    if: ${{ always() }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-large-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
diff --git a/README.md b/README.md
index 049dd15..e81090f 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,13 @@
 # eval
 
 ![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main)
-![`e2e-nvidia-a10g-x1.yaml` on `main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
 ![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main)
 ![Release](https://img.shields.io/github/v/release/instructlab/eval)
 ![License](https://img.shields.io/github/license/instructlab/eval)
 
+![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
+![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main)
+
 Python Library for Evaluation
 
 ## What is Evaluation?