test_fixup 3,3,3,3 64k sonnet #411
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Run the benchmark | |
on: | |
workflow_dispatch: | |
inputs: | |
llm: | |
description: "LLM model to use" | |
type: choice | |
required: true | |
options: | |
- gemini-1.5-pro-002 | |
- gpt-4o-2024-08-06 | |
- gpt-4o-2024-05-13 | |
- claude-3-5-sonnet-20240620 | |
- claude-3-5-sonnet-20241022 | |
- claude-3-haiku-20240307 | |
- o1-preview-2024-09-12 | |
- o1-mini-2024-09-12 | |
instance_set: | |
description: "Instance set to solve" | |
type: string | |
limits: | |
description: "Limits to apply to the solver" | |
type: choice | |
required: true | |
options: | |
- default | |
# Test solvers | |
- test_files=3 test_status_retry=3 code_files=0 | |
# Code solvers building synthetic tests | |
- test_files=3 test_status_retry=1 code_files=3 code_status_retry=3 | |
- test_files=4 test_status_retry=1 code_files=4 code_status_retry=2 | |
# Test + code solvers | |
- test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=1 | |
- test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=2 | |
- test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=4 | |
- test_files=3 test_status_retry=3 code_files=3 code_status_retry=3 | |
- test_files=4 test_status_retry=2 code_files=4 code_status_retry=2 | |
# Code file selection | |
- test_files=0 code_files=3 code_status_retry=0 | |
# Code solvers using only known synthetic tests (ensure that "Use synthetic tests" is enabled) | |
- test_files=0 code_files=3 code_status_retry=3 | |
- test_files=0 code_files=4 code_status_retry=2 | |
- test_files=0 code_files=6 code_status_retry=3 | |
context_tokens: | |
description: "LLM token limit to apply to the solver" | |
type: string | |
required: false | |
default: "8000" | |
use_synthetic_tests: | |
description: "Use synthetic tests" | |
type: boolean | |
required: false | |
default: true | |
observe_synthetic_tests: | |
description: "Observe synthetic tests" | |
type: boolean | |
required: false | |
default: false | |
choose_code_files_only: | |
description: "Choose code files only" | |
type: boolean | |
required: false | |
default: false | |
runner: | |
description: "Runner type" | |
required: true | |
default: SWE-Bench_Larger | |
type: choice | |
options: | |
- ubuntu-latest | |
- swe-bench-ubuntu-latest | |
- SWE-Bench_Larger | |
num_runners: | |
description: "Number of runners to split the workload across" | |
required: true | |
default: "2" | |
name: | |
description: "Assign a name to the workflow run" | |
type: string | |
required: false | |
pull_request: | |
types: [opened, synchronize, reopened, labeled] | |
run-name: ${{ inputs.name || github.event.pull_request.title || github.event.workflow.name }} | |
permissions: | |
contents: read | |
pull-requests: read | |
packages: write | |
jobs: | |
show-inputs: | |
runs-on: 'ubuntu-latest' | |
steps: | |
- name: Display Input Values | |
run: | | |
echo "llm: ${{ github.event.inputs.llm }}" | |
echo "instance_set: ${{ github.event.inputs.instance_set }}" | |
echo "limits: ${{ github.event.inputs.limits }}" | |
echo "context_tokens: ${{ github.event.inputs.context_tokens }}" | |
echo "use_synthetic_tests: ${{ github.event.inputs.use_synthetic_tests }}" | |
echo "observe_synthetic_tests: ${{ github.event.inputs.observe_synthetic_tests }}" | |
echo "choose_code_files_only: ${{ github.event.inputs.choose_code_files_only }}" | |
echo "runner: ${{ github.event.inputs.runner }}" | |
echo "num_runners: ${{ github.event.inputs.num_runners }}" | |
echo "name: ${{ github.event.inputs.name }}" | |
build-appmap-js: | |
uses: ./.github/workflows/build_appmap_js.yml | |
prepare-matrix: | |
runs-on: ubuntu-latest | |
env: | |
NUM_RUNNERS: ${{ inputs.num_runners }} | |
outputs: | |
matrix: ${{ steps.prepare-matrix.outputs.matrix }} | |
steps: | |
- name: Prepare matrix | |
id: prepare-matrix | |
run: | | |
num_runners=${NUM_RUNNERS:-2} | |
echo "Number of runners: $num_runners" | |
indices=$(seq 0 $(($num_runners - 1)) | jq -R 'tonumber' | jq -s -c) | |
echo "Matrix: $indices" | |
echo "matrix=$indices" >> $GITHUB_OUTPUT | |
solve: | |
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-solve') || github.event_name == 'workflow_dispatch' }} | |
needs: | |
- build-appmap-js | |
- prepare-matrix | |
runs-on: ${{ inputs.runner || 'SWE-Bench_Larger' }} | |
continue-on-error: true | |
strategy: | |
matrix: | |
index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }} | |
env: | |
NUM_RUNNERS: ${{ inputs.num_runners }} | |
INSTANCE_SET: ${{ inputs.instance_set }} | |
LIMITS: ${{ inputs.limits }} | |
CONTEXT_TOKENS: ${{ inputs.context_tokens }} | |
LLM: ${{ inputs.llm }} | |
USE_SYNTHETIC_TESTS: ${{ inputs.use_synthetic_tests }} | |
OBSERVE_SYNTHETIC_TESTS: ${{ inputs.observe_synthetic_tests }} | |
CHOOSE_CODE_FILES_ONLY: ${{ inputs.choose_code_files_only }} | |
defaults: | |
run: | |
shell: bash -leo pipefail {0} | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
with: | |
submodules: true | |
- name: Setup Python | |
uses: actions/setup-python@v2 | |
with: | |
python-version: '3.12' | |
# Restore the appmap-js build | |
- name: Restore appmap-js build | |
uses: actions/cache/restore@v4 | |
id: cache-appmap-js | |
with: | |
fail-on-cache-miss: true | |
path: | | |
submodules/appmap-js/node_modules | |
submodules/appmap-js/packages/*/built | |
submodules/appmap-js/packages/*/dist | |
submodules/appmap-js/packages/*/node_modules | |
key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }} | |
- name: Set up Node.js | |
if: steps.cache-appmap-js.outputs.cache-hit != 'true' | |
uses: actions/setup-node@v3 | |
- name: Login to GitHub Container Registry | |
uses: docker/login-action@v3 | |
with: | |
registry: ghcr.io | |
username: ${{ github.actor }} | |
password: ${{ secrets.GITHUB_TOKEN }} | |
- name: Solve instances | |
run: | | |
pip install virtualenv | |
virtualenv venv | |
. ./venv/bin/activate | |
pip install -e . | |
export PYTHONPATH=$PYTHONPATH:$(pwd) | |
export APPMAP_COMMAND="node $(pwd)/submodules/appmap-js/packages/cli/built/cli.js" | |
git config --global init.defaultBranch main | |
git config --global user.email "[email protected]" | |
git config --global user.name "GitHub Workflow" | |
llm="${LLM:-gpt-4o}" | |
export APPMAP_NAVIE_MODEL="${llm}" | |
if [[ $llm == "gpt-"* || $llm == "o1-"* ]]; then | |
export OPENAI_API_KEY="${{ secrets.OPENAI_API_KEY }}" | |
elif [[ $llm == "claude"* ]]; then | |
export ANTHROPIC_API_KEY="${{ secrets.ANTHROPIC_API_KEY }}" | |
elif [[ $llm == gemini* ]]; then | |
export APPMAP_NAVIE_MINI_MODEL="${llm}" | |
export GOOGLE_WEB_CREDENTIALS='${{ secrets.GOOGLE_WEB_CREDENTIALS}}' | |
else | |
echo "Unknown LLM model: $llm" | |
exit 1 | |
fi | |
instance_set="${INSTANCE_SET:-smoke}" | |
context_tokens="${CONTEXT_TOKENS:-}" | |
use_synthetic_tests="${USE_SYNTHETIC_TESTS:-true}" | |
observe_synthetic_tests="${OBSERVE_SYNTHETIC_TESTS:-false}" | |
choose_code_files_only="${CHOOSE_CODE_FILES_ONLY:-false}" | |
limits="${LIMITS:-default}" | |
num_runners="${NUM_RUNNERS:-2}" | |
runner_index="${{ matrix.index }}" | |
# When running the smoke test with "default" limits, remove some of the precomputed test patches to be sure | |
# that the solver is actually solving the instances. This is used for testing the solver itself in CI. | |
if [ "${instance_set}" == "smoke" ] && [ "${limits}" == "default" ]; then | |
rm -f data/test_patches/pytest-dev__pytest-10051.json | |
rm -f data/test_patches/django__django-14559.json | |
fi | |
# If context_tokens is not empty, prepend it to the limits variable | |
if [ -n "${context_tokens}" ]; then | |
# Clear limits if it's "default" | |
[ "${limits}" == "default" ] && limits="" | |
limits="context_tokens=${context_tokens} ${limits}" | |
fi | |
python -m solver.prepare_images \ | |
--instance_set "${instance_set}" \ | |
--num_runners "${num_runners}" \ | |
--runner_index "${runner_index}" | |
if [ "${use_synthetic_tests}" == "false" ]; then | |
echo Removing synthetic tests by deleting data/test_patches | |
rm -rf data/test_patches | |
fi | |
python -m solver.solve \ | |
--instance_set "${instance_set}" \ | |
$( [ "${limits}" != "default" ] && echo "--limit ${limits}" ) \ | |
$( [ "${observe_synthetic_tests}" == "true" ] && echo "--observe_tests" ) \ | |
$( [ "${choose_code_files_only}" == "true" ] && echo "--choose_code_files_only" ) \ | |
--num_runners "${num_runners}" \ | |
--runner_index "${runner_index}" | |
touch predictions.jsonl | |
python -m swebench.harness.run_evaluation \ | |
--predictions_path predictions.jsonl \ | |
--run_id "${instance_set}" | |
echo "Saving evaluation results" | |
mkdir -p evaluations | |
cp -r navie_*.${instance_set}.json evaluations/ | |
find evaluations | |
- name: Report predictions | |
uses: actions/upload-artifact@v4 | |
if: always() | |
with: | |
name: predictions-${{ matrix.index }} | |
path: predictions.jsonl | |
- name: Report harness logs | |
uses: actions/upload-artifact@v4 | |
if: always() | |
with: | |
name: run_evaluation-${{ matrix.index }} | |
path: logs/run_evaluation | |
- name: Report solver logs | |
uses: actions/upload-artifact@v4 | |
if: always() | |
with: | |
retention-days: 14 | |
name: solve-${{ matrix.index }} | |
path: | | |
solve | |
!solve/*/source | |
report: | |
needs: | |
- solve | |
runs-on: ubuntu-latest | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
with: | |
submodules: true | |
- name: Restore artifacts | |
uses: actions/download-artifact@v4 | |
with: | |
path: artifacts | |
- name: Setup Python | |
uses: actions/setup-python@v2 | |
with: | |
python-version: '3.12' | |
- name: Install dependencies | |
run: | | |
pip install virtualenv | |
virtualenv venv | |
. ./venv/bin/activate | |
pip install -e . | |
# Artifacts dir contains files like this: | |
# artifacts/solve-0 | |
# artifacts/solve-0/scikit-learn__scikit-learn-13779 | |
# artifacts/solve-0/scikit-learn__scikit-learn-13779/navie | |
# artifacts/solve-0/scikit-learn__scikit-learn-13779/navie/solution.json | |
# artifacts/solve-0/scikit-learn__scikit-learn-13779/navie/plan | |
# artifacts/solve-1 | |
# artifacts/solve-1/django__django-13779 | |
# ... | |
# Create a solve directory and rsync the contents of each solve-* directory into it | |
# solution.json files and other structured data that will be used by the report script is left as-is | |
# If artifacts dir exists | |
mkdir -p solve | |
for dir in artifacts/solve-*; do | |
rsync -a $dir/ solve/ | |
done | |
# Artifacts dir contains files like this: | |
# artifacts/run_evaluation-1 | |
# artifacts/run_evaluation-1/... | |
# artifacts/run_evaluation-0 | |
# artifacts/run_evaluation-0/... | |
# Roughly mimic the run_evaluation directory structure, except don't worry about the model name or run_id | |
mkdir -p logs/run_evaluation | |
any_run_evaluation_dir=$(ls -d artifacts/run_evaluation-* | head -n 1) | |
if [ -n "${any_run_evaluation_dir}" ]; then | |
for dir in artifacts/run_evaluation-*; do | |
rsync -a $dir/ logs/run_evaluation/ | |
done | |
fi | |
# Artifacts dir contains files like this: | |
# logs/solve/evaluations-1 | |
# artifacts/evaluations-1/navie_082024+gpt-4o.smoke.json | |
# artifacts/evaluations-0 | |
# artifacts/evaluations-0/navie_082024+gpt-4o.smoke.json | |
# These can be left as-is, because being JSON they need to be combined by the report script | |
# Predictions are available: | |
# artifacts/predictions-1 | |
# artifacts/predictions-1/predictions.jsonl | |
# artifacts/predictions-0 | |
# artifacts/predictions-0/predictions.jsonl | |
# Concatenate these into a single file | |
for file in artifacts/predictions-*/*.jsonl; do | |
cat $file >> predictions.jsonl | |
done | |
- name: Report results | |
run: | | |
. ./venv/bin/activate | |
python -m solver.report \ | |
--predictions_path predictions.jsonl \ | |
--solve_data_dir solve \ | |
--evaluation_logs_dir logs/run_evaluation | |
- name: Report solution.json files | |
uses: actions/upload-artifact@v4 | |
with: | |
name: solutions | |
path: solve/**/solution.json | |
- name: Report test_patch.json files | |
uses: actions/upload-artifact@v4 | |
with: | |
name: test-patch | |
path: solve/**/test_patch.json | |
- name: Upload report | |
uses: actions/upload-artifact@v4 | |
with: | |
name: report | |
path: report.csv |