test_fixup 3,3,3,3 64k sonnet #411

Workflow file for this run

name: Run the benchmark
on:
  workflow_dispatch:
    inputs:
      llm:
        description: "LLM model to use"
        type: choice
        required: true
        options:
          - gemini-1.5-pro-002
          - gpt-4o-2024-08-06
          - gpt-4o-2024-05-13
          - claude-3-5-sonnet-20240620
          - claude-3-5-sonnet-20241022
          - claude-3-haiku-20240307
          - o1-preview-2024-09-12
          - o1-mini-2024-09-12
      instance_set:
        description: "Instance set to solve"
        type: string
      limits:
        description: "Limits to apply to the solver"
        type: choice
        required: true
        options:
          - default
          # Test solvers
          - test_files=3 test_status_retry=3 code_files=0
          # Code solvers building synthetic tests
          - test_files=3 test_status_retry=1 code_files=3 code_status_retry=3
          - test_files=4 test_status_retry=1 code_files=4 code_status_retry=2
          # Test + code solvers
          - test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=1
          - test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=2
          - test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=4
          - test_files=3 test_status_retry=3 code_files=3 code_status_retry=3
          - test_files=4 test_status_retry=2 code_files=4 code_status_retry=2
          # Code file selection
          - test_files=0 code_files=3 code_status_retry=0
          # Code solvers using only known synthetic tests (ensure that "Use synthetic tests" is enabled)
          - test_files=0 code_files=3 code_status_retry=3
          - test_files=0 code_files=4 code_status_retry=2
          - test_files=0 code_files=6 code_status_retry=3
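          # Note: each option above is a single space-separated string of key=value limits.
          # Based on how it is used in the solve job below, the chosen string appears to be passed
          # straight through to `solver.solve --limit` and split on whitespace there, e.g.
          # "test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=2"
          # becomes `--limit test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=2`.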
      context_tokens:
        description: "LLM token limit to apply to the solver"
        type: string
        required: false
        default: "8000"
      use_synthetic_tests:
        description: "Use synthetic tests"
        type: boolean
        required: false
        default: true
      observe_synthetic_tests:
        description: "Observe synthetic tests"
        type: boolean
        required: false
        default: false
      choose_code_files_only:
        description: "Choose code files only"
        type: boolean
        required: false
        default: false
      runner:
        description: "Runner type"
        required: true
        default: SWE-Bench_Larger
        type: choice
        options:
          - ubuntu-latest
          - swe-bench-ubuntu-latest
          - SWE-Bench_Larger
      num_runners:
        description: "Number of runners to split the workload across"
        required: true
        default: "2"
      name:
        description: "Assign a name to the workflow run"
        type: string
        required: false
  pull_request:
    types: [opened, synchronize, reopened, labeled]
run-name: ${{ inputs.name || github.event.pull_request.title || github.event.workflow.name }}
permissions:
  contents: read
  pull-requests: read
  packages: write
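# Job layout (summary of what follows): show-inputs echoes the dispatch inputs,
# build-appmap-js and prepare-matrix run up front, solve fans out across a matrix of
# num_runners runner indices, and report merges the per-runner artifacts into a single
# report. The run name falls back from the dispatch `name` input to the pull request
# title to the workflow name.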
jobs:
  show-inputs:
    runs-on: 'ubuntu-latest'
    steps:
      - name: Display Input Values
        run: |
          echo "llm: ${{ github.event.inputs.llm }}"
          echo "instance_set: ${{ github.event.inputs.instance_set }}"
          echo "limits: ${{ github.event.inputs.limits }}"
          echo "context_tokens: ${{ github.event.inputs.context_tokens }}"
          echo "use_synthetic_tests: ${{ github.event.inputs.use_synthetic_tests }}"
          echo "observe_synthetic_tests: ${{ github.event.inputs.observe_synthetic_tests }}"
          echo "choose_code_files_only: ${{ github.event.inputs.choose_code_files_only }}"
          echo "runner: ${{ github.event.inputs.runner }}"
          echo "num_runners: ${{ github.event.inputs.num_runners }}"
          echo "name: ${{ github.event.inputs.name }}"
  build-appmap-js:
    uses: ./.github/workflows/build_appmap_js.yml
  prepare-matrix:
    runs-on: ubuntu-latest
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
    outputs:
      matrix: ${{ steps.prepare-matrix.outputs.matrix }}
    steps:
      - name: Prepare matrix
        id: prepare-matrix
        run: |
          num_runners=${NUM_RUNNERS:-2}
          echo "Number of runners: $num_runners"
          indices=$(seq 0 $(($num_runners - 1)) | jq -R 'tonumber' | jq -s -c .)
          echo "Matrix: $indices"
          echo "matrix=$indices" >> $GITHUB_OUTPUT
  solve:
    if: ${{ contains(github.event.pull_request.labels.*.name, 'test-solve') || github.event_name == 'workflow_dispatch' }}
    needs:
      - build-appmap-js
      - prepare-matrix
    runs-on: ${{ inputs.runner || 'SWE-Bench_Larger' }}
    continue-on-error: true
    strategy:
      matrix:
        index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }}
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
      INSTANCE_SET: ${{ inputs.instance_set }}
      LIMITS: ${{ inputs.limits }}
      CONTEXT_TOKENS: ${{ inputs.context_tokens }}
      LLM: ${{ inputs.llm }}
      USE_SYNTHETIC_TESTS: ${{ inputs.use_synthetic_tests }}
      OBSERVE_SYNTHETIC_TESTS: ${{ inputs.observe_synthetic_tests }}
      CHOOSE_CODE_FILES_ONLY: ${{ inputs.choose_code_files_only }}
    defaults:
      run:
        shell: bash -leo pipefail {0}
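        # bash -l starts a login shell, -e exits on the first failing command, and
        # -o pipefail makes a pipeline fail when any stage fails; {0} is the script file
        # GitHub Actions generates for each run step.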
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.12'
      # Restore the appmap-js build
      - name: Restore appmap-js build
        uses: actions/cache/restore@v4
        id: cache-appmap-js
        with:
          fail-on-cache-miss: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}
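          # The key hashes the appmap-js submodule's HEAD ref, so presumably the cached build
          # is invalidated (and rebuilt by build_appmap_js.yml) whenever the submodule is bumped.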
      - name: Set up Node.js
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        uses: actions/setup-node@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Solve instances
        run: |
          pip install virtualenv
          virtualenv venv
          . ./venv/bin/activate
          pip install -e .
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          export APPMAP_COMMAND="node $(pwd)/submodules/appmap-js/packages/cli/built/cli.js"
          git config --global init.defaultBranch main
          git config --global user.email "[email protected]"
          git config --global user.name "GitHub Workflow"
          llm="${LLM:-gpt-4o}"
          export APPMAP_NAVIE_MODEL="${llm}"
          if [[ $llm == "gpt-"* || $llm == "o1-"* ]]; then
            export OPENAI_API_KEY="${{ secrets.OPENAI_API_KEY }}"
          elif [[ $llm == "claude"* ]]; then
            export ANTHROPIC_API_KEY="${{ secrets.ANTHROPIC_API_KEY }}"
          elif [[ $llm == gemini* ]]; then
            export APPMAP_NAVIE_MINI_MODEL="${llm}"
            export GOOGLE_WEB_CREDENTIALS='${{ secrets.GOOGLE_WEB_CREDENTIALS }}'
          else
            echo "Unknown LLM model: $llm"
            exit 1
          fi
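          # Provider selection above is keyed off the model-name prefix: gpt-*/o1-* use
          # OPENAI_API_KEY, claude* uses ANTHROPIC_API_KEY, and gemini* uses Google web
          # credentials (and is also set as the Navie mini model). Any other name aborts the job.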
          instance_set="${INSTANCE_SET:-smoke}"
          context_tokens="${CONTEXT_TOKENS:-}"
          use_synthetic_tests="${USE_SYNTHETIC_TESTS:-true}"
          observe_synthetic_tests="${OBSERVE_SYNTHETIC_TESTS:-false}"
          choose_code_files_only="${CHOOSE_CODE_FILES_ONLY:-false}"
          limits="${LIMITS:-default}"
          num_runners="${NUM_RUNNERS:-2}"
          runner_index="${{ matrix.index }}"
          # When running the smoke test with "default" limits, remove some of the precomputed test patches
          # to be sure that the solver is actually solving the instances. This is used for testing the
          # solver itself in CI.
          if [ "${instance_set}" == "smoke" ] && [ "${limits}" == "default" ]; then
            rm -f data/test_patches/pytest-dev__pytest-10051.json
            rm -f data/test_patches/django__django-14559.json
          fi
          # If context_tokens is not empty, prepend it to the limits variable
          if [ -n "${context_tokens}" ]; then
            # Clear limits if it's "default"
            [ "${limits}" == "default" ] && limits=""
            limits="context_tokens=${context_tokens} ${limits}"
          fi
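          # Worked example for the block above (hypothetical values): with context_tokens=64000
          # and limits="test_files=3 test_status_retry=3 code_files=3 code_status_retry=3",
          # limits becomes "context_tokens=64000 test_files=3 test_status_retry=3 code_files=3 code_status_retry=3";
          # with limits="default" it becomes just "context_tokens=64000".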
          python -m solver.prepare_images \
            --instance_set "${instance_set}" \
            --num_runners "${num_runners}" \
            --runner_index "${runner_index}"
          if [ "${use_synthetic_tests}" == "false" ]; then
            echo Removing synthetic tests by deleting data/test_patches
            rm -rf data/test_patches
          fi
          python -m solver.solve \
            --instance_set "${instance_set}" \
            $( [ "${limits}" != "default" ] && echo "--limit ${limits}" ) \
            $( [ "${observe_synthetic_tests}" == "true" ] && echo "--observe_tests" ) \
            $( [ "${choose_code_files_only}" == "true" ] && echo "--choose_code_files_only" ) \
            --num_runners "${num_runners}" \
            --runner_index "${runner_index}"
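          # The `touch` below guarantees predictions.jsonl exists even if the solver emitted no
          # predictions, presumably so the evaluation harness doesn't fail on a missing file.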
          touch predictions.jsonl
          python -m swebench.harness.run_evaluation \
            --predictions_path predictions.jsonl \
            --run_id "${instance_set}"
          echo "Saving evaluation results"
          mkdir -p evaluations
          cp -r navie_*.${instance_set}.json evaluations/
          find evaluations
      - name: Report predictions
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: predictions-${{ matrix.index }}
          path: predictions.jsonl
      - name: Report harness logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: run_evaluation-${{ matrix.index }}
          path: logs/run_evaluation
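      # The next step uploads the solver's working directory; the !solve/*/source exclusion
      # appears to drop the checked-out source trees so the artifact stays small.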
      - name: Report solver logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          retention-days: 14
          name: solve-${{ matrix.index }}
          path: |
            solve
            !solve/*/source
  report:
    needs:
      - solve
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Restore artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          pip install virtualenv
          virtualenv venv
          . ./venv/bin/activate
          pip install -e .
          # The artifacts dir contains files like this:
          # artifacts/solve-0
          # artifacts/solve-0/scikit-learn__scikit-learn-13779
          # artifacts/solve-0/scikit-learn__scikit-learn-13779/navie
          # artifacts/solve-0/scikit-learn__scikit-learn-13779/navie/solution.json
          # artifacts/solve-0/scikit-learn__scikit-learn-13779/navie/plan
          # artifacts/solve-1
          # artifacts/solve-1/django__django-13779
          # ...
          # Create a solve directory and rsync the contents of each solve-* directory into it.
          # solution.json files and other structured data used by the report script are left as-is.
          mkdir -p solve
          for dir in artifacts/solve-*; do
            rsync -a $dir/ solve/
          done
          # Artifacts dir contains files like this:
          # artifacts/run_evaluation-1
          # artifacts/run_evaluation-1/...
          # artifacts/run_evaluation-0
          # artifacts/run_evaluation-0/...
          # Roughly mimic the run_evaluation directory structure, except don't worry about the model name or run_id
          mkdir -p logs/run_evaluation
          any_run_evaluation_dir=$(ls -d artifacts/run_evaluation-* | head -n 1)
          if [ -n "${any_run_evaluation_dir}" ]; then
            for dir in artifacts/run_evaluation-*; do
              rsync -a $dir/ logs/run_evaluation/
            done
          fi
          # The evaluation summaries are laid out like this:
          # artifacts/evaluations-1
          # artifacts/evaluations-1/navie_082024+gpt-4o.smoke.json
          # artifacts/evaluations-0
          # artifacts/evaluations-0/navie_082024+gpt-4o.smoke.json
          # These are left as-is; since they are JSON, the report script combines them itself.
          # Predictions are available as:
          # artifacts/predictions-1
          # artifacts/predictions-1/predictions.jsonl
          # artifacts/predictions-0
          # artifacts/predictions-0/predictions.jsonl
          # Concatenate these into a single file.
          for file in artifacts/predictions-*/*.jsonl; do
            cat $file >> predictions.jsonl
          done
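          # At this point predictions.jsonl, solve/, and logs/run_evaluation hold the merged
          # per-runner outputs; they are consumed by solver.report in the next step.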
      - name: Report results
        run: |
          . ./venv/bin/activate
          python -m solver.report \
            --predictions_path predictions.jsonl \
            --solve_data_dir solve \
            --evaluation_logs_dir logs/run_evaluation
      - name: Report solution.json files
        uses: actions/upload-artifact@v4
        with:
          name: solutions
          path: solve/**/solution.json
      - name: Report test_patch.json files
        uses: actions/upload-artifact@v4
        with:
          name: test-patch
          path: solve/**/test_patch.json
      - name: Upload report
        uses: actions/upload-artifact@v4
        with:
          name: report
          path: report.csv