test_fixup 3,3,3,3 64k sonnet #411

Workflow file for this run

name: Run the benchmark
on:
  workflow_dispatch:
    inputs:
      llm:
        description: "LLM model to use"
        type: choice
        required: true
        options:
          - gemini-1.5-pro-002
          - gpt-4o-2024-08-06
          - gpt-4o-2024-05-13
          - claude-3-5-sonnet-20240620
          - claude-3-5-sonnet-20241022
          - claude-3-haiku-20240307
          - o1-preview-2024-09-12
          - o1-mini-2024-09-12
      instance_set:
        description: "Instance set to solve"
        type: string
      limits:
        description: "Limits to apply to the solver"
        type: choice
        required: true
        options:
          - default
          # Test solvers
          - test_files=3 test_status_retry=3 code_files=0
          # Code solvers building synthetic tests
          - test_files=3 test_status_retry=1 code_files=3 code_status_retry=3
          - test_files=4 test_status_retry=1 code_files=4 code_status_retry=2
          # Test + code solvers
          - test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=1
          - test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=2
          - test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=4
          - test_files=3 test_status_retry=3 code_files=3 code_status_retry=3
          - test_files=4 test_status_retry=2 code_files=4 code_status_retry=2
          # Code file selection
          - test_files=0 code_files=3 code_status_retry=0
          # Code solvers using only known synthetic tests (ensure that "Use synthetic tests" is enabled)
          - test_files=0 code_files=3 code_status_retry=3
          - test_files=0 code_files=4 code_status_retry=2
          - test_files=0 code_files=6 code_status_retry=3
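          # Note: each option above is a single space-separated string of key=value limits.
          # Based on how it is used in the solve job below, the chosen string appears to be passed
          # straight through to `solver.solve --limit` and split on whitespace there, e.g.
          # "test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=2"
          # becomes `--limit test_files=2 test_status_retry=2 code_files=2 code_status_retry=2 concurrency=2`.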
      context_tokens:
        description: "LLM token limit to apply to the solver"
        type: string
        required: false
        default: "8000"
      use_synthetic_tests:
        description: "Use synthetic tests"
        type: boolean
        required: false
        default: true
      observe_synthetic_tests:
        description: "Observe synthetic tests"
        type: boolean
        required: false
        default: false
      choose_code_files_only:
        description: "Choose code files only"
        type: boolean
        required: false
        default: false
      runner:
        description: "Runner type"
        required: true
        default: SWE-Bench_Larger
        type: choice
        options:
          - ubuntu-latest
          - swe-bench-ubuntu-latest
          - SWE-Bench_Larger
      num_runners:
        description: "Number of runners to split the workload across"
        required: true
        default: "2"
      name:
        description: "Assign a name to the workflow run"
        type: string
        required: false
  pull_request:
    types: [opened, synchronize, reopened, labeled]
run-name: ${{ inputs.name || github.event.pull_request.title || github.event.workflow.name }}
permissions:
  contents: read
  pull-requests: read
  packages: write
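# Job layout (summary of what follows): show-inputs echoes the dispatch inputs,
# build-appmap-js and prepare-matrix run up front, solve fans out across a matrix of
# num_runners runner indices, and report merges the per-runner artifacts into a single
# report. The run name falls back from the dispatch `name` input to the pull request
# title to the workflow name.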
jobs:
  show-inputs:
    runs-on: 'ubuntu-latest'
    steps:
      - name: Display Input Values
        run: |
          echo "llm: ${{ github.event.inputs.llm }}"
          echo "instance_set: ${{ github.event.inputs.instance_set }}"
          echo "limits: ${{ github.event.inputs.limits }}"
          echo "context_tokens: ${{ github.event.inputs.context_tokens }}"
          echo "use_synthetic_tests: ${{ github.event.inputs.use_synthetic_tests }}"
          echo "observe_synthetic_tests: ${{ github.event.inputs.observe_synthetic_tests }}"
          echo "choose_code_files_only: ${{ github.event.inputs.choose_code_files_only }}"
          echo "runner: ${{ github.event.inputs.runner }}"
          echo "num_runners: ${{ github.event.inputs.num_runners }}"
          echo "name: ${{ github.event.inputs.name }}"
  build-appmap-js:
    uses: ./.github/workflows/build_appmap_js.yml
  prepare-matrix:
    runs-on: ubuntu-latest
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
    outputs:
      matrix: ${{ steps.prepare-matrix.outputs.matrix }}
    steps:
      - name: Prepare matrix
        id: prepare-matrix
        run: |
          num_runners=${NUM_RUNNERS:-2}
          echo "Number of runners: $num_runners"
          indices=$(seq 0 $(($num_runners - 1)) | jq -R 'tonumber' | jq -s -c .)
          echo "Matrix: $indices"
          echo "matrix=$indices" >> $GITHUB_OUTPUT
  solve:
    if: ${{ contains(github.event.pull_request.labels.*.name, 'test-solve') || github.event_name == 'workflow_dispatch' }}
    needs:
      - build-appmap-js
      - prepare-matrix
    runs-on: ${{ inputs.runner || 'SWE-Bench_Larger' }}
    continue-on-error: true
    strategy:
      matrix:
        index: ${{ fromJson(needs['prepare-matrix'].outputs.matrix) }}
    env:
      NUM_RUNNERS: ${{ inputs.num_runners }}
      INSTANCE_SET: ${{ inputs.instance_set }}
      LIMITS: ${{ inputs.limits }}
      CONTEXT_TOKENS: ${{ inputs.context_tokens }}
      LLM: ${{ inputs.llm }}
      USE_SYNTHETIC_TESTS: ${{ inputs.use_synthetic_tests }}
      OBSERVE_SYNTHETIC_TESTS: ${{ inputs.observe_synthetic_tests }}
      CHOOSE_CODE_FILES_ONLY: ${{ inputs.choose_code_files_only }}
    defaults:
      run:
        shell: bash -leo pipefail {0}
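        # bash -l starts a login shell, -e exits on the first failing command, and
        # -o pipefail makes a pipeline fail when any stage fails; {0} is the script file
        # GitHub Actions generates for each run step.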
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.12'
      # Restore the appmap-js build
      - name: Restore appmap-js build
        uses: actions/cache/restore@v4
        id: cache-appmap-js
        with:
          fail-on-cache-miss: true
          path: |
            submodules/appmap-js/node_modules
            submodules/appmap-js/packages/*/built
            submodules/appmap-js/packages/*/dist
            submodules/appmap-js/packages/*/node_modules
          key: appmap-js-dist-${{ runner.os }}-${{ hashFiles('.git/modules/submodules/appmap-js/HEAD') }}
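          # The key hashes the appmap-js submodule's HEAD ref, so presumably the cached build
          # is invalidated (and rebuilt by build_appmap_js.yml) whenever the submodule is bumped.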
      - name: Set up Node.js
        if: steps.cache-appmap-js.outputs.cache-hit != 'true'
        uses: actions/setup-node@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Solve instances
        run: |
          pip install virtualenv
          virtualenv venv
          . ./venv/bin/activate
          pip install -e .
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          export APPMAP_COMMAND="node $(pwd)/submodules/appmap-js/packages/cli/built/cli.js"
          git config --global init.defaultBranch main
          git config --global user.email "[email protected]"
          git config --global user.name "GitHub Workflow"
          llm="${LLM:-gpt-4o}"
          export APPMAP_NAVIE_MODEL="${llm}"
          if [[ $llm == "gpt-"* || $llm == "o1-"* ]]; then
            export OPENAI_API_KEY="${{ secrets.OPENAI_API_KEY }}"
          elif [[ $llm == "claude"* ]]; then
            export ANTHROPIC_API_KEY="${{ secrets.ANTHROPIC_API_KEY }}"
          elif [[ $llm == gemini* ]]; then
            export APPMAP_NAVIE_MINI_MODEL="${llm}"
            export GOOGLE_WEB_CREDENTIALS='${{ secrets.GOOGLE_WEB_CREDENTIALS }}'
          else
            echo "Unknown LLM model: $llm"
            exit 1
          fi
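          # Provider selection above is keyed off the model-name prefix: gpt-*/o1-* use
          # OPENAI_API_KEY, claude* uses ANTHROPIC_API_KEY, and gemini* uses Google web
          # credentials (and is also set as the Navie mini model). Any other name aborts the job.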
          instance_set="${INSTANCE_SET:-smoke}"
          context_tokens="${CONTEXT_TOKENS:-}"
          use_synthetic_tests="${USE_SYNTHETIC_TESTS:-true}"
          observe_synthetic_tests="${OBSERVE_SYNTHETIC_TESTS:-false}"
          choose_code_files_only="${CHOOSE_CODE_FILES_ONLY:-false}"
          limits="${LIMITS:-default}"
          num_runners="${NUM_RUNNERS:-2}"
          runner_index="${{ matrix.index }}"
          # When running the smoke test with "default" limits, remove some of the precomputed test patches
          # to be sure that the solver is actually solving the instances. This is used for testing the
          # solver itself in CI.
          if [ "${instance_set}" == "smoke" ] && [ "${limits}" == "default" ]; then
            rm -f data/test_patches/pytest-dev__pytest-10051.json
            rm -f data/test_patches/django__django-14559.json
          fi
          # If context_tokens is not empty, prepend it to the limits variable
          if [ -n "${context_tokens}" ]; then
            # Clear limits if it's "default"
            [ "${limits}" == "default" ] && limits=""
            limits="context_tokens=${context_tokens} ${limits}"
          fi
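          # Worked example for the block above (hypothetical values): with context_tokens=64000
          # and limits="test_files=3 test_status_retry=3 code_files=3 code_status_retry=3",
          # limits becomes "context_tokens=64000 test_files=3 test_status_retry=3 code_files=3 code_status_retry=3";
          # with limits="default" it becomes just "context_tokens=64000".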
          python -m solver.prepare_images \
            --instance_set "${instance_set}" \
            --num_runners "${num_runners}" \
            --runner_index "${runner_index}"
          if [ "${use_synthetic_tests}" == "false" ]; then
            echo Removing synthetic tests by deleting data/test_patches
            rm -rf data/test_patches
          fi
          python -m solver.solve \
            --instance_set "${instance_set}" \
            $( [ "${limits}" != "default" ] && echo "--limit ${limits}" ) \
            $( [ "${observe_synthetic_tests}" == "true" ] && echo "--observe_tests" ) \
            $( [ "${choose_code_files_only}" == "true" ] && echo "--choose_code_files_only" ) \
            --num_runners "${num_runners}" \
            --runner_index "${runner_index}"
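          # The `touch` below guarantees predictions.jsonl exists even if the solver emitted no
          # predictions, presumably so the evaluation harness doesn't fail on a missing file.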
          touch predictions.jsonl
          python -m swebench.harness.run_evaluation \
            --predictions_path predictions.jsonl \
            --run_id "${instance_set}"
          echo "Saving evaluation results"
          mkdir -p evaluations
          cp -r navie_*.${instance_set}.json evaluations/
          find evaluations
      - name: Report predictions
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: predictions-${{ matrix.index }}
          path: predictions.jsonl
      - name: Report harness logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: run_evaluation-${{ matrix.index }}
          path: logs/run_evaluation
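      # The next step uploads the solver's working directory; the !solve/*/source exclusion
      # appears to drop the checked-out source trees so the artifact stays small.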
      - name: Report solver logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          retention-days: 14
          name: solve-${{ matrix.index }}
          path: |
            solve
            !solve/*/source
  report:
    needs:
      - solve
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Restore artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          pip install virtualenv
          virtualenv venv
          . ./venv/bin/activate
          pip install -e .
          # The artifacts dir contains files like this:
          # artifacts/solve-0
          # artifacts/solve-0/scikit-learn__scikit-learn-13779
          # artifacts/solve-0/scikit-learn__scikit-learn-13779/navie
          # artifacts/solve-0/scikit-learn__scikit-learn-13779/navie/solution.json
          # artifacts/solve-0/scikit-learn__scikit-learn-13779/navie/plan
          # artifacts/solve-1
          # artifacts/solve-1/django__django-13779
          # ...
          # Create a solve directory and rsync the contents of each solve-* directory into it.
          # solution.json files and other structured data used by the report script are left as-is.
          mkdir -p solve
          for dir in artifacts/solve-*; do
            rsync -a $dir/ solve/
          done
          # Artifacts dir contains files like this:
          # artifacts/run_evaluation-1
          # artifacts/run_evaluation-1/...
          # artifacts/run_evaluation-0
          # artifacts/run_evaluation-0/...
          # Roughly mimic the run_evaluation directory structure, except don't worry about the model name or run_id
          mkdir -p logs/run_evaluation
          any_run_evaluation_dir=$(ls -d artifacts/run_evaluation-* | head -n 1)
          if [ -n "${any_run_evaluation_dir}" ]; then
            for dir in artifacts/run_evaluation-*; do
              rsync -a $dir/ logs/run_evaluation/
            done
          fi
          # The evaluation summaries are laid out like this:
          # artifacts/evaluations-1
          # artifacts/evaluations-1/navie_082024+gpt-4o.smoke.json
          # artifacts/evaluations-0
          # artifacts/evaluations-0/navie_082024+gpt-4o.smoke.json
          # These are left as-is; since they are JSON, the report script combines them itself.
          # Predictions are available as:
          # artifacts/predictions-1
          # artifacts/predictions-1/predictions.jsonl
          # artifacts/predictions-0
          # artifacts/predictions-0/predictions.jsonl
          # Concatenate these into a single file.
          for file in artifacts/predictions-*/*.jsonl; do
            cat $file >> predictions.jsonl
          done
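          # At this point predictions.jsonl, solve/, and logs/run_evaluation hold the merged
          # per-runner outputs; they are consumed by solver.report in the next step.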
      - name: Report results
        run: |
          . ./venv/bin/activate
          python -m solver.report \
            --predictions_path predictions.jsonl \
            --solve_data_dir solve \
            --evaluation_logs_dir logs/run_evaluation
      - name: Report solution.json files
        uses: actions/upload-artifact@v4
        with:
          name: solutions
          path: solve/**/solution.json
      - name: Report test_patch.json files
        uses: actions/upload-artifact@v4
        with:
          name: test-patch
          path: solve/**/test_patch.json
      - name: Upload report
        uses: actions/upload-artifact@v4
        with:
          name: report
          path: report.csv