From bcabd6d6bbde256e2a7bc3eee7aeb03afe83d00c Mon Sep 17 00:00:00 2001 From: Ray Myers Date: Wed, 10 Apr 2024 19:02:11 -0500 Subject: [PATCH] Add leaderboard --- docusaurus.config.ts | 1 + src/pages/leaderboards.md | 44 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 src/pages/leaderboards.md diff --git a/docusaurus.config.ts b/docusaurus.config.ts index a94266f..12fbcda 100644 --- a/docusaurus.config.ts +++ b/docusaurus.config.ts @@ -71,6 +71,7 @@ const config: Config = { // label: 'Tutorial', // }, {to: '/blog', label: 'News', position: 'left'}, + {to: '/leaderboards', label: 'Leaderboards', position: 'left'}, { href: 'https://github.com/facebook/docusaurus', label: 'GitHub', diff --git a/src/pages/leaderboards.md b/src/pages/leaderboards.md new file mode 100644 index 0000000..cdd876d --- /dev/null +++ b/src/pages/leaderboards.md @@ -0,0 +1,44 @@ +--- +title: Leaderboards +--- + +# Leaderboards + +## SWE-bench +**The gold standard**. Released in September 2023 by Princeton NLP, SWE-bench is the most widely accepted measure of an agent's ability to solve tasks in a realistic codebase. + + + +*Last checked: 2024-04-10* +| Rank | Agent | Score | Score (lite) | Status | Group | License | +| ---- | -------------------- | ------ | ------------ | ----------------- | ------------ | ----------------------- | +| 1 | [auto-code-rover](https://github.com/nus-apr/auto-code-rover) | - | 22.3% | Reported | APR@NUS | GPL-3 | +| 2 | [SWE-agent](https://swe-agent.com/) + GPT 4 | 12.29% | 17% | Official | Princeton NLP | MIT | +| 3 | Devin | 13.48% | - | Reported, sample | Cognition | Proprietary | + + + + +An "assisted" score means the agent is told which files need to be modified; unassisted agents must locate the relevant files themselves. + +## LiveCodeBench + +[LiveCodeBench](https://livecodebench.github.io/leaderboard.html): "Holistic and Contamination Free Evaluation of Large Language Models for Code" + +Tests the strength of models across different coding sub-tasks.
+ +* Code Generation +* Self-Repair +* Test Output Prediction +* Code Execution + +*Last checked: 2024-04-10* +* Proprietary Leaders: GPT-4-Turbo-2024-04-09, Claude-3-Opus +* Open Weight Leaders: [WizardCoder-33B-V1.1](https://huggingface.co/WizardLM/WizardCoder-33B-V1.1), [deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct), [CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) + + + + +## HumanEval + +[Link](https://paperswithcode.com/sota/code-generation-on-humaneval) \ No newline at end of file