diff --git a/docusaurus.config.ts b/docusaurus.config.ts index 709646b..43050b9 100644 --- a/docusaurus.config.ts +++ b/docusaurus.config.ts @@ -98,6 +98,10 @@ const config: Config = { label: 'Craft vs Cruft', href: 'https://www.youtube.com/channel/UC4nEbAo5xFsOZDk2v0RIGHA', }, + { + label: 'More', + href: '/resources', + }, ], }, { @@ -108,13 +112,14 @@ const config: Config = { // href: 'https://stackoverflow.com/questions/tagged/docusaurus', // }, { - label: 'nopilot.dev Discord', + label: 'Nopilot Discord', href: 'https://discord.gg/k3hzFm5ykA', }, { - label: 'Resources', - href: '/resources', + label: 'Nopilot YouTube', + href: 'https://www.youtube.com/@nopilot-dev', }, + // { // label: 'Twitter', // href: 'https://twitter.com/docusaurus', diff --git a/src/components/HomepageFeatures/index.tsx b/src/components/HomepageFeatures/index.tsx index 5459085..6f9c8b8 100644 --- a/src/components/HomepageFeatures/index.tsx +++ b/src/components/HomepageFeatures/index.tsx @@ -128,11 +128,14 @@ export default function HomepageFeatures(): JSX.Element {
Updates
-

Blog: Dissecting Devin

+ +
+
+
- - -
diff --git a/src/pages/leaderboards.mdx b/src/pages/leaderboards.mdx index 2a4655e..6b4f6ad 100644 --- a/src/pages/leaderboards.mdx +++ b/src/pages/leaderboards.mdx @@ -20,6 +20,30 @@ ML researcher [theblackcat102](https://github.com/theblackcat102) [reports](http Paul Gauthier [points out](https://github.com/princeton-nlp/SWE-bench/issues/72) that some SWE-bench cases appear to be underspecified and effectively impossible to solve because the tests rely on implementation detail. It's unclear what the maximum possible score is. +## Aider Leaderboards + +The coding agent Aider maintains a [leaderboard](https://aider.chat/docs/leaderboards) of model performance within its key subtasks. + +### Code Editing + +- openai/gpt-4o +- claude-3-opus +- gpt-4 (0613) +- gpt-4-turbo (2024-04-09) +- deepseek-chat v2 (Open Weight) +- gpt-3.5-turbo +- gemini-1.5-pro +- claude-3-sonnet +- deepseek-coder (Open Weight) + +### Code Refactoring + +- claude-3-opus +- openai/gpt-4o +- gpt-4 (1106-preview) +- gemini-1.5-pro +- gpt-4-turbo (2024-04-09) + ## LiveCodeBench [LiveCodeBench](https://livecodebench.github.io/leaderboard.html): "Holistic and Contamination Free Evaluation of Large Language Models for Code" @@ -31,10 +55,15 @@ Tests the strength of models across different coding sub-tasks. * Test Output Prediction * Code Execution -*Last checked: 2024-04-10* -* Proprietary Leaders: GPT-4-Turbo-2024-04-09, Claude-3-Opus -* Open Weight Leaders: [WizardCoder-33B-V1.1](https://huggingface.co/WizardLM/WizardCoder-33B-V1.1), [deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct), [CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) +The following list of standout models across subtasks is subjective. 
+*Last checked: 2024-05-14* +* Proprietary Leaders: GPT-4o, GPT-4-Turbo, Claude-3-Opus +* Open Weight Leaders: + * [LLama3-70b-Ins](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) + * [WizardCoder-33B-V1.1](https://huggingface.co/WizardLM/WizardCoder-33B-V1.1) + * [deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) + * [Phind-34B-V2](https://huggingface.co/Phind/Phind-CodeLlama-34B-v2) ## Other notable benchmarks diff --git a/src/pages/resources.md b/src/pages/resources.md index 8a42a1c..d3bc459 100644 --- a/src/pages/resources.md +++ b/src/pages/resources.md @@ -6,25 +6,31 @@ title: Resources ## Community -* [Can we beat Devin? Discord](https://discord.gg/canwebeatdevin): shared space with several teams +* [nopilot.dev Discord](https://discord.gg/k3hzFm5ykA): discussion about the ecosystem * [OpenDevin Discord](https://discord.gg/mBuDGRzzES) -* [nopilot.dev Discord](https://discord.gg/k3hzFm5ykA): discussion about this hub +* [SWE-agent Discord](https://discord.gg/AVEFbBn2rH) +* OpenDevAI Discord -## Autonomous Coders (WebUX) +## Videos +* [nopilot.dev YouTube Channel](https://www.youtube.com/@nopilot-dev) +* [Playlist on Autonomous DevTools](https://www.youtube.com/playlist?list=PLUBjHzmgsFNf_9LrJlk2t0n7pGiOLVqoX) + +## Coding Agents (WebUX) -* Devin by Cognition * [OpenDevin](https://github.com/OpenDevin/OpenDevin) +* Devin by Cognition * [Devika](https://github.com/stitionai/devika) * [Anterion](https://github.com/MiscellaneousStuff/anterion): UX wrapping SWE-agent -## Autonomous Coders (Command-line) +## Coding Agents (Backend) * [AutoCodeRover](https://github.com/nus-apr/auto-code-rover): from NUS-apr, highest score on SWE-bench lite * [SWE-agent](https://swe-agent.com) from Princeton NLP, first Open Source agent to break 10% SWE-bench +* [Sweep](https://sweep.dev): Turn bugs into pull requests [Longer list](https://github.com/e2b-dev/awesome-ai-agents) by E2B. 
## Eval Tools * [SWE-bench](https://www.swebench.com/) * [moatless-tools](https://github.com/aorwall/moatless-tools) -* [SWE-bench-util](https://github.com/raymyers/swe-bench-util) +