From 0b872bc9044b75a23c83dd7904963e946bbd1fd6 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Tue, 5 Mar 2024 14:18:51 -0800 Subject: [PATCH] Leaderboard update: mistral-large-2402, gemini-1.0-pro and gemma 7-b; Update REST eval. (#237) Update leaderboard data and wagon wheel data to reflect new models and the updated rest API eval result. Change leaderboard to include `mistral-large-2402`, `gemini-1.0-pro`, and `gemma-7b`. Change leaderboard evals to be consistent with #234 new REST eval. --- index.js | 69 ++++++++++++--- leaderboard.html | 215 +++++++++++++++++++++++------------------------ 2 files changed, 164 insertions(+), 120 deletions(-) diff --git a/index.js b/index.js index c21a6eb04..bb955839a 100644 --- a/index.js +++ b/index.js @@ -142,7 +142,7 @@ const data = { datasets: [ { label: 'GPT-4-0125', - data: [87.50, 82.18, 90.00, 90.00, 91.00, 54.12, 76.00, 70.00, 55.00], + data: [87.50, 82.18, 90.00, 90.00, 91.00, 67.06, 76.00, 70.00, 55.00], fill: true, backgroundColor: 'rgba(255, 206, 86, 0.1)', borderColor: 'rgb(255, 206, 86)', @@ -152,7 +152,7 @@ const data = { pointHoverBorderColor: 'rgb(255, 206, 86)' }, { label: 'GPT-4-1106', - data: [88.75, 81.64, 92.00, 89.50, 92.00, 53.53, 72.00, 62.00, 50.00], + data: [88.75, 81.64, 92.00, 89.50, 92.00, 70.00, 72.00, 62.00, 50.00], fill: true, backgroundColor: 'rgba(75, 192, 192, 0.1)', borderColor: 'rgb(75, 192, 192)', @@ -163,7 +163,7 @@ const data = { hidden: true }, { label: 'OpenFunctions-v2', - data: [71.67, 88.73, 79.50, 89.50, 78.00, 78.82, 76.00, 74.00, 60.00], + data: [71.67, 88.73, 79.50, 89.50, 78.00, 80.00, 76.00, 74.00, 60.00], fill: true, backgroundColor: 'rgba(153, 102, 255, 0.1)', borderColor: 'rgb(153, 102, 255)', @@ -173,7 +173,7 @@ const data = { pointHoverBorderColor: 'rgb(153, 102, 255)' }, { label: 'GPT-3.5-Turbo', - data: [68.33, 81.27, 87.50, 88.00, 88.00, 74.12, 70.00, 74.00, 47.50], + data: [68.33, 81.27, 87.50, 88.00, 88.00, 80.00, 70.00, 74.00, 47.50], fill: true, backgroundColor: 'rgba(255, 159, 64, 0.1)', borderColor: 'rgb(255, 159, 64)', @@ -184,7 +184,7 @@ const data = { hidden: true }, { label: 'Mistral-medium', - data: [90.00, 80.18, 71.00, 84.50, 68.00, 75.88, 62.00, 72.00, 47.50], + data: [90.00, 80.18, 71.00, 84.50, 68.00, 78.24, 62.00, 72.00, 47.50], fill: true, backgroundColor: 'rgba(54, 162, 235, 0.1)', borderColor: 'rgb(54, 162, 235)', @@ -195,7 +195,7 @@ const data = { hidden: true }, { label: 'Claude-2.1', - data: [78.33, 85.64, 72.00, 83.00, 56.50, 61.18, 60.00, 48.00, 45.00], + data: [78.33, 85.64, 72.00, 83.00, 56.50, 63.53, 60.00, 48.00, 45.00], fill: true, backgroundColor: 'rgba(163, 73, 164, 0.1)', borderColor: 'rgb(163, 73, 164)', @@ -206,7 +206,7 @@ const data = { hidden: true }, { label: 'Mistral-tiny', - data: [77.08, 59.27, 53.50, 59.50, 41.50, 58.24, 42.00, 64.00, 40.00], + data: [77.08, 59.27, 53.50, 59.50, 41.50, 63.53, 42.00, 64.00, 40.00], fill: true, backgroundColor: 'rgba(255, 105, 180, 0.1)', borderColor: 'rgb(255, 105, 180)', @@ -217,7 +217,7 @@ const data = { hidden: true }, { label: 'Claude-instant', - data: [61.67, 68.73, 53.00, 59.00, 39.50, 51.76, 50.00, 52.00, 37.50], + data: [61.67, 68.73, 53.00, 59.00, 39.50, 56.47, 50.00, 52.00, 37.50], fill: true, backgroundColor: 'rgba(255, 165, 0, 0.1)', borderColor: 'rgb(255, 165, 0)', @@ -228,7 +228,7 @@ const data = { hidden: true }, { label: 'Nexusflow-Raven-v2', - data: [0.00, 76.55, 39.50, 83.50, 34.00, 45.88, 68.00, 78.00, 45.00], + data: [0.00, 76.55, 39.50, 83.50, 34.00, 58.24, 68.00, 78.00, 45.00], fill: true, backgroundColor: 'rgba(60, 179, 113, 0.1)', borderColor: 'rgb(60, 179, 113)', @@ -239,7 +239,7 @@ const data = { hidden: true }, { label: 'Mistral-small', - data: [89.58, 46.55, 48.50, 68.00, 58.00, 14.12, 40.00, 30.00, 37.50], + data: [89.58, 46.55, 48.50, 68.00, 58.00, 32.35, 40.00, 30.00, 37.50], fill: true, backgroundColor: 'rgba(0, 0, 255, 0.1)', borderColor: 'rgb(0, 0, 255)', @@ -250,7 +250,7 @@ const data = { hidden: true }, { label: 'GPT-4-0613', - data: [87.08, 74.55, 4.00, 86.00, 0.00, 37.65, 0.00, 50.00, 0.00], + data: [87.08, 74.55, 4.00, 86.00, 0.00, 44.12, 0.00, 50.00, 0.00], fill: true, backgroundColor: 'rgba(128, 0, 0, 0.1)', borderColor: 'rgb(128, 0, 0)', @@ -261,7 +261,7 @@ const data = { hidden: true }, { label: 'Deepseek-v1.5', - data: [66.25, 48.36, 35.00, 61.00, 43.50, 5.29, 0.00, 2.00, 7.50], + data: [66.25, 48.36, 35.00, 61.00, 43.50, 24.70, 0.00, 2.00, 7.50], fill: true, backgroundColor: 'rgba(255, 215, 0, 0.1)', borderColor: 'rgb(255, 215, 0)', @@ -292,8 +292,53 @@ const data = { pointHoverBackgroundColor: '#fff', pointHoverBorderColor: 'rgb(255, 99, 132)', hidden: true + }, { + label: 'Gemini-1.0-Pro', + data: [77.50, 78.43, 89, 4.00, 0.00, 63.77, 62.00, 0.00, 0.00], + fill: true, + backgroundColor: 'rgba(218, 112, 214, 0.1)', + borderColor: 'rgb(218, 112, 214)', + pointBackgroundColor: 'rgb(218, 112, 214)', + pointBorderColor: '#fff', + pointHoverBackgroundColor: '#fff', + pointHoverBorderColor: 'rgb(218, 112, 214)', + hidden: true + }, { + label: 'Mistral-large-2402', + data: [84.58, 71.82, 90.50, 4.00, 0.00, 67.06, 66.00, 0.00, 5.00], + fill: true, + backgroundColor: 'rgba(65, 105, 225, 0.1)', + borderColor: 'rgb(65, 105, 225)', + pointBackgroundColor: 'rgb(65, 105, 225)', + pointBorderColor: '#fff', + pointHoverBackgroundColor: '#fff', + pointHoverBorderColor: 'rgb(65, 105, 225)', + hidden: true + }, { + label: 'Firefunction-v1', + data: [81.25, 73.19, 87.00, 4.00, 0.00, 61.76, 64.00, 0.00, 5.00], + fill: true, + backgroundColor: 'rgba(0, 255, 255, 0.1)', + borderColor: 'rgb(0, 255, 255)', + pointBackgroundColor: 'rgb(0, 255, 255)', + pointBorderColor: '#fff', + pointHoverBackgroundColor: '#fff', + pointHoverBorderColor: 'rgb(0, 255, 255)', + hidden: true + }, { + label: 'Gemma', + data: [0.42, 61.45, 60.00, 41.00, 32.50, 44.71, 46.00, 44.00, 25.50], + fill: true, + backgroundColor: 'rgba(85, 107, 47, 0.1)', + borderColor: 'rgb(85, 107, 47)', + pointBackgroundColor: 'rgb(85, 107, 47)', + pointBorderColor: '#fff', + pointHoverBackgroundColor: '#fff', + pointHoverBorderColor: 'rgb(85, 107, 47)', + hidden: true }, + ] }; diff --git a/leaderboard.html b/leaderboard.html index d18075996..d4ba2c317 100644 --- a/leaderboard.html +++ b/leaderboard.html @@ -121,27 +121,6 @@

Leaderboard

1 - 83.80 - - GPT-4-0125-Preview - - OpenAI - Proprietary - 88.30 - 63.78 - 87.50 - 82.18 - 90.00 - 90.00 - 91.00 - 54.12 - 70.00 - 76.00 - 55.00 - 87.50 - - - 2 83.55 GPT-4-1106-Preview @@ -149,18 +128,39 @@

Leaderboard

OpenAI Proprietary 88.78 - 59.38 + 63.50 88.75 81.64 89.50 92.00 92.00 - 53.53 + 70.00 62.00 72.00 50.00 88.75 + + 2 + 83.80 + + GPT-4-0125-Preview + + OpenAI + Proprietary + 88.30 + 67.02 + 87.50 + 82.18 + 90.00 + 90.00 + 91.00 + 67.06 + 70.00 + 76.00 + 55.00 + 87.50 + 3 83.55 @@ -170,13 +170,13 @@

Leaderboard

Gorilla LLM Apache 2.0 83.93 - 72.20 + 72.50 71.67 88.73 89.50 79.50 78.00 - 78.82 + 80.00 74.00 76.00 60.00 @@ -191,13 +191,13 @@

Leaderboard

OpenAI Proprietary 86.19 - 66.41 + 67.88 68.33 81.27 88.00 87.50 88.00 - 74.12 + 80.00 74.00 70.00 47.50 @@ -212,13 +212,13 @@

Leaderboard

Mistral AI Proprietary 75.92 - 64.34 + 64.94 90.00 80.18 84.50 71.00 68.00 - 75.88 + 78.24 72.00 62.00 47.50 @@ -233,13 +233,13 @@

Leaderboard

Anthropic Proprietary 74.28 - 53.55 + 54.13 78.33 85.64 83.00 72.00 56.50 - 61.18 + 63.53 48.00 60.00 45.00 @@ -254,13 +254,13 @@

Leaderboard

Mistral AI Proprietary 53.44 - 51.06 + 52.38 77.08 59.27 59.50 53.50 41.50 - 58.24 + 63.53 64.00 42.00 40.00 @@ -275,13 +275,13 @@

Leaderboard

Anthropic Proprietary 55.06 - 47.81 + 48.99 61.67 68.73 59.00 53.00 39.50 - 51.76 + 56.47 52.00 50.00 37.50 @@ -289,48 +289,69 @@

Leaderboard

9 + 54.99 + + Gemini-1.0-Pro + + Google + Proprietary + 42.86 + 31.44 + 77.50 + 78.43 + 89 + 4.00 + 0.00 + 63.77 + 62.00 + 0.00 + 0.00 + 77.50 + + + 10 55.84 - Mistral-large + Mistral-large-2402 Mistral AI Proprietary 41.58 - 33.19 + 34.52 84.58 71.82 90.50 4.00 0.00 - 61.76 + 67.06 66.00 0.00 5.00 84.58 - 10 - 54.99 + 11 + 53.86 - Gemini-1.0-Pro + Mistral-small - Google + Mistral AI Proprietary - 42.86 - 27.03 - 77.50 - 78.43 - 89 - 4.00 - 0.00 - 46.12 - 62.00 - 0.00 - 0.00 - 77.50 + 55.26 + 34.96 + 89.58 + 46.55 + 68.00 + 48.50 + 58.00 + 32.35 + 30.00 + 40.00 + 37.50 + 89.58 - 11 + 12 54.46 Nexusflow-Raven-v2 @@ -338,20 +359,20 @@

Leaderboard

Nexusflow Apache 2.0 58.39 - 59.22 + 62.31 0.00 76.55 83.50 39.50 34.00 - 45.88 + 58.24 78.00 68.00 45.00 0.00 - 12 + 13 53.95 Firefunction-v1 @@ -359,39 +380,18 @@

Leaderboard

Fireworks-ai Apache 2.0 41.05 - 29.31 + 32.69 81.25 73.19 87.00 4.00 0.00 - 48.24 + 61.76 64.00 0.00 5.00 81.25 - - 13 - 53.86 - - Mistral-small - - Mistral AI - Proprietary - 55.26 - 30.41 - 89.58 - 46.55 - 68.00 - 48.50 - 58.00 - 14.12 - 30.00 - 40.00 - 37.50 - 89.58 - 14 53.49 @@ -401,13 +401,13 @@

Leaderboard

OpenAI Proprietary 41.14 - 21.91 + 23.53 87.08 74.55 86.00 4.00 0.00 - 37.65 + 44.12 50.00 0.00 0.00 @@ -415,27 +415,6 @@

Leaderboard

15 - 44.46 - - Gemma - - Google - gemma-term-of-use - 48.74 - 40.34 - 0.42 - 61.45 - 60.00 - 41.00 - 32.50 - 45.88 - 46.00 - 44.00 - 25.50 - 0.42 - - - 16 43.19 Deepseek-v1.5 @@ -443,18 +422,39 @@

Leaderboard

Deepseek Deepseek License 46.97 - 3.7 + 8.55 66.25 48.36 61.00 35.00 43.50 - 5.29 + 24.70 2.00 0.00 7.50 66.25 + + 16 + 44.46 + + Gemma + + Google + gemma-term-of-use + 48.74 + 40.05 + 0.42 + 61.45 + 60.00 + 41.00 + 32.50 + 44.71 + 46.00 + 44.00 + 25.50 + 0.42 + 17 33.61 @@ -581,8 +581,7 @@

Function Calling Demo

Output will be shown here:
-
OpenAI compatible format output - here:
+
OpenAI compatible format output here: