Adding code to benchmark different models on a set of Singaporean math problems #12

Open · wants to merge 1 commit into base: main
7 changes: 6 additions & 1 deletion requirements.txt
@@ -22,7 +22,8 @@ huggingface-hub==0.22.2
idna==3.7
iniconfig==2.0.0
inline-snapshot==0.8.0
-instructor==1.4.0
+instructor==1.4.3
+datasets==2.21.0
langsmith==0.1.48
markdown-it-py==3.0.0
mdurl==0.1.2
@@ -60,3 +61,7 @@ typing-extensions==4.11.0
tzdata==2024.1
urllib3==2.2.1
yarl==1.9.4
+boto3==1.35.29
+autoevals==0.0.87
+braintrust==0.0.155
+google-generativeai==0.5.4
222 changes: 222 additions & 0 deletions scripts/evaluate_psle_math.py
@@ -0,0 +1,222 @@
from datasets import load_dataset
from asyncio import run
from braintrust import Eval
import instructor
from autoevals.value import ExactMatch
from typing import Union, Literal
import google.generativeai as genai
from pydantic import BaseModel, ConfigDict
import json
from PIL.PngImagePlugin import PngImageFile
from anthropic import AsyncAnthropic
from openai import AsyncOpenAI
import base64
import httpx
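
# The dataset keys each question by answer format; the Pydantic models below
# mirror those formats so instructor can parse and validate the LLM's
# structured output against the expected answer type.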


class Fraction(BaseModel):
whole: int
numerator: int
denominator: int


class MultipleChoice(BaseModel):
choice: Literal[1, 2, 3, 4]


class Number(BaseModel):
value: float


class EvaluationItem(BaseModel):
input: PngImageFile
expected: Union[Fraction, MultipleChoice, Number]
metadata: dict[str, str]

model_config = ConfigDict(arbitrary_types_allowed=True)
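
# LLM response schemas: the free-text chain_of_thought field nudges the model
# to reason step by step before committing to the final structured answer.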


class MultipleChoiceResponse(BaseModel):
chain_of_thought: str
answer: MultipleChoice


class Response(BaseModel):
chain_of_thought: str
answer: Union[Fraction, Number]


def format_dataset_braintrust(dataset):
for row in dataset:
row = {
**row["objects"],
"image": row["image"],
"data": json.loads(row["objects"]["data"]),
}
if row["type"] == "multiple_choice":
expected_output = MultipleChoice(choice=row["data"]["choice"])
elif row["type"] == "number":
expected_output = Number(value=row["data"]["value"])
elif row["type"] == "fraction":
expected_output = Fraction(
whole=row["data"]["whole"],
numerator=row["data"]["numerator"],
denominator=row["data"]["denominator"],
)
yield EvaluationItem(
input=row["image"],
expected=expected_output,
metadata={"id": row["id"], "type": row["type"]},
)


def generate_questions():
    yield from format_dataset_braintrust(load_dataset("567-labs/psle-math")["train"])
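

# Provider clients are wrapped with instructor so completions are parsed
# directly into the Pydantic response models above. Gemini binds the model
# name at client construction; OpenAI and Anthropic take it per request.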


def get_client(provider: Literal["gemini", "openai", "anthropic"], model: str = ""):
if provider == "gemini":
return instructor.from_gemini(
genai.GenerativeModel(model_name=model),
mode=instructor.Mode.GEMINI_JSON,
use_async=True,
)
elif provider == "openai":
return instructor.from_openai(AsyncOpenAI())
elif provider == "anthropic":
return instructor.from_anthropic(AsyncAnthropic())


def get_response_model(question_type: Literal["multiple_choice", "number", "fraction"]):
    if question_type == "multiple_choice":
        return MultipleChoiceResponse
    return Response
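

# One response generator per provider: each sends the same solve prompt plus
# the question image, logs the chain of thought through Braintrust hooks, and
# returns only the structured answer for exact-match scoring.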


async def generate_gemini_response(client, input, hooks):
response_model = get_response_model(input["type"])

resp = await client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Solve the following question. Make sure to think through your answer step by step before you provide the final answer",
},
{
"role": "user",
"content": input["image"],
},
],
response_model=response_model,
)
hooks.meta(
chain_of_thought=resp.chain_of_thought,
)
return resp.answer


async def generate_openai_response(client, input, hooks):
response_model = get_response_model(input["type"])

resp = await client.chat.completions.create(
model=input["model"],
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Solve the following question. Make sure to think through your answer step by step before you provide the final answer",
},
{
"type": "image_url",
"image_url": {
"url": input["image_url"]["url"],
},
},
],
}
],
response_model=response_model,
)
hooks.meta(
chain_of_thought=resp.chain_of_thought,
)
return resp.answer


async def generate_anthropic_response(client, input, hooks):
response_model = get_response_model(input["type"])

resp = await client.messages.create(
model=input["model"],
max_tokens=4096,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Describe this image."},
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/png",
"data": base64.b64encode(
httpx.get(input["image_url"]["url"]).content
).decode("utf-8"),
},
},
],
},
],
response_model=response_model,
)
hooks.meta(
chain_of_thought=resp.chain_of_thought,
)
return resp.answer


async def main():
provider = "anthropic"
model = "claude-3-5-sonnet-20240620"

client = get_client(provider, model)
dataset = list(generate_questions())

async def task(input, hooks):
if provider == "gemini":
return await generate_gemini_response(client, input, hooks)
elif provider == "openai":
return await generate_openai_response(client, input, hooks)
elif provider == "anthropic":
return await generate_anthropic_response(client, input, hooks)
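
    # Each EvaluationItem is unpacked into a plain dict for Braintrust: the
    # hosted R2 copy of the image for URL-based APIs, the in-memory PIL image
    # for Gemini, plus the question type and the model name to use.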

await Eval(
name="567-labs/psle-math-evals",
data=[
{
"input": {
"image_url": {
"url": f"https://r2-worker.evals.workers.dev/{row.metadata['id']}.png",
},
"image": row.input,
"type": row.metadata["type"],
"model": model,
},
"expected": row.expected,
"metadata": row.metadata,
}
for row in dataset
],
task=task,
scores=[ExactMatch],
metadata={"model": model, "provider": provider},
max_concurrency=10,
)


if __name__ == "__main__":
run(main())
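
As a quick smoke test before a full Braintrust run, a single question can be exercised directly with the helpers above (a sketch, not part of this diff; DummyHooks is a hypothetical stand-in for the Braintrust hooks object, and the model name is only an example):

import asyncio

class DummyHooks:
    # Hypothetical stand-in for Braintrust's hooks; just echoes metadata.
    def meta(self, **kwargs):
        print(kwargs)

async def smoke_test():
    client = get_client("openai")
    row = next(generate_questions())  # first EvaluationItem from the dataset
    answer = await generate_openai_response(
        client,
        {
            "type": row.metadata["type"],
            "model": "gpt-4o",  # assumption: any vision-capable OpenAI model
            "image_url": {
                "url": f"https://r2-worker.evals.workers.dev/{row.metadata['id']}.png"
            },
        },
        DummyHooks(),
    )
    print(answer)

asyncio.run(smoke_test())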
64 changes: 64 additions & 0 deletions scripts/process_math.py
@@ -0,0 +1,64 @@
import json
import os
from datasets import load_dataset
import boto3
from tqdm import tqdm
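
# Walks the data folder for solution.json files and builds, for each question
# id, the image path plus a JSON-serialised answer payload for the HF dataset.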


def extract_relevant_files(parent_path: str):
metadata = {}

for root, _, files in os.walk(parent_path):
for file in files:
if file == "solution.json":
solution_path = os.path.abspath(os.path.join(root, "solution.json"))
with open(solution_path, "r") as f:
solutions = json.load(f)

for solution in solutions:
metadata[solution["id"]] = {
"file_name": os.path.abspath(
os.path.join(
os.path.dirname(solution_path), solution["id"] + ".png"
)
),
"objects": {
"data": json.dumps(solution["data"]),
"type": solution["type"],
"id": solution["id"],
},
}

return metadata


def upload_images_to_r2(s3, bucket: str, folder_name: str):
    for root, _, files in os.walk(folder_name):
        for file in tqdm(files, desc="Uploading images to R2"):
            if file.endswith(".png"):
                file_path = os.path.join(root, file)
                # boto3 accepts any readable file-like object here.
                with open(file_path, "rb") as file_content:
                    s3.upload_fileobj(file_content, bucket, file)


if __name__ == "__main__":
bucket_name = "math-evals"
folder_name = "./scripts/data"
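
    # R2 is S3-compatible, so a boto3 client pointed at the Cloudflare endpoint
    # works; the three CLOUDFLARE_* environment variables must be set first.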

s3 = boto3.client(
service_name="s3",
endpoint_url=f"https://{os.environ['CLOUDFLARE_ACCOUNT_ID']}.r2.cloudflarestorage.com",
aws_access_key_id=os.environ["CLOUDFLARE_ACCESS_KEY_ID"],
aws_secret_access_key=os.environ["CLOUDFLARE_SECRET_ACCESS_KEY_ID"],
region_name="auto",
)
    # Uncomment on first run to push the question images to R2.
    # upload_images_to_r2(s3, bucket_name, folder_name)
metadata = extract_relevant_files(folder_name)
metadata_file_path = os.path.join(folder_name, "metadata.jsonl")
with open(metadata_file_path, "w") as metadata_file:
for entry in metadata.values():
metadata_file.write(json.dumps(entry) + "\n")

dataset = load_dataset("imagefolder", data_dir=folder_name)
dataset.push_to_hub("567-labs/psle-math")
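
For reference, each line of the generated metadata.jsonl pairs an image with its serialised answer; an illustrative entry (hypothetical id, path, and value) looks like:

{"file_name": "/path/to/scripts/data/0001.png", "objects": {"data": "{\"value\": 42.0}", "type": "number", "id": "0001"}}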