From e2ea4e7b311f7a88d2ecdd21639fd29061f6758e Mon Sep 17 00:00:00 2001
From: Ivan Leo
Date: Sun, 29 Sep 2024 21:48:06 +0800
Subject: [PATCH] Added requirements.txt

---
 requirements.txt              |   7 +-
 scripts/evaluate_psle_math.py | 222 ++++++++++++++++++++++++++++++++++
 scripts/process_math.py       |  64 ++++++++++
 3 files changed, 292 insertions(+), 1 deletion(-)
 create mode 100644 scripts/evaluate_psle_math.py
 create mode 100644 scripts/process_math.py

diff --git a/requirements.txt b/requirements.txt
index 13ec322..f106315 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,8 @@ huggingface-hub==0.22.2
 idna==3.7
 iniconfig==2.0.0
 inline-snapshot==0.8.0
-instructor==1.4.0
+instructor==1.4.3
+datasets==2.21.0
 langsmith==0.1.48
 markdown-it-py==3.0.0
 mdurl==0.1.2
@@ -60,3 +61,7 @@ typing-extensions==4.11.0
 tzdata==2024.1
 urllib3==2.2.1
 yarl==1.9.4
+boto3==1.35.29
+autoevals==0.0.87
+braintrust==0.0.155
+google-generativeai==0.5.4

diff --git a/scripts/evaluate_psle_math.py b/scripts/evaluate_psle_math.py
new file mode 100644
index 0000000..aa75155
--- /dev/null
+++ b/scripts/evaluate_psle_math.py
@@ -0,0 +1,222 @@
+from datasets import load_dataset
+from asyncio import run
+from braintrust import Eval
+import instructor
+from autoevals.value import ExactMatch
+from typing import Union, Literal
+import google.generativeai as genai
+from pydantic import BaseModel, ConfigDict
+import json
+from PIL.PngImagePlugin import PngImageFile
+from anthropic import AsyncAnthropic
+from openai import AsyncOpenAI
+import base64
+import httpx
+
+
+class Fraction(BaseModel):
+    whole: int
+    numerator: int
+    denominator: int
+
+
+class MultipleChoice(BaseModel):
+    choice: Literal[1, 2, 3, 4]
+
+
+class Number(BaseModel):
+    value: float
+
+
+class EvaluationItem(BaseModel):
+    input: PngImageFile
+    expected: Union[Fraction, MultipleChoice, Number]
+    metadata: dict[str, str]
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+class MultipleChoiceResponse(BaseModel):
+    chain_of_thought: str
+    answer: MultipleChoice
+
+
+class Response(BaseModel):
+    chain_of_thought: str
+    answer: Union[Fraction, Number]
+
+
+def format_dataset_braintrust(dataset):
+    for row in dataset:
+        row = {
+            **row["objects"],
+            "image": row["image"],
+            "data": json.loads(row["objects"]["data"]),
+        }
+        if row["type"] == "multiple_choice":
+            expected_output = MultipleChoice(choice=row["data"]["choice"])
+        elif row["type"] == "number":
+            expected_output = Number(value=row["data"]["value"])
+        elif row["type"] == "fraction":
+            expected_output = Fraction(
+                whole=row["data"]["whole"],
+                numerator=row["data"]["numerator"],
+                denominator=row["data"]["denominator"],
+            )
+        yield EvaluationItem(
+            input=row["image"],
+            expected=expected_output,
+            metadata={"id": row["id"], "type": row["type"]},
+        )
+
+
+def generate_questions():
+    for row in format_dataset_braintrust(load_dataset("567-labs/psle-math")["train"]):
+        yield row
+
+
+def get_client(provider: Literal["gemini", "openai", "anthropic"], model: str = ""):
+    if provider == "gemini":
+        return instructor.from_gemini(
+            genai.GenerativeModel(model_name=model),
+            mode=instructor.Mode.GEMINI_JSON,
+            use_async=True,
+        )
+    elif provider == "openai":
+        return instructor.from_openai(AsyncOpenAI())
+    elif provider == "anthropic":
+        return instructor.from_anthropic(AsyncAnthropic())
+
+
+def get_response_model(type: Literal["multiple_choice", "number", "fraction"]):
+    if type == "multiple_choice":
+        return MultipleChoiceResponse
+    else:
+        return Response
+
+
+async def generate_gemini_response(client, input, hooks):
+    response_model = get_response_model(input["type"])
+
+    resp = await client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": "Solve the following question. Make sure to think through your answer step by step before you provide the final answer",
+            },
+            {
+                "role": "user",
+                "content": input["image"],
+            },
+        ],
+        response_model=response_model,
+    )
+    hooks.meta(
+        chain_of_thought=resp.chain_of_thought,
+    )
+    return resp.answer
+
+
+async def generate_openai_response(client, input, hooks):
+    response_model = get_response_model(input["type"])
+
+    resp = await client.chat.completions.create(
+        model=input["model"],
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Solve the following question. Make sure to think through your answer step by step before you provide the final answer",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": input["image_url"]["url"],
+                        },
+                    },
+                ],
+            }
+        ],
+        response_model=response_model,
+    )
+    hooks.meta(
+        chain_of_thought=resp.chain_of_thought,
+    )
+    return resp.answer
+
+
+async def generate_anthropic_response(client, input, hooks):
+    response_model = get_response_model(input["type"])
+
+    resp = await client.messages.create(
+        model=input["model"],
+        max_tokens=4096,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Solve the following question. Make sure to think through your answer step by step before you provide the final answer"},
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/png",
+                            "data": base64.b64encode(
+                                httpx.get(input["image_url"]["url"]).content
+                            ).decode("utf-8"),
+                        },
+                    },
+                ],
+            },
+        ],
+        response_model=response_model,
+    )
+    hooks.meta(
+        chain_of_thought=resp.chain_of_thought,
+    )
+    return resp.answer
+
+
+async def main():
+    provider = "anthropic"
+    model = "claude-3-5-sonnet-20240620"
+
+    client = get_client(provider, model)
+    dataset = list(generate_questions())
+
+    async def task(input, hooks):
+        if provider == "gemini":
+            return await generate_gemini_response(client, input, hooks)
+        elif provider == "openai":
+            return await generate_openai_response(client, input, hooks)
+        elif provider == "anthropic":
+            return await generate_anthropic_response(client, input, hooks)
+
+    await Eval(
+        name="567-labs/psle-math-evals",
+        data=[
+            {
+                "input": {
+                    "image_url": {
+                        "url": f"https://r2-worker.evals.workers.dev/{row.metadata['id']}.png",
+                    },
+                    "image": row.input,
+                    "type": row.metadata["type"],
+                    "model": model,
+                },
+                "expected": row.expected,
+                "metadata": row.metadata,
+            }
+            for row in dataset
+        ],
+        task=task,
+        scores=[ExactMatch],
+        metadata={"model": model, "provider": provider},
+        max_concurrency=10,
+    )
+
+
+if __name__ == "__main__":
+    run(main())
diff --git a/scripts/process_math.py b/scripts/process_math.py
new file mode 100644
index 0000000..42fc03b
--- /dev/null
+++ b/scripts/process_math.py
@@ -0,0 +1,64 @@
+import json
+import os
+from datasets import load_dataset
+import boto3
+from tqdm import tqdm
+import io
+
+
+def extract_relevant_files(parent_path: str):
+    metadata = {}
+
+    for root, _, files in os.walk(parent_path):
+        for file in files:
+            if file == "solution.json":
+                solution_path = os.path.abspath(os.path.join(root, "solution.json"))
+                with open(solution_path, "r") as f:
+                    solutions = json.load(f)
+
+                for solution in solutions:
+                    metadata[solution["id"]] = {
+                        "file_name": os.path.abspath(
+                            os.path.join(
+                                os.path.dirname(solution_path), solution["id"] + ".png"
+                            )
+                        ),
+                        "objects": {
+                            "data": json.dumps(solution["data"]),
+                            "type": solution["type"],
+                            "id": solution["id"],
+                        },
+                    }
+
+    return metadata
+
+
+def upload_images_to_r2(bucket: str, folder_name: str):
+    for root, _, files in os.walk(folder_name):
+        for file in tqdm(files, desc="Uploading images to R2"):
+            if file.endswith(".png"):
+                file_path = os.path.join(root, file)
+                with open(file_path, "rb") as file_content:
+                    s3.upload_fileobj(io.BytesIO(file_content.read()), bucket, file)
+
+
+if __name__ == "__main__":
+    bucket_name = "math-evals"
+    folder_name = "./scripts/data"
+
+    s3 = boto3.client(
+        service_name="s3",
+        endpoint_url=f"https://{os.environ['CLOUDFLARE_ACCOUNT_ID']}.r2.cloudflarestorage.com",
+        aws_access_key_id=os.environ["CLOUDFLARE_ACCESS_KEY_ID"],
+        aws_secret_access_key=os.environ["CLOUDFLARE_SECRET_ACCESS_KEY_ID"],
+        region_name="auto",
+    )
+    # upload_images_to_r2("math-evals", folder_name)
+    metadata = extract_relevant_files(folder_name)
+    metadata_file_path = os.path.join(folder_name, "metadata.jsonl")
+    with open(metadata_file_path, "w") as metadata_file:
+        for entry in metadata.values():
+            metadata_file.write(json.dumps(entry) + "\n")
+
+    dataset = load_dataset("imagefolder", data_dir=folder_name)
+    dataset.push_to_hub("567-labs/psle-math")
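
Note: a quick sanity check after running process_math.py is to read one row back from the pushed dataset and confirm that the objects metadata round-trips in the shape evaluate_psle_math.py expects. This is a minimal sketch outside the patch itself, and it assumes the 567-labs/psle-math dataset is accessible with your Hugging Face credentials:

import json
from datasets import load_dataset

# Load the split that evaluate_psle_math.py evaluates against.
dataset = load_dataset("567-labs/psle-math")["train"]

# Each row carries the question image plus the serialized answer that
# extract_relevant_files() wrote into metadata.jsonl.
row = dataset[0]
print(row["objects"]["id"], row["objects"]["type"])
print(json.loads(row["objects"]["data"]))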