From 5f669ab63e8b8dc81a4198ee3424492a7044a59c Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 10 Jan 2025 18:44:28 +0800
Subject: [PATCH 1/4] Update optillm.py

---
 optillm.py | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/optillm.py b/optillm.py
index 0fba254..b504626 100644
--- a/optillm.py
+++ b/optillm.py
@@ -416,16 +416,23 @@ def parse_conversation(messages):
 
 def tagged_conversation_to_messages(response_text):
     """Convert a tagged conversation string or list of strings into a list of messages.
+    If the input doesn't contain User:/Assistant: tags, return it as is.
 
     Args:
        response_text: Either a string containing "User:" and "Assistant:" tags,
                       or a list of such strings.
 
     Returns:
-        If input is a string: A list of message dictionaries.
-        If input is a list: A list of lists of message dictionaries.
+        If input has tags: A list of message dictionaries.
+        If input has no tags: The original input.
     """
+    def has_conversation_tags(text):
+        return "User:" in text or "Assistant:" in text
+
     def process_single_response(text):
+        if not has_conversation_tags(text):
+            return text
+
         messages = []
         # Split on "User:" or "Assistant:" while keeping the delimiter
         parts = re.split(r'(?=(User:|Assistant:))', text.strip())
@@ -447,7 +454,11 @@ def process_single_response(text):
         return messages
 
     if isinstance(response_text, list):
-        return [process_single_response(text) for text in response_text]
+        processed = [process_single_response(text) for text in response_text]
+        # If none of the responses had tags, return original list
+        if all(isinstance(p, str) for p in processed):
+            return response_text
+        return processed
     else:
         return process_single_response(response_text)
 
@@ -555,14 +566,18 @@ def proxy():
     except Exception as e:
         logger.error(f"Error processing request: {str(e)}")
         return jsonify({"error": str(e)}), 500
-    
+
     # Convert tagged conversation to messages format if needed
     if isinstance(response, list):
-        response = [msg[-1]['content'] if isinstance(msg, list) and msg else msg
-                   for msg in tagged_conversation_to_messages(response)]
+        processed_response = tagged_conversation_to_messages(response)
+        # If processed_response is a list of message lists, extract last message content
+        if processed_response != response:  # Only process if format changed
+            response = [msg[-1]['content'] if isinstance(msg, list) and msg else msg
+                       for msg in processed_response]
+        # Otherwise keep original response
     else:
         messages = tagged_conversation_to_messages(response)
-        if messages:  # Only take the last message if we have any
+        if isinstance(messages, list) and messages:  # Only process if format changed
            response = messages[-1]['content']
 
     if stream:
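
The patch above makes tagged_conversation_to_messages a pass-through for untagged text. A minimal sketch of the intended behavior in plain Python (the helper and the regex are copied from the hunk; the sample strings are made up):

    import re

    def has_conversation_tags(text):
        # Same check PATCH 1 adds inside tagged_conversation_to_messages
        return "User:" in text or "Assistant:" in text

    plain = "The final answer is 42."
    tagged = "User: What is 6*7? Assistant: The final answer is 42."

    print(has_conversation_tags(plain))   # False -> returned unchanged
    print(has_conversation_tags(tagged))  # True  -> parsed into messages
    # The split matches at a zero-width lookahead whose capture group is
    # included in the result, so each tag is kept alongside its segment:
    print(re.split(r'(?=(User:|Assistant:))', tagged.strip()))
    # ['', 'User:', 'User: What is 6*7? ', 'Assistant:', 'Assistant: The final answer is 42.']
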
From 6f114b93e0f7943491a7f68313e717421aba7938 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 10 Jan 2025 18:50:47 +0800
Subject: [PATCH 2/4] Update eval_aime_benchmark.py - handle multiple responses as multiple attempts

---
 scripts/eval_aime_benchmark.py | 53 ++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/scripts/eval_aime_benchmark.py b/scripts/eval_aime_benchmark.py
index 84a3166..343a510 100644
--- a/scripts/eval_aime_benchmark.py
+++ b/scripts/eval_aime_benchmark.py
@@ -4,7 +4,7 @@
 import logging
 import re
 import time
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, Union
 from datetime import datetime
 from openai import OpenAI
 from datasets import load_dataset
@@ -89,9 +89,17 @@ def extract_answer(response: str) -> Optional[int]:
     return None
 
-def get_llm_response(problem: str, model: str) -> str:
+def get_llm_response(problem: str, model: str) -> Union[str, List[Dict]]:
     """
     Get response from the LLM for a given problem.
+    If multiple choices are returned, formats them as attempt dictionaries.
+
+    Args:
+        problem (str): The problem text
+        model (str): The model identifier
+
+    Returns:
+        Union[str, List[Dict]]: Either a string response or list of attempt dictionaries
     """
     try:
         response = client.with_options(timeout=1000.0).chat.completions.create(
@@ -101,7 +109,23 @@
             ],
             max_tokens=8192,
         )
+
+        # If there's more than one choice, format as attempts
+        if len(response.choices) > 1:
+            attempts = []
+            for i, choice in enumerate(response.choices):
+                response_text = choice.message.content.strip()
+                predicted_answer = extract_answer(response_text)
+                attempts.append({
+                    "attempt_number": i + 1,
+                    "response": response_text,
+                    "predicted_answer": predicted_answer
+                })
+            return attempts
+
+        # If single choice, return as before
         return response.choices[0].message.content.strip()
+
     except Exception as e:
         logger.error(f"Error getting LLM response: {e}")
         return ""
@@ -119,14 +143,25 @@ def make_n_attempts(problem: str, model: str, n: int) -> List[Dict]:
         List[Dict]: List of dictionaries containing response and predicted answer for each attempt
     """
     attempts = []
-    for i in range(n):
+    remaining_attempts = n
+
+    while remaining_attempts > 0:
         response = get_llm_response(problem, model)
-        predicted_answer = extract_answer(response)
-        attempts.append({
-            "attempt_number": i + 1,
-            "response": response,
-            "predicted_answer": predicted_answer
-        })
+
+        # If response is already formatted as attempts
+        if isinstance(response, list):
+            attempts.extend(response[:remaining_attempts])  # Only take what we need
+            remaining_attempts -= len(response)
+        else:
+            # Process single response as before
+            predicted_answer = extract_answer(response)
+            attempts.append({
+                "attempt_number": len(attempts) + 1,
+                "response": response,
+                "predicted_answer": predicted_answer
+            })
+            remaining_attempts -= 1
+
     return attempts
 
 def evaluate_pass_at_n(attempts: List[Dict], correct_answer: int) -> Tuple[bool, Optional[int]]:

From 92798cb3ef0cda0d898e34798ce5e9a530545c57 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 10 Jan 2025 19:27:29 +0800
Subject: [PATCH 3/4] Update eval_aime_benchmark.py

---
 scripts/eval_aime_benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/eval_aime_benchmark.py b/scripts/eval_aime_benchmark.py
index 343a510..c834a2f 100644
--- a/scripts/eval_aime_benchmark.py
+++ b/scripts/eval_aime_benchmark.py
@@ -150,8 +150,8 @@ def make_n_attempts(problem: str, model: str, n: int) -> List[Dict]:
 
         # If response is already formatted as attempts
         if isinstance(response, list):
-            attempts.extend(response[:remaining_attempts])  # Only take what we need
-            remaining_attempts -= len(response)
+            attempts.extend(response)
+            remaining_attempts = n - len(attempts)
         else:
             # Process single response as before
             predicted_answer = extract_answer(response)
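
Taken together, PATCH 2 and PATCH 3 change the retry loop so that one multi-choice completion can satisfy several attempts. A self-contained sketch of the accounting after PATCH 3 (count_attempts and batch_sizes are illustrative stand-ins, not code from the script; batch_sizes models how many choices each call happens to return):

    from typing import Dict, List

    def count_attempts(n: int, batch_sizes: List[int]) -> int:
        # Mirrors make_n_attempts after PATCH 3: a list response contributes
        # its whole batch, and the remaining budget is recomputed from the
        # running total rather than decremented by a truncated slice.
        attempts: List[Dict] = []
        remaining = n
        batches = iter(batch_sizes)
        while remaining > 0:
            size = next(batches)
            attempts.extend({"attempt_number": len(attempts) + j + 1} for j in range(size))
            remaining = n - len(attempts)
        return len(attempts)

    print(count_attempts(4, [1, 1, 1, 1]))  # 4: single responses behave as before
    print(count_attempts(4, [3, 3]))        # 6: a large final batch can overshoot n

One consequence of dropping the [:remaining_attempts] slice is that the run may record more than n attempts when the final batch is large, so downstream pass@n accounting needs to trim or tolerate the extra entries.
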
From f96b4350ad0f75955169746fc47f5a7051408214 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Sat, 11 Jan 2025 07:53:05 +0800
Subject: [PATCH 4/4] Update setup.py - prepare for new release

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 2b98a20..25068c2 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="optillm",
-    version="0.0.23",
+    version="0.0.24",
     packages=find_packages(),
     py_modules=['optillm'],
     package_data={
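
End to end, the series lets the AIME benchmark treat the several choices of one completion as separate attempts. A usage sketch, assuming an optillm proxy on localhost (the base_url, api_key, and model name below are placeholders for a particular deployment, not values taken from these patches):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "<AIME problem text>"}],
        n=3,  # request three choices in one call
        max_tokens=8192,
    )
    # With len(response.choices) > 1, get_llm_response returns a list of
    # attempt dictionaries and make_n_attempts counts them against n.
    print(len(response.choices))
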