From 5f669ab63e8b8dc81a4198ee3424492a7044a59c Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 10 Jan 2025 18:44:28 +0800
Subject: [PATCH 1/4] Update optillm.py

---
 optillm.py | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/optillm.py b/optillm.py
index 0fba254..b504626 100644
--- a/optillm.py
+++ b/optillm.py
@@ -416,16 +416,23 @@ def parse_conversation(messages):
 
 def tagged_conversation_to_messages(response_text):
     """Convert a tagged conversation string or list of strings into a list of messages.
+    If the input doesn't contain User:/Assistant: tags, return it as is.
 
     Args:
        response_text: Either a string containing "User:" and "Assistant:" tags,
                       or a list of such strings.
 
     Returns:
-        If input is a string: A list of message dictionaries.
-        If input is a list: A list of lists of message dictionaries.
+        If input has tags: A list of message dictionaries.
+        If input has no tags: The original input.
     """
+    def has_conversation_tags(text):
+        return "User:" in text or "Assistant:" in text
+
     def process_single_response(text):
+        if not has_conversation_tags(text):
+            return text
+
         messages = []
         # Split on "User:" or "Assistant:" while keeping the delimiter
         parts = re.split(r'(?=(User:|Assistant:))', text.strip())
@@ -447,7 +454,11 @@ def process_single_response(text):
         return messages
 
     if isinstance(response_text, list):
-        return [process_single_response(text) for text in response_text]
+        processed = [process_single_response(text) for text in response_text]
+        # If none of the responses had tags, return original list
+        if all(isinstance(p, str) for p in processed):
+            return response_text
+        return processed
     else:
         return process_single_response(response_text)
 
@@ -555,14 +566,18 @@ def proxy():
     except Exception as e:
         logger.error(f"Error processing request: {str(e)}")
         return jsonify({"error": str(e)}), 500
-    
+
     # Convert tagged conversation to messages format if needed
     if isinstance(response, list):
-        response = [msg[-1]['content'] if isinstance(msg, list) and msg else msg
-                   for msg in tagged_conversation_to_messages(response)]
+        processed_response = tagged_conversation_to_messages(response)
+        # If processed_response is a list of message lists, extract last message content
+        if processed_response != response:  # Only process if format changed
+            response = [msg[-1]['content'] if isinstance(msg, list) and msg else msg
+                       for msg in processed_response]
+        # Otherwise keep original response
     else:
         messages = tagged_conversation_to_messages(response)
-        if messages:  # Only take the last message if we have any
+        if isinstance(messages, list) and messages:  # Only process if format changed
            response = messages[-1]['content']
 
     if stream:
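
The patch above makes tagged_conversation_to_messages a pass-through for untagged text. A minimal sketch of the intended behavior in plain Python (the helper and the regex are copied from the hunk; the sample strings are made up):

    import re

    def has_conversation_tags(text):
        # Same check PATCH 1 adds inside tagged_conversation_to_messages
        return "User:" in text or "Assistant:" in text

    plain = "The final answer is 42."
    tagged = "User: What is 6*7? Assistant: The final answer is 42."

    print(has_conversation_tags(plain))   # False -> returned unchanged
    print(has_conversation_tags(tagged))  # True  -> parsed into messages
    # The split matches at a zero-width lookahead whose capture group is
    # included in the result, so each tag is kept alongside its segment:
    print(re.split(r'(?=(User:|Assistant:))', tagged.strip()))
    # ['', 'User:', 'User: What is 6*7? ', 'Assistant:', 'Assistant: The final answer is 42.']
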
From 6f114b93e0f7943491a7f68313e717421aba7938 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 10 Jan 2025 18:50:47 +0800
Subject: [PATCH 2/4] Update eval_aime_benchmark.py - handle multiple responses as multiple attempts

---
 scripts/eval_aime_benchmark.py | 53 ++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/scripts/eval_aime_benchmark.py b/scripts/eval_aime_benchmark.py
index 84a3166..343a510 100644
--- a/scripts/eval_aime_benchmark.py
+++ b/scripts/eval_aime_benchmark.py
@@ -4,7 +4,7 @@
 import logging
 import re
 import time
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, Union
 from datetime import datetime
 from openai import OpenAI
 from datasets import load_dataset
@@ -89,9 +89,17 @@ def extract_answer(response: str) -> Optional[int]:
     return None
 
-def get_llm_response(problem: str, model: str) -> str:
+def get_llm_response(problem: str, model: str) -> Union[str, List[Dict]]:
     """
     Get response from the LLM for a given problem.
+    If multiple choices are returned, formats them as attempt dictionaries.
+
+    Args:
+        problem (str): The problem text
+        model (str): The model identifier
+
+    Returns:
+        Union[str, List[Dict]]: Either a string response or list of attempt dictionaries
     """
     try:
         response = client.with_options(timeout=1000.0).chat.completions.create(
@@ -101,7 +109,23 @@
             ],
             max_tokens=8192,
         )
+
+        # If there's more than one choice, format as attempts
+        if len(response.choices) > 1:
+            attempts = []
+            for i, choice in enumerate(response.choices):
+                response_text = choice.message.content.strip()
+                predicted_answer = extract_answer(response_text)
+                attempts.append({
+                    "attempt_number": i + 1,
+                    "response": response_text,
+                    "predicted_answer": predicted_answer
+                })
+            return attempts
+
+        # If single choice, return as before
         return response.choices[0].message.content.strip()
+
     except Exception as e:
         logger.error(f"Error getting LLM response: {e}")
         return ""
@@ -119,14 +143,25 @@ def make_n_attempts(problem: str, model: str, n: int) -> List[Dict]:
         List[Dict]: List of dictionaries containing response and predicted answer for each attempt
     """
     attempts = []
-    for i in range(n):
+    remaining_attempts = n
+
+    while remaining_attempts > 0:
         response = get_llm_response(problem, model)
-        predicted_answer = extract_answer(response)
-        attempts.append({
-            "attempt_number": i + 1,
-            "response": response,
-            "predicted_answer": predicted_answer
-        })
+
+        # If response is already formatted as attempts
+        if isinstance(response, list):
+            attempts.extend(response[:remaining_attempts])  # Only take what we need
+            remaining_attempts -= len(response)
+        else:
+            # Process single response as before
+            predicted_answer = extract_answer(response)
+            attempts.append({
+                "attempt_number": len(attempts) + 1,
+                "response": response,
+                "predicted_answer": predicted_answer
+            })
+            remaining_attempts -= 1
+
     return attempts
 
 def evaluate_pass_at_n(attempts: List[Dict], correct_answer: int) -> Tuple[bool, Optional[int]]:

From 92798cb3ef0cda0d898e34798ce5e9a530545c57 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 10 Jan 2025 19:27:29 +0800
Subject: [PATCH 3/4] Update eval_aime_benchmark.py

---
 scripts/eval_aime_benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/eval_aime_benchmark.py b/scripts/eval_aime_benchmark.py
index 343a510..c834a2f 100644
--- a/scripts/eval_aime_benchmark.py
+++ b/scripts/eval_aime_benchmark.py
@@ -150,8 +150,8 @@ def make_n_attempts(problem: str, model: str, n: int) -> List[Dict]:
 
         # If response is already formatted as attempts
         if isinstance(response, list):
-            attempts.extend(response[:remaining_attempts])  # Only take what we need
-            remaining_attempts -= len(response)
+            attempts.extend(response)
+            remaining_attempts = n - len(attempts)
         else:
             # Process single response as before
             predicted_answer = extract_answer(response)
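
Taken together, PATCH 2 and PATCH 3 change the retry loop so that one multi-choice completion can satisfy several attempts. A self-contained sketch of the accounting after PATCH 3 (count_attempts and batch_sizes are illustrative stand-ins, not code from the script; batch_sizes models how many choices each call happens to return):

    from typing import Dict, List

    def count_attempts(n: int, batch_sizes: List[int]) -> int:
        # Mirrors make_n_attempts after PATCH 3: a list response contributes
        # its whole batch, and the remaining budget is recomputed from the
        # running total rather than decremented by a truncated slice.
        attempts: List[Dict] = []
        remaining = n
        batches = iter(batch_sizes)
        while remaining > 0:
            size = next(batches)
            attempts.extend({"attempt_number": len(attempts) + j + 1} for j in range(size))
            remaining = n - len(attempts)
        return len(attempts)

    print(count_attempts(4, [1, 1, 1, 1]))  # 4: single responses behave as before
    print(count_attempts(4, [3, 3]))        # 6: a large final batch can overshoot n

One consequence of dropping the [:remaining_attempts] slice is that the run may record more than n attempts when the final batch is large, so downstream pass@n accounting needs to trim or tolerate the extra entries.
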
From f96b4350ad0f75955169746fc47f5a7051408214 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Sat, 11 Jan 2025 07:53:05 +0800
Subject: [PATCH 4/4] Update setup.py - prepare for new release

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 2b98a20..25068c2 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="optillm",
-    version="0.0.23",
+    version="0.0.24",
     packages=find_packages(),
     py_modules=['optillm'],
     package_data={
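
End to end, the series lets the AIME benchmark treat the several choices of one completion as separate attempts. A usage sketch, assuming an optillm proxy on localhost (the base_url, api_key, and model name below are placeholders for a particular deployment, not values taken from these patches):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "<AIME problem text>"}],
        n=3,  # request three choices in one call
        max_tokens=8192,
    )
    # With len(response.choices) > 1, get_llm_response returns a list of
    # attempt dictionaries and make_n_attempts counts them against n.
    print(len(response.choices))
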