From fc045849309ca2c4d1e3e641d9261570c9e43147 Mon Sep 17 00:00:00 2001
From: Dan McPherson
Date: Wed, 28 Aug 2024 12:57:35 -0400
Subject: [PATCH] Cleanup usage of load model answers

This removes the directory-walking logic from load_model_answers in
favor of finding the exact file needed. Since we no longer attempt to
support multiple models within a single eval, this simplification is
possible and avoids some confusing logic that existed to allow any
judge model to be passed while still using the built-in reference
answers.

Signed-off-by: Dan McPherson
---
 src/instructlab/eval/mt_bench_common.py   | 25 ++++-------------------
 src/instructlab/eval/mt_bench_judgment.py | 14 +++++++++----
 2 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py
index 5c094c7..a9c52c6 100644
--- a/src/instructlab/eval/mt_bench_common.py
+++ b/src/instructlab/eval/mt_bench_common.py
@@ -87,32 +87,15 @@ def load_questions(question_file: str, begin: Optional[int], end: Optional[int])
     return questions
 
 
-def load_model_answers(answer_dir: str, model_name=None, answer_file=None) -> dict:
-    """Load model answers.
+def load_model_answers(answer_file: str, model_name: str | None = None) -> dict:
+    """Load model answers from a single answer file
 
     The return value is a python dict of type:
     Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
     """
     logger.debug(locals())
-    model_answers = {}
-    if answer_file is not None:
-        filename = os.path.basename(answer_file)
-        # Removing ".jsonl"
-        file_model_name = filename[:-6]
-        model_answers[file_model_name] = _load_answers(answer_file)
-    else:
-        for root, _, files in os.walk(answer_dir):
-            for filename in files:
-                if filename.endswith(".jsonl"):
-                    # Removing ".jsonl"
-                    file_model_name = filename[:-6]
-                    file_path = os.path.join(root, filename)
-                    model_answers[model_name or file_model_name] = _load_answers(
-                        file_path
-                    )
-                    if model_name == file_model_name:
-                        logger.debug("Found answer file matching: %s", model_name)
-                        break
+    file_model_name = os.path.splitext(os.path.basename(answer_file))[0]
+    model_answers = {model_name or file_model_name: _load_answers(answer_file)}
 
     return model_answers
 
diff --git a/src/instructlab/eval/mt_bench_judgment.py b/src/instructlab/eval/mt_bench_judgment.py
index 5dc4b84..0d24012 100644
--- a/src/instructlab/eval/mt_bench_judgment.py
+++ b/src/instructlab/eval/mt_bench_judgment.py
@@ -162,7 +162,9 @@ def judge_model(
     """Judge the model based on questions and reference answers"""
     logger.debug(locals())
     package_data_dir = os.path.join(os.path.dirname(__file__), "data")
+    use_builtin_ref_answers = False
     if data_dir is None:
+        use_builtin_ref_answers = True
         data_dir = package_data_dir
 
     data_base_dir = bench_dir(data_dir, bench_name, branch)
@@ -172,15 +174,19 @@ def judge_model(
     question_file = os.path.join(data_base_dir, "question.jsonl")
     answer_file = os.path.join(output_base_dir, "model_answer", f"{model_name}.jsonl")
-    answer_dir = os.path.dirname(answer_file)
-    ref_answer_dir = os.path.join(data_base_dir, "reference_answer")
+    if use_builtin_ref_answers:
+        ref_answer_file = os.path.join(data_base_dir, "reference_answer", "gpt-4.jsonl")
+    else:
+        ref_answer_file = os.path.join(
+            data_base_dir, "reference_answer", f"{judge_model_name}.jsonl"
+        )
 
     # Load questions
     questions = load_questions(question_file, None, None)
 
     # Load answers
-    model_answers = load_model_answers(answer_dir, answer_file=answer_file)
-    ref_answers = load_model_answers(ref_answer_file, judge_model_name)
+    model_answers = load_model_answers(answer_file)
+    ref_answers = load_model_answers(ref_answer_file, judge_model_name)
 
     # Load judge
     judge_prompts = load_judge_prompts(judge_file)
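-- 
For readers of this patch: below is a minimal, standalone sketch of what
the simplified loader does, so the new behavior can be read in one place.
The helper name load_answers_sketch, the JSONL file layout, and the
"question_id" field are assumptions inferred from the docstring's
Dict[question_id -> answer] shape; only the splitext/basename keying and
the model_name override come from the patch itself.

# Standalone sketch (not the package's actual code) of the new
# load_model_answers behavior: one exact file, no directory walking.
import json
import os


def load_answers_sketch(answer_file: str, model_name: str | None = None) -> dict:
    # Assumes a JSONL file whose records each carry a "question_id" key,
    # matching the Dict[question_id -> answer] shape in the docstring.
    answers = {}
    with open(answer_file, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                record = json.loads(line)
                answers[record["question_id"]] = record
    # Key by the caller-supplied model name, else the basename without its
    # extension, mirroring the patched function's os.path.splitext logic.
    file_model_name = os.path.splitext(os.path.basename(answer_file))[0]
    return {model_name or file_model_name: answers}

For a file named gpt-4.jsonl, load_answers_sketch(path) returns
{"gpt-4": {...}}; passing model_name explicitly keys the same answers
under that name instead, which is how ref_answers above ends up keyed by
judge_model_name.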