diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py
index 5c094c7..a9c52c6 100644
--- a/src/instructlab/eval/mt_bench_common.py
+++ b/src/instructlab/eval/mt_bench_common.py
@@ -87,32 +87,15 @@ def load_questions(question_file: str, begin: Optional[int], end: Optional[int])
     return questions
 
 
-def load_model_answers(answer_dir: str, model_name=None, answer_file=None) -> dict:
-    """Load model answers.
+def load_model_answers(answer_file: str, model_name: str | None = None) -> dict:
+    """Load model answers from a single answer file
 
     The return value is a python dict of type:
     Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
     """
     logger.debug(locals())
-    model_answers = {}
-    if answer_file is not None:
-        filename = os.path.basename(answer_file)
-        # Removing ".jsonl"
-        file_model_name = filename[:-6]
-        model_answers[file_model_name] = _load_answers(answer_file)
-    else:
-        for root, _, files in os.walk(answer_dir):
-            for filename in files:
-                if filename.endswith(".jsonl"):
-                    # Removing ".jsonl"
-                    file_model_name = filename[:-6]
-                    file_path = os.path.join(root, filename)
-                    model_answers[model_name or file_model_name] = _load_answers(
-                        file_path
-                    )
-                    if model_name == file_model_name:
-                        logger.debug("Found answer file matching: %s", model_name)
-                        break
+    file_model_name = os.path.splitext(os.path.basename(answer_file))[0]
+    model_answers = {model_name or file_model_name: _load_answers(answer_file)}
 
     return model_answers
diff --git a/src/instructlab/eval/mt_bench_judgment.py b/src/instructlab/eval/mt_bench_judgment.py
index 5dc4b84..0d24012 100644
--- a/src/instructlab/eval/mt_bench_judgment.py
+++ b/src/instructlab/eval/mt_bench_judgment.py
@@ -162,7 +162,9 @@ def judge_model(
     """Judge the model based on questions and reference answers"""
     logger.debug(locals())
     package_data_dir = os.path.join(os.path.dirname(__file__), "data")
+    use_builtin_ref_answers = False
     if data_dir is None:
+        use_builtin_ref_answers = True
         data_dir = package_data_dir
 
     data_base_dir = bench_dir(data_dir, bench_name, branch)
@@ -172,15 +174,19 @@
     question_file = os.path.join(data_base_dir, "question.jsonl")
     answer_file = os.path.join(output_base_dir, "model_answer", f"{model_name}.jsonl")
-    answer_dir = os.path.dirname(answer_file)
-    ref_answer_dir = os.path.join(data_base_dir, "reference_answer")
+    if use_builtin_ref_answers:
+        ref_answer_file = os.path.join(data_base_dir, "reference_answer", "gpt-4.jsonl")
+    else:
+        ref_answer_file = os.path.join(
+            data_base_dir, "reference_answer", f"{judge_model_name}.jsonl"
+        )
 
     # Load questions
     questions = load_questions(question_file, None, None)
 
     # Load answers
-    model_answers = load_model_answers(answer_dir, answer_file=answer_file)
-    ref_answers = load_model_answers(ref_answer_dir, judge_model_name)
+    model_answers = load_model_answers(answer_file)
+    ref_answers = load_model_answers(ref_answer_file, judge_model_name)
 
     # Load judge
     judge_prompts = load_judge_prompts(judge_file)
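
Note (not part of the patch): a minimal, self-contained sketch of the model-key resolution that the reworked load_model_answers now uses. The helper name resolve_model_key and the example paths are hypothetical, and _load_answers is omitted entirely; only the splitext/override logic mirrors the diff above.

# Sketch only: hypothetical standalone re-implementation of the key logic.
import os

def resolve_model_key(answer_file: str, model_name: str | None = None) -> str:
    # "granite-7b.jsonl" -> "granite-7b"; an explicit model_name wins.
    file_model_name = os.path.splitext(os.path.basename(answer_file))[0]
    return model_name or file_model_name

assert resolve_model_key("model_answer/granite-7b.jsonl") == "granite-7b"
assert resolve_model_key("reference_answer/gpt-4.jsonl", "my-judge") == "my-judge"

Passing model_name overrides the filename-derived key, which is why the judge_model call above can key ref_answers by judge_model_name even when use_builtin_ref_answers points it at the bundled gpt-4.jsonl reference file.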