From b896d0a3afb95825d94ce05feaf134406716167d Mon Sep 17 00:00:00 2001
From: Dan McPherson
Date: Mon, 30 Sep 2024 20:27:41 -0400
Subject: [PATCH] Remove task logic with lm_eval 0.4.4 for agg_score

lm_eval used to return an extra entry that corresponded to the tasks
requested (Ex: mmlu_pr). As of 0.4.4 the entries are the same whether the
tasks are custom or not, and the extra entry is removed. So the agg score
now needs to be calculated from the individual task scores returned, and
the logic can be shared with MMLUEvaluator.

Signed-off-by: Dan McPherson
---
 src/instructlab/eval/mmlu.py | 95 ++++++++++++------------------------
 1 file changed, 31 insertions(+), 64 deletions(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 8a82322..6530497 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -122,6 +122,37 @@ def __init__(
         self.batch_size = batch_size
         self.device = device
 
+    def run(self, server_url: str | None = None) -> tuple:
+        """
+        Runs evaluation
+
+        Attributes
+            server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
+
+        Returns:
+            overall_score       Average score for the task group
+            individual_scores   Individual scores for each task in the task group
+        """
+        logger.debug(locals())
+
+        # TODO: make this a parameter for class?
+        os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+        individual_scores: dict = {}
+        agg_score: float = 0.0
+
+        results = self._run_mmlu(server_url)
+        for task, result in results.items():
+            agg_score += float(result["acc,none"])
+            individual_scores[task] = {
+                "score": float(result["acc,none"]),
+                "stderr": float(result["acc_stderr,none"]),
+            }
+
+        overall_score = float(agg_score / len(self.tasks))
+
+        return overall_score, individual_scores
+
     def _run_mmlu(self, server_url: str | None = None) -> dict:
         if server_url is not None:
             # Requires lm_eval >= 0.4.4
@@ -205,36 +236,6 @@ def __init__(
             model_path, None, tasks, model_dtype, few_shots, batch_size, device
         )
 
-    def run(self, server_url: str | None = None) -> tuple:
-        """
-        Runs MMLU evaluation
-
-        Attributes
-            server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
-
-        Returns:
-            overall_score       MMLU score for the overall model evaluation
-            individual_scores   Individual MMLU score for each task
-        """
-        logger.debug(locals())
-        # TODO: make this a parameter for class?
-        os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-        individual_scores: dict = {}
-        agg_score: float = 0.0
-
-        results = self._run_mmlu(server_url)
-
-        for task in self.tasks:
-            mmlu_res = results[task]
-            agg_score += float(mmlu_res["acc,none"])
-            individual_scores[task] = {}
-            individual_scores[task]["score"] = float(mmlu_res["acc,none"])
-            individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])
-
-        overall_score = float(agg_score / len(self.tasks))
-        return overall_score, individual_scores
-
 
 class MMLUBranchEvaluator(AbstractMMLUEvaluator):
     """
@@ -251,37 +252,3 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
     """
 
     name = "mmlu_branch"
-
-    def run(self, server_url: str | None = None) -> tuple:
-        """
-        Runs MMLUBranch evaluation
-
-        Attributes
-            server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
-
-        Returns:
-            overall_score       Average MMLUBranch score for the task group
-            individual_scores   Individual MMLUBranch scores for each task in the task group
-        """
-        logger.debug(locals())
-
-        # TODO: make this a parameter for class?
-        os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-        individual_scores: dict = {}
-        agg_score: float = 0.0
-
-        results = self._run_mmlu(server_url)
-
-        for task, result in results.items():
-            if task in self.tasks:
-                agg_score += float(result["acc,none"])
-            else:
-                individual_scores[task] = {
-                    "score": float(result["acc,none"]),
-                    "stderr": float(result["acc_stderr,none"]),
-                }
-
-        overall_score = float(agg_score / len(self.tasks))
-
-        return overall_score, individual_scores
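
A minimal standalone sketch of the aggregation the shared run() now performs:
the results dict below is hypothetical sample data in the "acc,none" /
"acc_stderr,none" shape the diff reads, not real lm_eval output, and the task
names and numbers are made up for illustration.

    # Hypothetical lm_eval-style per-task results (illustrative values only)
    results = {
        "mmlu_abstract_algebra": {"acc,none": 0.31, "acc_stderr,none": 0.046},
        "mmlu_anatomy": {"acc,none": 0.52, "acc_stderr,none": 0.043},
    }
    tasks = list(results)

    individual_scores: dict = {}
    agg_score: float = 0.0

    # Sum per-task accuracy and record each task's score and stderr
    for task, result in results.items():
        agg_score += float(result["acc,none"])
        individual_scores[task] = {
            "score": float(result["acc,none"]),
            "stderr": float(result["acc_stderr,none"]),
        }

    # With no aggregate entry returned by lm_eval >= 0.4.4, the overall score
    # is the mean of the per-task accuracies
    overall_score = agg_score / len(tasks)
    print(overall_score, individual_scores)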