From 6db41b1430f54e2305320b60a4a3d7b8b7435435 Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 18 Jun 2024 21:07:55 -0400
Subject: [PATCH 1/7] Introduce lm-evaluation-harness dependency

Signed-off-by: Alina Ryan
---
 requirements.txt             |  1 +
 src/instructlab/eval/mmlu.py | 37 ++++++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index bc02276..eaa9b0c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ transformers
 accelerate
 pandas
 pandas-stubs
+lm-eval>=0.4.2
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index edeea0f..8776aac 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -3,6 +3,8 @@
 # Local
 from .evaluator import Evaluator
 
+# Third Party
+from lm_eval.evaluator import simple_evaluate
 
 class MMLU_Evaluator(Evaluator):
     """
@@ -15,9 +17,11 @@ class MMLU_Evaluator(Evaluator):
     """
 
     def __init__(
-        self, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
+        self, model, model_args, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
     ) -> None:
-        self.model_path = model_path
+        super().__init__(model_path)
+        self.model = model
+        self.model_args = model_args
         self.tasks = tasks
         self.few_shots = few_shots
         self.batch_size = batch_size
@@ -32,8 +36,37 @@ def run(self) -> tuple:
         """
         individual_scores: dict[str, float] = {}
         overall_score: float = 0.0
+        results = lm_eval.simple_evaluate(
+            model=self.model,
+            model_args=self.model_args,
+            tasks=self.tasks,
+            num_fewshot=self.few_shots,
+            batch_size=self.batch_size,
+            log_samples=True,
+        )
+        #TODO: see what the output of results looks like
+        #print(results)
+        #calculate_overall_score(results)
         return overall_score, individual_scores
+
+    def calculate_overall_score(scores):
+        pass  # Placeholder for calculating overall score:
+        # overall score = (num model answered correctly / num questions)
+
+############# Testing Code Follows ##############
+def main():
+    # TODO: change this- cli uses HuggingFace to access the model
+    model = "hf"
+    model_args = "pretrained=$MODEL_PATH,dtype=bfloat16"
+    # Path to the granite model in the aliryan vm on AWS
+    model_path = "/home/ec2-user/instructlab/models/instructlab/granite-7b-lab"
+    #TODO: all 57 tasks need to be parameterized possibly by CLI
+    tasks = "mmlu_abstract_algebra"
+    mmlu = MMLU_Evaluator(model, model_args, model_path, tasks, 2, 5)
+if __name__ == "__main__":
+    main()
+############# Testing Code Ends ##############
 
 
 class PR_MMLU_Evaluator(Evaluator):
     """

From 8e024b80522eae080bbdbc0c5922540c232a146d Mon Sep 17 00:00:00 2001
From: Ali Maredia
Date: Thu, 20 Jun 2024 11:07:41 +0000
Subject: [PATCH 2/7] working MMLU_Evaluator.run()

Replicates functionality in the backend evaluation code for MMLU.
The model that is tested is served by lm-eval code internally.
Signed-off-by: Ali Maredia
---
 src/instructlab/eval/mmlu.py | 56 ++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 8776aac..b06084c 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -1,28 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Local
-from .evaluator import Evaluator
+from instructlab.eval.evaluator import Evaluator
 
 # Third Party
 from lm_eval.evaluator import simple_evaluate
+import os
 
 class MMLU_Evaluator(Evaluator):
     """
     Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
 
     Attributes:
+        model_path   absolute path to or name of a huggingface model
         tasks        list of tasks for MMLU to test the model with
+        model_dtype  dtype of model when served
         few_shots    number of examples
         batch_size   number of GPUs
     """
 
     def __init__(
-        self, model, model_args, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
+        self, model_path, tasks: list[str], model_dtype = 'bfloat16', few_shots: int = 2, batch_size: int = 5
     ) -> None:
         super().__init__(model_path)
-        self.model = model
-        self.model_args = model_args
         self.tasks = tasks
+        self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size
@@ -34,35 +36,45 @@ def run(self) -> tuple:
             overall_score       MMLU score for the overall model evaluation
             individual_scores   Individual MMLU score for each task
         """
-        individual_scores: dict[str, float] = {}
-        overall_score: float = 0.0
-        results = lm_eval.simple_evaluate(
-            model=self.model,
-            model_args=self.model_args,
+        #TODO: make this a parameter for class?
+        os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+
+        individual_scores: dict = {}
+        agg_score: float = 0.0
+        model_args = "pretrained=" + self.model_path + ",dtype=" + self.model_dtype
+
+        mmlu_output = simple_evaluate(
+            model="hf",
+            model_args=model_args,
             tasks=self.tasks,
             num_fewshot=self.few_shots,
-            batch_size=self.batch_size,
-            log_samples=True,
+            batch_size=self.batch_size
         )
-        #TODO: see what the output of results looks like
-        #print(results)
-        #calculate_overall_score(results)
+
+        results = mmlu_output["results"]
+
+        for task in self.tasks:
+            mmlu_res = results[task]
+            agg_score += float(mmlu_res['acc,none'])
+            individual_scores[task] = {}
+            individual_scores[task]['score'] = float(mmlu_res['acc,none'])
+            individual_scores[task]['stderr'] = float(mmlu_res['acc_stderr,none'])
+
+        overall_score = float(agg_score/len(self.tasks))
         return overall_score, individual_scores
 
-    def calculate_overall_score(scores):
-        pass  # Placeholder for calculating overall score:
-        # overall score = (num model answered correctly / num questions)
 
 ############# Testing Code Follows ##############
 def main():
-    # TODO: change this- cli uses HuggingFace to access the model
-    model = "hf"
-    model_args = "pretrained=$MODEL_PATH,dtype=bfloat16"
     # Path to the granite model in the aliryan vm on AWS
     model_path = "/home/ec2-user/instructlab/models/instructlab/granite-7b-lab"
     #TODO: all 57 tasks need to be parameterized possibly by CLI
-    tasks = "mmlu_abstract_algebra"
-    mmlu = MMLU_Evaluator(model, model_args, model_path, tasks, 2, 5)
+    tasks = ["mmlu_abstract_algebra","mmlu_anatomy","mmlu_astronomy"]
+    dtype = "float16"
+    mmlu = MMLU_Evaluator(model_path, tasks, dtype, 2, 5)
+    overall_score, individual_scores = mmlu.run()
+    print(overall_score)
+    print(individual_scores)
 
 if __name__ == "__main__":
     main()

From 05233b4688de4de0ddcd9e248766bd391a23a8a3 Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 11:23:20 -0400
Subject: [PATCH 3/7] Remove testing code and change the MMLU class names and descriptions

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 35 +++++++++--------------------------
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index b06084c..4eb767e 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -7,7 +7,7 @@
 from lm_eval.evaluator import simple_evaluate
 import os
 
-class MMLU_Evaluator(Evaluator):
+class MMLUEvaluator(Evaluator):
     """
     Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
@@ -62,31 +62,14 @@ def run(self) -> tuple:
 
         overall_score = float(agg_score/len(self.tasks))
         return overall_score, individual_scores
-
-
-############# Testing Code Follows ##############
-def main():
-    # Path to the granite model in the aliryan vm on AWS
-    model_path = "/home/ec2-user/instructlab/models/instructlab/granite-7b-lab"
-    #TODO: all 57 tasks need to be parameterized possibly by CLI
-    tasks = ["mmlu_abstract_algebra","mmlu_anatomy","mmlu_astronomy"]
-    dtype = "float16"
-    mmlu = MMLU_Evaluator(model_path, tasks, dtype, 2, 5)
-    overall_score, individual_scores = mmlu.run()
-    print(overall_score)
-    print(individual_scores)
-
-if __name__ == "__main__":
-    main()
-############# Testing Code Ends ##############
-
-class PR_MMLU_Evaluator(Evaluator):
+
+class MMLUBranchEvaluator(Evaluator):
     """
-    Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)
+    Child class of an Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch)
 
     Attributes:
-        sdg_path    path where all the PR MMLU tasks are stored
-        task        group name that is shared by all the PR MMLU tasks
+        sdg_path    path where all the MMLUBranch tasks are stored
+        task        group name that is shared by all the MMLUBranch tasks
         few_shots   number of examples
         batch_size  number of GPUs
     """
@@ -107,11 +90,11 @@ def __init__(
 
     def run(self) -> tuple:
         """
-        Runs PR MMLU evaluation
+        Runs MMLUBranch evaluation
 
         Returns:
-            overall_score       PR MMLU score for the overall model evaluation
-            individual_scores   Individual PR MMLU scores for each task
+            overall_score       MMLUBranch score for the overall model evaluation
+            individual_scores   Individual MMLUBranch scores for each task
             qa_pairs            Question and answer pairs from the evaluation
         """
         individual_scores: dict[str, float] = {}

From 2e607f5a47a6442d74d7ccbeb86b8e6f4eb1f2c2 Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 13:41:48 -0400
Subject: [PATCH 4/7] Add missing model_path param and description

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 4eb767e..244e647 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -27,6 +27,7 @@ def __init__(
         self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size
+        self.model_path = model_path
 
     def run(self) -> tuple:
         """
@@ -68,6 +69,7 @@ class MMLUBranchEvaluator(Evaluator):
     Child class of an Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch)
 
     Attributes:
+        model_path  absolute path to or name of a huggingface model
        sdg_path    path where all the MMLUBranch tasks are stored
        task        group name that is shared by all the MMLUBranch tasks
        few_shots   number of examples
        batch_size  number of GPUs
     """
@@ -82,6 +84,7 @@ def __init__(
         few_shots: int = 2,
         batch_size: int = 5,
     ) -> None:
+        super().__init__()
         self.model_path = model_path
         self.sdg_path = sdg_path
         self.task = task

From 270bf538d49e67919f3ddbe1efd664dd5bd0d23f Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 14:19:29 -0400
Subject: [PATCH 5/7] Fix lint errors

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 38 ++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 244e647..c092364 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
-# Local
-from instructlab.eval.evaluator import Evaluator
+# Standard
+import os
 
-# Third Party
+# Third Party
 from lm_eval.evaluator import simple_evaluate
-import os
+
+# First Party
+from instructlab.eval.evaluator import Evaluator
+
 
 class MMLUEvaluator(Evaluator):
     """
@@ -20,14 +23,19 @@ class MMLUEvaluator(Evaluator):
     """
 
     def __init__(
-        self, model_path, tasks: list[str], model_dtype = 'bfloat16', few_shots: int = 2, batch_size: int = 5
+        self,
+        model_path,
+        tasks: list[str],
+        model_dtype="bfloat16",
+        few_shots: int = 2,
+        batch_size: int = 5,
     ) -> None:
-        super().__init__(model_path)
+        super().__init__()
+        self.model_path = model_path
         self.tasks = tasks
         self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size
-        self.model_path = model_path
 
     def run(self) -> tuple:
         """
@@ -37,8 +45,8 @@ def run(self) -> tuple:
             overall_score       MMLU score for the overall model evaluation
             individual_scores   Individual MMLU score for each task
         """
-        #TODO: make this a parameter for class?
-        os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+        # TODO: make this a parameter for class?
+        os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
         individual_scores: dict = {}
         agg_score: float = 0.0
@@ -49,21 +57,22 @@ def run(self) -> tuple:
             model_args=model_args,
             tasks=self.tasks,
             num_fewshot=self.few_shots,
-            batch_size=self.batch_size
+            batch_size=self.batch_size,
         )
 
         results = mmlu_output["results"]
 
         for task in self.tasks:
             mmlu_res = results[task]
-            agg_score += float(mmlu_res['acc,none'])
+            agg_score += float(mmlu_res["acc,none"])
             individual_scores[task] = {}
-            individual_scores[task]['score'] = float(mmlu_res['acc,none'])
-            individual_scores[task]['stderr'] = float(mmlu_res['acc_stderr,none'])
+            individual_scores[task]["score"] = float(mmlu_res["acc,none"])
+            individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])
 
-        overall_score = float(agg_score/len(self.tasks))
+        overall_score = float(agg_score / len(self.tasks))
         return overall_score, individual_scores
 
+
 class MMLUBranchEvaluator(Evaluator):
     """
     Child class of an Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch)
@@ -84,7 +93,6 @@ def __init__(
         few_shots: int = 2,
         batch_size: int = 5,
     ) -> None:
-        super().__init__()
         self.model_path = model_path
         self.sdg_path = sdg_path
         self.task = task

From 88b2e658252821f155a779d9387d996707644caf Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 16:25:30 -0400
Subject: [PATCH 6/7] suppress import err

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index c092364..7d96855 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -4,7 +4,7 @@
 import os
 
 # Third Party
-from lm_eval.evaluator import simple_evaluate
+from lm_eval.evaluator import simple_evaluate  # type: ignore
 
 # First Party
 from instructlab.eval.evaluator import Evaluator

From 899aaf9f06a7376fef03d02fd802a5174de2fb20 Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 17:07:21 -0400
Subject: [PATCH 7/7] Add fstring

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 7d96855..b95f476 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -50,7 +50,7 @@ def run(self) -> tuple:
 
         individual_scores: dict = {}
         agg_score: float = 0.0
-        model_args = "pretrained=" + self.model_path + ",dtype=" + self.model_dtype
+        model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
 
         mmlu_output = simple_evaluate(
             model="hf",
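
A usage sketch of the evaluator these patches build up (not part of the patch series itself): it assumes the patched instructlab-eval package and lm-eval>=0.4.2 are installed, and the model path and task list below are placeholders, mirroring the testing code removed in PATCH 3/7.

# Usage sketch only -- assumptions: instructlab-eval with the patches above is
# installed, lm-eval>=0.4.2 is available, and model_path points at a
# HuggingFace model id or local checkout (placeholder value below).
from instructlab.eval.mmlu import MMLUEvaluator

model_path = "instructlab/granite-7b-lab"          # placeholder model id/path
tasks = ["mmlu_abstract_algebra", "mmlu_anatomy"]  # placeholder subset of the 57 MMLU tasks

mmlu = MMLUEvaluator(model_path, tasks, model_dtype="bfloat16", few_shots=2, batch_size=5)
overall_score, individual_scores = mmlu.run()

# overall_score is the mean of the per-task "acc,none" accuracies reported by
# lm-eval; individual_scores maps each task name to {"score": ..., "stderr": ...}.
print(overall_score)
print(individual_scores)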