From 6db41b1430f54e2305320b60a4a3d7b8b7435435 Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 18 Jun 2024 21:07:55 -0400
Subject: [PATCH 1/7] Introduce lm-evaluation-harness dependency

Signed-off-by: Alina Ryan
---
 requirements.txt             |  1 +
 src/instructlab/eval/mmlu.py | 37 ++++++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index bc02276..eaa9b0c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ transformers
 accelerate
 pandas
 pandas-stubs
+lm-eval>=0.4.2
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index edeea0f..8776aac 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -3,6 +3,8 @@
 # Local
 from .evaluator import Evaluator
 
+# Third Party
+from lm_eval.evaluator import simple_evaluate
 
 class MMLU_Evaluator(Evaluator):
     """
@@ -15,9 +17,11 @@ class MMLU_Evaluator(Evaluator):
     """
 
     def __init__(
-        self, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
+        self, model, model_args, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
     ) -> None:
-        self.model_path = model_path
+        super().__init__(model_path)
+        self.model = model
+        self.model_args = model_args
         self.tasks = tasks
         self.few_shots = few_shots
         self.batch_size = batch_size
@@ -32,8 +36,37 @@ def run(self) -> tuple:
         """
         individual_scores: dict[str, float] = {}
         overall_score: float = 0.0
+        results = lm_eval.simple_evaluate(
+            model=self.model,
+            model_args=self.model_args,
+            tasks=self.tasks,
+            num_fewshot=self.few_shots,
+            batch_size=self.batch_size,
+            log_samples=True,
+        )
+        #TODO: see what the output of results looks like
+        #print(results)
+        #calculate_overall_score(results)
         return overall_score, individual_scores
+
+    def calculate_overall_score(scores):
+        pass  # Placeholder for calculating overall score:
+        # overall score = (num model answered correctly / num questions)
+
+############# Testing Code Follows ##############
+def main():
+    # TODO: change this- cli uses HuggingFace to access the model
+    model = "hf"
+    model_args = "pretrained=$MODEL_PATH,dtype=bfloat16"
+    # Path to the granite model in the aliryan vm on AWS
+    model_path = "/home/ec2-user/instructlab/models/instructlab/granite-7b-lab"
+    #TODO: all 57 tasks need to be parameterized possibly by CLI
+    tasks = "mmlu_abstract_algebra"
+    mmlu = MMLU_Evaluator(model, model_args, model_path, tasks, 2, 5)
+if __name__ == "__main__":
+    main()
+############# Testing Code Ends ##############
 
 
 class PR_MMLU_Evaluator(Evaluator):
     """

From 8e024b80522eae080bbdbc0c5922540c232a146d Mon Sep 17 00:00:00 2001
From: Ali Maredia
Date: Thu, 20 Jun 2024 11:07:41 +0000
Subject: [PATCH 2/7] working MMLU_Evaluator.run()

Replicates functionality in the backend evaluation code for MMLU.
The model that is tested is served by lm-eval code internally.
Signed-off-by: Ali Maredia
---
 src/instructlab/eval/mmlu.py | 56 ++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 8776aac..b06084c 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -1,28 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Local
-from .evaluator import Evaluator
+from instructlab.eval.evaluator import Evaluator
 
 # Third Party
 from lm_eval.evaluator import simple_evaluate
+import os
 
 class MMLU_Evaluator(Evaluator):
     """
     Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
 
     Attributes:
+        model_path   absolute path to or name of a huggingface model
         tasks        list of tasks for MMLU to test the model with
+        model_dtype  dtype of model when served
         few_shots    number of examples
         batch_size   number of GPUs
     """
 
     def __init__(
-        self, model, model_args, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
+        self, model_path, tasks: list[str], model_dtype = 'bfloat16', few_shots: int = 2, batch_size: int = 5
     ) -> None:
         super().__init__(model_path)
-        self.model = model
-        self.model_args = model_args
         self.tasks = tasks
+        self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size
@@ -34,35 +36,45 @@ def run(self) -> tuple:
             overall_score       MMLU score for the overall model evaluation
             individual_scores   Individual MMLU score for each task
         """
-        individual_scores: dict[str, float] = {}
-        overall_score: float = 0.0
-        results = lm_eval.simple_evaluate(
-            model=self.model,
-            model_args=self.model_args,
+        #TODO: make this a parameter for class?
+        os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+
+        individual_scores: dict = {}
+        agg_score: float = 0.0
+        model_args = "pretrained=" + self.model_path + ",dtype=" + self.model_dtype
+
+        mmlu_output = simple_evaluate(
+            model="hf",
+            model_args=model_args,
             tasks=self.tasks,
             num_fewshot=self.few_shots,
-            batch_size=self.batch_size,
-            log_samples=True,
+            batch_size=self.batch_size
         )
-        #TODO: see what the output of results looks like
-        #print(results)
-        #calculate_overall_score(results)
+
+        results = mmlu_output["results"]
+
+        for task in self.tasks:
+            mmlu_res = results[task]
+            agg_score += float(mmlu_res['acc,none'])
+            individual_scores[task] = {}
+            individual_scores[task]['score'] = float(mmlu_res['acc,none'])
+            individual_scores[task]['stderr'] = float(mmlu_res['acc_stderr,none'])
+
+        overall_score = float(agg_score/len(self.tasks))
         return overall_score, individual_scores
 
-    def calculate_overall_score(scores):
-        pass  # Placeholder for calculating overall score:
-        # overall score = (num model answered correctly / num questions)
 
 ############# Testing Code Follows ##############
 def main():
-    # TODO: change this- cli uses HuggingFace to access the model
-    model = "hf"
-    model_args = "pretrained=$MODEL_PATH,dtype=bfloat16"
     # Path to the granite model in the aliryan vm on AWS
     model_path = "/home/ec2-user/instructlab/models/instructlab/granite-7b-lab"
     #TODO: all 57 tasks need to be parameterized possibly by CLI
-    tasks = "mmlu_abstract_algebra"
-    mmlu = MMLU_Evaluator(model, model_args, model_path, tasks, 2, 5)
+    tasks = ["mmlu_abstract_algebra","mmlu_anatomy","mmlu_astronomy"]
+    dtype = "float16"
+    mmlu = MMLU_Evaluator(model_path, tasks, dtype, 2, 5)
+    overall_score, individual_scores = mmlu.run()
+    print(overall_score)
+    print(individual_scores)
 
 if __name__ == "__main__":
     main()

From 05233b4688de4de0ddcd9e248766bd391a23a8a3 Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 11:23:20 -0400
Subject: [PATCH 3/7] Remove testing code and change the MMLU class names and descriptions

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 35 +++++++++--------------------------
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index b06084c..4eb767e 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -7,7 +7,7 @@
 from lm_eval.evaluator import simple_evaluate
 import os
 
-class MMLU_Evaluator(Evaluator):
+class MMLUEvaluator(Evaluator):
     """
     Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
@@ -62,31 +62,14 @@ def run(self) -> tuple:
 
         overall_score = float(agg_score/len(self.tasks))
         return overall_score, individual_scores
-
-
-############# Testing Code Follows ##############
-def main():
-    # Path to the granite model in the aliryan vm on AWS
-    model_path = "/home/ec2-user/instructlab/models/instructlab/granite-7b-lab"
-    #TODO: all 57 tasks need to be parameterized possibly by CLI
-    tasks = ["mmlu_abstract_algebra","mmlu_anatomy","mmlu_astronomy"]
-    dtype = "float16"
-    mmlu = MMLU_Evaluator(model_path, tasks, dtype, 2, 5)
-    overall_score, individual_scores = mmlu.run()
-    print(overall_score)
-    print(individual_scores)
-
-if __name__ == "__main__":
-    main()
-############# Testing Code Ends ##############
-
-class PR_MMLU_Evaluator(Evaluator):
+
+class MMLUBranchEvaluator(Evaluator):
     """
-    Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)
+    Child class of an Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch)
 
     Attributes:
-        sdg_path    path where all the PR MMLU tasks are stored
-        task        group name that is shared by all the PR MMLU tasks
+        sdg_path    path where all the MMLUBranch tasks are stored
+        task        group name that is shared by all the MMLUBranch tasks
         few_shots   number of examples
         batch_size  number of GPUs
     """
@@ -107,11 +90,11 @@ def __init__(
 
     def run(self) -> tuple:
         """
-        Runs PR MMLU evaluation
+        Runs MMLUBranch evaluation
 
         Returns:
-            overall_score       PR MMLU score for the overall model evaluation
-            individual_scores   Individual PR MMLU scores for each task
+            overall_score       MMLUBranch score for the overall model evaluation
+            individual_scores   Individual MMLUBranch scores for each task
             qa_pairs            Question and answer pairs from the evaluation
         """
         individual_scores: dict[str, float] = {}

From 2e607f5a47a6442d74d7ccbeb86b8e6f4eb1f2c2 Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 13:41:48 -0400
Subject: [PATCH 4/7] Add missing model_path param and description

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 4eb767e..244e647 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -27,6 +27,7 @@ def __init__(
         self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size
+        self.model_path = model_path
 
     def run(self) -> tuple:
         """
@@ -68,6 +69,7 @@ class MMLUBranchEvaluator(Evaluator):
     Child class of an Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch)
 
     Attributes:
+        model_path  absolute path to or name of a huggingface model
        sdg_path    path where all the MMLUBranch tasks are stored
        task        group name that is shared by all the MMLUBranch tasks
        few_shots   number of examples
        batch_size  number of GPUs
     """
@@ -82,6 +84,7 @@ def __init__(
         few_shots: int = 2,
         batch_size: int = 5,
     ) -> None:
+        super().__init__()
         self.model_path = model_path
         self.sdg_path = sdg_path
         self.task = task

From 270bf538d49e67919f3ddbe1efd664dd5bd0d23f Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 14:19:29 -0400
Subject: [PATCH 5/7] Fix lint errors

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 38 ++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 244e647..c092364 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
-# Local
-from instructlab.eval.evaluator import Evaluator
+# Standard
+import os
 
-# Third Party
+# Third Party
 from lm_eval.evaluator import simple_evaluate
-import os
+
+# First Party
+from instructlab.eval.evaluator import Evaluator
+
 
 class MMLUEvaluator(Evaluator):
     """
@@ -20,14 +23,19 @@ class MMLUEvaluator(Evaluator):
     """
 
     def __init__(
-        self, model_path, tasks: list[str], model_dtype = 'bfloat16', few_shots: int = 2, batch_size: int = 5
+        self,
+        model_path,
+        tasks: list[str],
+        model_dtype="bfloat16",
+        few_shots: int = 2,
+        batch_size: int = 5,
     ) -> None:
-        super().__init__(model_path)
+        super().__init__()
+        self.model_path = model_path
         self.tasks = tasks
         self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size
-        self.model_path = model_path
 
     def run(self) -> tuple:
         """
@@ -37,8 +45,8 @@ def run(self) -> tuple:
             overall_score       MMLU score for the overall model evaluation
             individual_scores   Individual MMLU score for each task
         """
-        #TODO: make this a parameter for class?
-        os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+        # TODO: make this a parameter for class?
+        os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
         individual_scores: dict = {}
         agg_score: float = 0.0
@@ -49,21 +57,22 @@ def run(self) -> tuple:
             model_args=model_args,
             tasks=self.tasks,
             num_fewshot=self.few_shots,
-            batch_size=self.batch_size
+            batch_size=self.batch_size,
         )
 
         results = mmlu_output["results"]
 
         for task in self.tasks:
             mmlu_res = results[task]
-            agg_score += float(mmlu_res['acc,none'])
+            agg_score += float(mmlu_res["acc,none"])
             individual_scores[task] = {}
-            individual_scores[task]['score'] = float(mmlu_res['acc,none'])
-            individual_scores[task]['stderr'] = float(mmlu_res['acc_stderr,none'])
+            individual_scores[task]["score"] = float(mmlu_res["acc,none"])
+            individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])
 
-        overall_score = float(agg_score/len(self.tasks))
+        overall_score = float(agg_score / len(self.tasks))
         return overall_score, individual_scores
 
+
 class MMLUBranchEvaluator(Evaluator):
     """
     Child class of an Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch)
@@ -84,7 +93,6 @@ def __init__(
         few_shots: int = 2,
         batch_size: int = 5,
     ) -> None:
-        super().__init__()
         self.model_path = model_path
         self.sdg_path = sdg_path
         self.task = task

From 88b2e658252821f155a779d9387d996707644caf Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 16:25:30 -0400
Subject: [PATCH 6/7] suppress import err

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index c092364..7d96855 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -4,7 +4,7 @@
 import os
 
 # Third Party
-from lm_eval.evaluator import simple_evaluate
+from lm_eval.evaluator import simple_evaluate  # type: ignore
 
 # First Party
 from instructlab.eval.evaluator import Evaluator

From 899aaf9f06a7376fef03d02fd802a5174de2fb20 Mon Sep 17 00:00:00 2001
From: Alina Ryan
Date: Tue, 25 Jun 2024 17:07:21 -0400
Subject: [PATCH 7/7] Add fstring

Signed-off-by: Alina Ryan
---
 src/instructlab/eval/mmlu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 7d96855..b95f476 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -50,7 +50,7 @@ def run(self) -> tuple:
 
         individual_scores: dict = {}
         agg_score: float = 0.0
-        model_args = "pretrained=" + self.model_path + ",dtype=" + self.model_dtype
+        model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
 
         mmlu_output = simple_evaluate(
             model="hf",
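
A usage sketch of the evaluator these patches build up (not part of the patch series itself): it assumes the patched instructlab-eval package and lm-eval>=0.4.2 are installed, and the model path and task list below are placeholders, mirroring the testing code removed in PATCH 3/7.

# Usage sketch only -- assumptions: instructlab-eval with the patches above is
# installed, lm-eval>=0.4.2 is available, and model_path points at a
# HuggingFace model id or local checkout (placeholder value below).
from instructlab.eval.mmlu import MMLUEvaluator

model_path = "instructlab/granite-7b-lab"          # placeholder model id/path
tasks = ["mmlu_abstract_algebra", "mmlu_anatomy"]  # placeholder subset of the 57 MMLU tasks

mmlu = MMLUEvaluator(model_path, tasks, model_dtype="bfloat16", few_shots=2, batch_size=5)
overall_score, individual_scores = mmlu.run()

# overall_score is the mean of the per-task "acc,none" accuracies reported by
# lm-eval; individual_scores maps each task name to {"score": ..., "stderr": ...}.
print(overall_score)
print(individual_scores)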