diff --git a/requirements.txt b/requirements.txt
index bc02276..eaa9b0c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ transformers
 accelerate
 pandas
 pandas-stubs
+lm-eval>=0.4.2
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index edeea0f..b95f476 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -1,24 +1,39 @@
 # SPDX-License-Identifier: Apache-2.0
 
-# Local
-from .evaluator import Evaluator
+# Standard
+import os
 
+# Third Party
+from lm_eval.evaluator import simple_evaluate  # type: ignore
 
-class MMLU_Evaluator(Evaluator):
+# First Party
+from instructlab.eval.evaluator import Evaluator
+
+
+class MMLUEvaluator(Evaluator):
     """
     Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
 
     Attributes:
+        model_path  absolute path to or name of a huggingface model
         tasks       list of tasks for MMLU to test the model with
+        model_dtype dtype of model when served
         few_shots   number of examples
         batch_size  number of GPUs
     """
 
     def __init__(
-        self, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
+        self,
+        model_path,
+        tasks: list[str],
+        model_dtype="bfloat16",
+        few_shots: int = 2,
+        batch_size: int = 5,
     ) -> None:
+        super().__init__()
         self.model_path = model_path
         self.tasks = tasks
+        self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size
 
@@ -30,18 +45,42 @@ def run(self) -> tuple:
         """
         Runs MMLU evaluation
 
         Returns:
             overall_score       MMLU score for the overall model evaluation
             individual_scores   Individual MMLU score for each task
         """
-        individual_scores: dict[str, float] = {}
-        overall_score: float = 0.0
+        # TODO: make this a parameter for class?
+        os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+        individual_scores: dict = {}
+        agg_score: float = 0.0
+        model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
+
+        mmlu_output = simple_evaluate(
+            model="hf",
+            model_args=model_args,
+            tasks=self.tasks,
+            num_fewshot=self.few_shots,
+            batch_size=self.batch_size,
+        )
+
+        results = mmlu_output["results"]
+
+        for task in self.tasks:
+            mmlu_res = results[task]
+            agg_score += float(mmlu_res["acc,none"])
+            individual_scores[task] = {}
+            individual_scores[task]["score"] = float(mmlu_res["acc,none"])
+            individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])
+
+        overall_score = float(agg_score / len(self.tasks))
 
         return overall_score, individual_scores
 
 
-class PR_MMLU_Evaluator(Evaluator):
+class MMLUBranchEvaluator(Evaluator):
     """
-    Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)
+    Child class of an Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch)
 
     Attributes:
-        sdg_path    path where all the PR MMLU tasks are stored
-        task        group name that is shared by all the PR MMLU tasks
+        model_path  absolute path to or name of a huggingface model
+        sdg_path    path where all the MMLUBranch tasks are stored
+        task        group name that is shared by all the MMLUBranch tasks
         few_shots   number of examples
         batch_size  number of GPUs
     """
@@ -62,11 +101,11 @@ def __init__(
 
     def run(self) -> tuple:
         """
-        Runs PR MMLU evaluation
+        Runs MMLUBranch evaluation
 
         Returns:
-            overall_score       PR MMLU score for the overall model evaluation
-            individual_scores   Individual PR MMLU scores for each task
+            overall_score       MMLUBranch score for the overall model evaluation
+            individual_scores   Individual MMLUBranch scores for each task
             qa_pairs            Question and answer pairs from the evaluation
         """
         individual_scores: dict[str, float] = {}
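For reference, a minimal usage sketch of the MMLUEvaluator class added in this diff. It is not part of the change itself: the model name and the lm-eval task names below are illustrative placeholders, and only the constructor signature and the (overall_score, individual_scores) return shape come from the code above.

# Usage sketch (not part of the diff); model and task names are placeholders.
from instructlab.eval.mmlu import MMLUEvaluator

evaluator = MMLUEvaluator(
    model_path="instructlab/granite-7b-lab",          # placeholder HF model name or local path
    tasks=["mmlu_abstract_algebra", "mmlu_anatomy"],  # placeholder lm-eval MMLU task names
    model_dtype="bfloat16",
    few_shots=2,
    batch_size=5,
)

overall_score, individual_scores = evaluator.run()
print(f"overall MMLU accuracy: {overall_score:.4f}")
for task, scores in individual_scores.items():
    print(f"{task}: acc={scores['score']:.4f} (stderr {scores['stderr']:.4f})")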