
Commit

Merge pull request #10 from alinaryan/mmlu
Implement `MMLU_Evaluator.run()`
alimaredia authored Jun 25, 2024
2 parents ffe1aa1 + 899aaf9 commit 100c512
Showing 2 changed files with 53 additions and 13 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,3 +9,4 @@ transformers
accelerate
pandas
pandas-stubs
lm-eval>=0.4.2
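
For context, the new dependency provides the `simple_evaluate()` entry point that the MMLU evaluator below wraps. A minimal sketch of calling it directly, assuming `lm-eval>=0.4.2` is installed; the model and task names are illustrative placeholders only:

```python
# Sketch only: exercising the new lm-eval dependency directly.
# "gpt2" and the single MMLU subtask are illustrative placeholders.
from lm_eval.evaluator import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2,dtype=float32",
    tasks=["mmlu_abstract_algebra"],
    num_fewshot=2,
    batch_size=5,
)
# Per-task accuracy is reported under the "acc,none" key, the same key
# that MMLUEvaluator.run() reads in the diff below.
print(results["results"]["mmlu_abstract_algebra"]["acc,none"])
```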
65 changes: 52 additions & 13 deletions src/instructlab/eval/mmlu.py
@@ -1,24 +1,39 @@
# SPDX-License-Identifier: Apache-2.0

# Local
from .evaluator import Evaluator
# Standard
import os

# Third Party
from lm_eval.evaluator import simple_evaluate # type: ignore

class MMLU_Evaluator(Evaluator):
# First Party
from instructlab.eval.evaluator import Evaluator


class MMLUEvaluator(Evaluator):
"""
Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
Attributes:
model_path absolute path to or name of a huggingface model
tasks list of tasks for MMLU to test the model with
model_dtype dtype of model when served
few_shots number of examples
batch_size number of GPUs
"""

def __init__(
self, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
self,
model_path,
tasks: list[str],
model_dtype="bfloat16",
few_shots: int = 2,
batch_size: int = 5,
) -> None:
super().__init__()
self.model_path = model_path
self.tasks = tasks
self.model_dtype = model_dtype
self.few_shots = few_shots
self.batch_size = batch_size

@@ -30,18 +45,42 @@ def run(self) -> tuple:
overall_score MMLU score for the overall model evaluation
individual_scores Individual MMLU score for each task
"""
individual_scores: dict[str, float] = {}
overall_score: float = 0.0
# TODO: make this a parameter for class?
os.environ["TOKENIZERS_PARALLELISM"] = "true"

individual_scores: dict = {}
agg_score: float = 0.0
model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"

mmlu_output = simple_evaluate(
model="hf",
model_args=model_args,
tasks=self.tasks,
num_fewshot=self.few_shots,
batch_size=self.batch_size,
)

results = mmlu_output["results"]

for task in self.tasks:
mmlu_res = results[task]
agg_score += float(mmlu_res["acc,none"])
individual_scores[task] = {}
individual_scores[task]["score"] = float(mmlu_res["acc,none"])
individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])

overall_score = float(agg_score / len(self.tasks))
return overall_score, individual_scores
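
For readers following the change, a hypothetical usage sketch of the renamed `MMLUEvaluator`; the model path and MMLU subtask names are placeholders, not part of this commit:

```python
# Hypothetical usage of MMLUEvaluator as added in this commit.
# The model path and task names below are placeholders.
from instructlab.eval.mmlu import MMLUEvaluator

evaluator = MMLUEvaluator(
    model_path="instructlab/granite-7b-lab",
    tasks=["mmlu_abstract_algebra", "mmlu_anatomy"],
    model_dtype="bfloat16",
    few_shots=2,
    batch_size=5,
)
overall_score, individual_scores = evaluator.run()
print(f"overall MMLU accuracy: {overall_score:.3f}")
for task, scores in individual_scores.items():
    print(f"{task}: {scores['score']:.3f} (stderr {scores['stderr']:.3f})")
```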


class PR_MMLU_Evaluator(Evaluator):
class MMLUBranchEvaluator(Evaluator):
"""
Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)
Child class of an Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch)
Attributes:
sdg_path path where all the PR MMLU tasks are stored
task group name that is shared by all the PR MMLU tasks
model_path absolute path to or name of a huggingface model
sdg_path path where all the MMLUBranch tasks are stored
task group name that is shared by all the MMLUBranch tasks
few_shots number of examples
batch_size number of GPUs
"""
@@ -62,11 +101,11 @@ def __init__(

def run(self) -> tuple:
"""
Runs PR MMLU evaluation
Runs MMLUBranch evaluation
Returns:
overall_score PR MMLU score for the overall model evaluation
individual_scores Individual PR MMLU scores for each task
overall_score MMLUBranch score for the overall model evaluation
individual_scores Individual MMLUBranch scores for each task
qa_pairs Question and answer pairs from the evaluation
"""
individual_scores: dict[str, float] = {}
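
The rest of `MMLUBranchEvaluator.run()` is collapsed in this view, so the sketch below is assembled from the docstring alone: the keyword arguments mirror the documented attributes and the three-value return mirrors the documented returns. The constructor signature is not shown in the diff, so treat the argument names, paths, and task group name as assumptions:

```python
# Hypothetical sketch based only on the MMLUBranchEvaluator docstring;
# argument names and values are assumed, not taken from the diff.
from instructlab.eval.mmlu import MMLUBranchEvaluator

branch_evaluator = MMLUBranchEvaluator(
    model_path="instructlab/granite-7b-lab",   # placeholder model
    sdg_path="/path/to/generated/mmlu_tasks",  # placeholder task directory
    task="mmlu_pr",                            # placeholder task group name
    few_shots=2,
    batch_size=5,
)
overall_score, individual_scores, qa_pairs = branch_evaluator.run()
```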
