Remove task logic with lm_eval 0.4.4 for agg_score
lm_eval used to return an extra entry corresponding to the task group requested (e.g. mmlu_pr). As of 0.4.4 the entries are the same whether or not the tasks are custom, and the extra entry is removed. The agg score therefore needs to be calculated from the individual task scores returned, which allows the logic to be shared with MMLUEvaluator.

Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Oct 1, 2024
1 parent 40cc370 commit b896d0a
Showing 1 changed file with 31 additions and 64 deletions.
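
A minimal sketch (not part of the commit) of the lm_eval results shape this change adapts to. The task names and scores are made up; only the "acc,none" / "acc_stderr,none" keys and the mean-of-task-scores aggregation mirror the shared run() method in the diff below.

# Hypothetical results dicts; task names and values are illustrative only.

# Before lm_eval 0.4.4: an extra entry for the requested task group (e.g. "mmlu_pr")
# could appear alongside the individual tasks and had to be handled separately.
results_pre = {
    "mmlu_pr": {"acc,none": 0.62, "acc_stderr,none": 0.01},           # group entry
    "mmlu_pr_astronomy": {"acc,none": 0.60, "acc_stderr,none": 0.02},
    "mmlu_pr_anatomy": {"acc,none": 0.64, "acc_stderr,none": 0.02},
}

# As of 0.4.4: only the individual tasks are returned, for custom and built-in
# tasks alike, so the aggregate score is computed from the per-task accuracies.
results_post = {
    "mmlu_pr_astronomy": {"acc,none": 0.60, "acc_stderr,none": 0.02},
    "mmlu_pr_anatomy": {"acc,none": 0.64, "acc_stderr,none": 0.02},
}

individual_scores = {
    task: {"score": float(r["acc,none"]), "stderr": float(r["acc_stderr,none"])}
    for task, r in results_post.items()
}
overall_score = sum(s["score"] for s in individual_scores.values()) / len(results_post)
print(overall_score)  # 0.62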
95 changes: 31 additions & 64 deletions src/instructlab/eval/mmlu.py
@@ -122,6 +122,37 @@ def __init__(
self.batch_size = batch_size
self.device = device

def run(self, server_url: str | None = None) -> tuple:
"""
Runs evaluation
Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
Returns:
overall_score Average score for the task group
individual_scores Individual scores for each task in the task group
"""
logger.debug(locals())

# TODO: make this a parameter for class?
os.environ["TOKENIZERS_PARALLELISM"] = "true"

individual_scores: dict = {}
agg_score: float = 0.0

results = self._run_mmlu(server_url)
for task, result in results.items():
agg_score += float(result["acc,none"])
individual_scores[task] = {
"score": float(result["acc,none"]),
"stderr": float(result["acc_stderr,none"]),
}

overall_score = float(agg_score / len(self.tasks))

return overall_score, individual_scores

def _run_mmlu(self, server_url: str | None = None) -> dict:
if server_url is not None:
# Requires lm_eval >= 0.4.4
@@ -205,36 +236,6 @@ def __init__(
model_path, None, tasks, model_dtype, few_shots, batch_size, device
)

def run(self, server_url: str | None = None) -> tuple:
"""
Runs MMLU evaluation
Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
Returns:
overall_score MMLU score for the overall model evaluation
individual_scores Individual MMLU score for each task
"""
logger.debug(locals())
# TODO: make this a parameter for class?
os.environ["TOKENIZERS_PARALLELISM"] = "true"

individual_scores: dict = {}
agg_score: float = 0.0

results = self._run_mmlu(server_url)

for task in self.tasks:
mmlu_res = results[task]
agg_score += float(mmlu_res["acc,none"])
individual_scores[task] = {}
individual_scores[task]["score"] = float(mmlu_res["acc,none"])
individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])

overall_score = float(agg_score / len(self.tasks))
return overall_score, individual_scores


class MMLUBranchEvaluator(AbstractMMLUEvaluator):
"""
@@ -251,37 +252,3 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
"""

name = "mmlu_branch"

def run(self, server_url: str | None = None) -> tuple:
"""
Runs MMLUBranch evaluation
Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
Returns:
overall_score Average MMLUBranch score for the task group
individual_scores Individual MMLUBranch scores for each task in the task group
"""
logger.debug(locals())

# TODO: make this a parameter for class?
os.environ["TOKENIZERS_PARALLELISM"] = "true"

individual_scores: dict = {}
agg_score: float = 0.0

results = self._run_mmlu(server_url)

for task, result in results.items():
if task in self.tasks:
agg_score += float(result["acc,none"])
else:
individual_scores[task] = {
"score": float(result["acc,none"]),
"stderr": float(result["acc_stderr,none"]),
}

overall_score = float(agg_score / len(self.tasks))

return overall_score, individual_scores
