From a9eed0a012d15df3cf801d6c96b1198f3a9f2db1 Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Thu, 12 Dec 2024 04:18:54 +0000 Subject: [PATCH] feat: allow MMLU to pass `system_prompt` to lm_eval Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- .pylintrc | 3 ++- CHANGELOG.md | 4 ++++ src/instructlab/eval/mmlu.py | 30 +++++++++++++++++++++++------- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/.pylintrc b/.pylintrc index 01a605d..14b3eb1 100644 --- a/.pylintrc +++ b/.pylintrc @@ -448,7 +448,8 @@ disable=raw-checker-failed, pointless-statement, wrong-import-order, line-too-long, - dangerous-default-value + dangerous-default-value, + too-many-instance-attributes # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/CHANGELOG.md b/CHANGELOG.md index a897297..755516c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.2 + +* Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood, else it will pass the model a barebones prompt. + ## 0.4 * Added ability to specify a custom http client to MT-Bench diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py index f893b66..8637ad4 100644 --- a/src/instructlab/eval/mmlu.py +++ b/src/instructlab/eval/mmlu.py @@ -102,6 +102,7 @@ class AbstractMMLUEvaluator(Evaluator): few_shots number of examples batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'. device PyTorch device (e.g. 
"cpu" or "cuda:0") for running models + system_prompt system prompt to be used when applying the chat template """ def __init__( @@ -113,8 +114,10 @@ def __init__( few_shots: int = 5, batch_size: Optional[Union[int, str]] = "auto", device: str = ("cuda" if torch.cuda.is_available() else "cpu"), + system_prompt: Optional[str] = None, ) -> None: self.model_path = model_path + self.system_prompt = system_prompt self.tasks_dir = tasks_dir self.tasks = tasks self.model_dtype = model_dtype @@ -168,6 +171,7 @@ def _run_mmlu(self, server_url: str | None = None) -> dict: if not os.access(self.tasks_dir, os.R_OK): raise InvalidTasksDirError(self.tasks_dir) tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir) + should_apply_chat_template = self.system_prompt is not None mmlu_output = self._simple_evaluate_with_error_handling( model=model, model_args=model_args, @@ -176,6 +180,8 @@ def _run_mmlu(self, server_url: str | None = None) -> dict: batch_size=self.batch_size, device=self.device, task_manager=tm, + system_instruction=self.system_prompt, + apply_chat_template=should_apply_chat_template, ) results = mmlu_output["results"] return results @@ -213,12 +219,13 @@ class MMLUEvaluator(AbstractMMLUEvaluator): Evaluator for Massive Multitask Language Understanding (MMLU) Attributes: - model_path absolute path to or name of a huggingface model - tasks list of tasks for MMLU to test the model with - model_dtype dtype of model when served - few_shots number of examples - batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'. - device PyTorch device (e.g. "cpu" or "cuda:0") for running models + model_path absolute path to or name of a huggingface model + tasks list of tasks for MMLU to test the model with + model_dtype dtype of model when served + few_shots number of examples + batch_size batch size for evaluation. 
Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times. + device PyTorch device (e.g. "cpu" or "cuda:0") for running models + system_prompt system prompt to be used when applying the chat template """ name = "mmlu" @@ -231,9 +238,17 @@ def __init__( few_shots: int = 5, batch_size: Optional[Union[int, str]] = "auto", device: str = ("cuda" if torch.cuda.is_available() else "cpu"), + system_prompt: Optional[str] = None, ) -> None: super().__init__( - model_path, None, tasks, model_dtype, few_shots, batch_size, device + model_path, + None, + tasks, + model_dtype, + few_shots, + batch_size, + device, + system_prompt=system_prompt, ) @@ -243,6 +258,7 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator): Attributes: model_path absolute path to or name of a huggingface model + system_prompt system prompt to be used when applying the chat template tasks_dir path where the .jsonl and _task.yaml files for the branches being evaluated are stored tasks group name that is shared by all the MMLUBranch tasks model_dtype dtype of model when served