Skip to content

Commit

Permalink
feat: allow MMLU to pass system_prompt to lm_eval
Browse files Browse the repository at this point in the history
Signed-off-by: Oleg S <[email protected]>
  • Loading branch information
RobotSail committed Dec 12, 2024
1 parent 4cf3e14 commit a9eed0a
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 8 deletions.
3 changes: 2 additions & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,8 @@ disable=raw-checker-failed,
pointless-statement,
wrong-import-order,
line-too-long,
dangerous-default-value
dangerous-default-value,
too-many-instance-attributes

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.4.2

* Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood; otherwise, it passes the model a barebones prompt.

## 0.4

* Added ability to specify a custom http client to MT-Bench
Expand Down
30 changes: 23 additions & 7 deletions src/instructlab/eval/mmlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ class AbstractMMLUEvaluator(Evaluator):
few_shots number of examples
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times.
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
system_prompt system prompt to be used when applying the chat template
"""

def __init__(
Expand All @@ -113,8 +114,10 @@ def __init__(
few_shots: int = 5,
batch_size: Optional[Union[int, str]] = "auto",
device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
system_prompt: Optional[str] = None,
) -> None:
self.model_path = model_path
self.system_prompt = system_prompt
self.tasks_dir = tasks_dir
self.tasks = tasks
self.model_dtype = model_dtype
Expand Down Expand Up @@ -168,6 +171,7 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
if not os.access(self.tasks_dir, os.R_OK):
raise InvalidTasksDirError(self.tasks_dir)
tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
should_apply_chat_template = self.system_prompt is not None
mmlu_output = self._simple_evaluate_with_error_handling(
model=model,
model_args=model_args,
Expand All @@ -176,6 +180,8 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
batch_size=self.batch_size,
device=self.device,
task_manager=tm,
system_instruction=self.system_prompt,
apply_chat_template=should_apply_chat_template,
)
results = mmlu_output["results"]
return results
Expand Down Expand Up @@ -213,12 +219,13 @@ class MMLUEvaluator(AbstractMMLUEvaluator):
Evaluator for Massive Multitask Language Understanding (MMLU)
Attributes:
model_path absolute path to or name of a huggingface model
tasks list of tasks for MMLU to test the model with
model_dtype dtype of model when served
few_shots number of examples
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
model_path absolute path to or name of a huggingface model
tasks list of tasks for MMLU to test the model with
model_dtype dtype of model when served
few_shots number of examples
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times.
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
system_prompt system prompt to be used when applying the chat template
"""

name = "mmlu"
Expand All @@ -231,9 +238,17 @@ def __init__(
few_shots: int = 5,
batch_size: Optional[Union[int, str]] = "auto",
device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
system_prompt: Optional[str] = None,
) -> None:
super().__init__(
model_path, None, tasks, model_dtype, few_shots, batch_size, device
model_path,
None,
tasks,
model_dtype,
few_shots,
batch_size,
device,
system_prompt=system_prompt,
)


Expand All @@ -243,6 +258,7 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
Attributes:
model_path absolute path to or name of a huggingface model
system_prompt system prompt to be used when applying the chat template
tasks_dir path where the <TASK_NAME>.jsonl and <TASK_NAME>_task.yaml files for the branches being evaluated are stored
tasks group name that is shared by all the MMLUBranch tasks
model_dtype dtype of model when served
Expand Down

0 comments on commit a9eed0a

Please sign in to comment.