diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 2b35923..820b067 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -9,7 +9,7 @@ class MMLU_Evaluator(Evaluator):
     Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
 
     Attributes:
-        tasks       list of tasks for MMLU to test the model with
+        tasks       list of tasks for MMLU to test the model with
         few_shots   number of examples
         batch_size  number of GPUs
     """
@@ -23,6 +23,13 @@ def __init__(
         self.batch_size = batch_size
 
     def run(self) -> tuple:
+        """
+        Runs MMLU evaluation
+
+        Returns:
+            overall_score       MMLU score for the overall model evaluation
+            individual_scores   Individual MMLU score for each task
+        """
         individual_scores: dict[str, float] = {}
         overall_score: float = 0.0
         return overall_score, individual_scores
@@ -54,6 +61,14 @@ def __init__(
         self.batch_size = batch_size
 
     def run(self) -> tuple:
+        """
+        Runs PR MMLU evaluation
+
+        Returns:
+            overall_score       PR MMLU score for the overall model evaluation
+            individual_scores   Individual PR MMLU scores for each task
+            qa_pairs            Question and answer pairs from the evaluation
+        """
         individual_scores: dict[str, float] = {}
         overall_score: float = 0.0
         qa_pairs: list[tuple] = []
diff --git a/src/instructlab/eval/mtbench.py b/src/instructlab/eval/mtbench.py
index 25469ba..1d8f586 100644
--- a/src/instructlab/eval/mtbench.py
+++ b/src/instructlab/eval/mtbench.py
@@ -17,6 +17,13 @@ def __init__(self, model_path, server_url: str) -> None:
         self.server_url = server_url
 
     def run(self) -> tuple:
+        """
+        Runs MT-Bench evaluation
+
+        Returns:
+            overall_score   MT-Bench score for the overall model evaluation
+            qa_pairs        Question and answer pairs from the evaluation
+        """
         overall_score: float = 0.0
         qa_pairs: list[tuple] = []
         return overall_score, qa_pairs
@@ -37,6 +44,13 @@ def __init__(self, model_path, server_url: str, questions: str) -> None:
         self.questions = questions
 
     def run(self) -> tuple:
+        """
+        Runs PR-Bench evaluation
+
+        Returns:
+            overall_score   MT-Bench score for the overall model evaluation
+            qa_pairs        Question and answer pairs from the evaluation
+        """
         overall_score = 0.0
         qa_pairs: list[tuple] = []
         return overall_score, qa_pairs
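
For context, here is a minimal usage sketch of the interface these docstrings describe. The class name `MMLU_Evaluator` and the `run()` return shape `(overall_score, individual_scores)` come from the diff; the constructor keyword names follow the `Attributes` docstring (`tasks`, `few_shots`, `batch_size`), but the exact `__init__` signature is not shown here, so treat the arguments and task names below as assumptions for illustration only.

```python
# Hypothetical usage sketch -- constructor arguments are assumptions based on
# the Attributes docstring (tasks, few_shots, batch_size); only the run()
# return shape (overall_score, individual_scores) is confirmed by this diff.
from instructlab.eval.mmlu import MMLU_Evaluator

evaluator = MMLU_Evaluator(
    tasks=["mmlu_anatomy", "mmlu_astronomy"],  # assumed task names, for illustration
    few_shots=5,
    batch_size=1,
)
overall_score, individual_scores = evaluator.run()

print(f"overall: {overall_score:.3f}")
for task, score in individual_scores.items():
    print(f"{task}: {score:.3f}")
```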