diff --git a/README.md b/README.md
index da139cf..0e34e5f 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
 # eval
+![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main)
+![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main)
+![Release](https://img.shields.io/github/v/release/instructlab/eval)
+![License](https://img.shields.io/github/license/instructlab/eval)
+
 
 Python library for Evaluation
diff --git a/src/instructlab/eval/evaluator.py b/src/instructlab/eval/evaluator.py
new file mode 100644
index 0000000..3bd51d5
--- /dev/null
+++ b/src/instructlab/eval/evaluator.py
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+
+
+class Evaluator:
+    """
+    Parent class for Evaluators
+
+    Attributes:
+        model_path  Path to the model to be evaluated
+    """
+
+    def __init__(self, model_path: str) -> None:
+        self.model_path = model_path
diff --git a/src/instructlab/eval/exceptions.py b/src/instructlab/eval/exceptions.py
new file mode 100644
index 0000000..caa45b0
--- /dev/null
+++ b/src/instructlab/eval/exceptions.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+
+
+class EvalError(Exception):
+    """
+    Parent class for all instructlab-eval exceptions
+    """
+
+
+class ModelNotFoundError(EvalError):
+    """
+    Exception raised when a model cannot be found
+
+    Attributes:
+        message  error message to be printed on raise
+        model    model that is being operated on
+        path     filepath of model location
+    """
+
+    def __init__(self, path) -> None:
+        super().__init__()
+        self.path = path
+        self.model = path.rsplit("/")[-1]
+        self.message = f"Model {self.model} could not be found at {self.path}"
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
new file mode 100644
index 0000000..820b067
--- /dev/null
+++ b/src/instructlab/eval/mmlu.py
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Local
+from .evaluator import Evaluator
+
+
+class MMLU_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
+
+    Attributes:
+        tasks       list of tasks for MMLU to test the model with
+        few_shots   number of examples
+        batch_size  number of GPUs
+    """
+
+    def __init__(
+        self, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
+    ) -> None:
+        super().__init__(model_path)
+        self.tasks = tasks
+        self.few_shots = few_shots
+        self.batch_size = batch_size
+
+    def run(self) -> tuple:
+        """
+        Runs MMLU evaluation
+
+        Returns:
+            overall_score      MMLU score for the overall model evaluation
+            individual_scores  Individual MMLU score for each task
+        """
+        individual_scores: dict[str, float] = {}
+        overall_score: float = 0.0
+        return overall_score, individual_scores
+
+
+class PR_MMLU_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)
+
+    Attributes:
+        sdg_path    path where all the PR MMLU tasks are stored
+        task        group name that is shared by all the PR MMLU tasks
+        few_shots   number of examples
+        batch_size  number of GPUs
+    """
+
+    def __init__(
+        self,
+        model_path,
+        sdg_path: str,
+        task: str = "mmlu_pr",
+        few_shots: int = 2,
+        batch_size: int = 5,
+    ) -> None:
+        super().__init__(model_path)
+        self.sdg_path = sdg_path
+        self.task = task
+        self.few_shots = few_shots
+        self.batch_size = batch_size
+
+    def run(self) -> tuple:
+        """
+        Runs PR MMLU evaluation
+
+        Returns:
+            overall_score      PR MMLU score for the overall model evaluation
+            individual_scores  Individual PR MMLU scores for each task
+            qa_pairs           Question and answer pairs from the evaluation
+        """
+        individual_scores: dict[str, float] = {}
+        overall_score: float = 0.0
+        qa_pairs: list[tuple] = []
+        return overall_score, individual_scores, qa_pairs
diff --git a/src/instructlab/eval/mtbench.py b/src/instructlab/eval/mtbench.py
new file mode 100644
index 0000000..1d8f586
--- /dev/null
+++ b/src/instructlab/eval/mtbench.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Local
+from .evaluator import Evaluator
+
+
+class MT_Bench_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for Multi-turn Benchmark (MT-Bench)
+
+    Attributes:
+        server_url  vLLM server endpoint
+    """
+
+    def __init__(self, model_path, server_url: str) -> None:
+        super().__init__(model_path)
+        self.server_url = server_url
+
+    def run(self) -> tuple:
+        """
+        Runs MT-Bench evaluation
+
+        Returns:
+            overall_score  MT-Bench score for the overall model evaluation
+            qa_pairs       Question and answer pairs from the evaluation
+        """
+        overall_score: float = 0.0
+        qa_pairs: list[tuple] = []
+        return overall_score, qa_pairs
+
+
+class PR_Bench_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for PR Benchmark (PR-Bench)
+
+    Attributes:
+        server_url  vLLM server endpoint
+        questions   questions to be asked
+    """
+
+    def __init__(self, model_path, server_url: str, questions: str) -> None:
+        super().__init__(model_path)
+        self.server_url = server_url
+        self.questions = questions
+
+    def run(self) -> tuple:
+        """
+        Runs PR-Bench evaluation
+
+        Returns:
+            overall_score  PR-Bench score for the overall model evaluation
+            qa_pairs       Question and answer pairs from the evaluation
+        """
+        overall_score: float = 0.0
+        qa_pairs: list[tuple] = []
+        return overall_score, qa_pairs
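For reviewers, a minimal usage sketch of the evaluator classes added in this diff. The `run()` methods are still stubs that return placeholder scores, so the printed output is illustrative only; the model path, task names, and server URL below are hypothetical example values, not part of this change.

```python
# Minimal sketch, assuming the package is importable as `instructlab.eval`
# (per the file paths in this diff). run() currently returns placeholder values.
from instructlab.eval.mmlu import MMLU_Evaluator
from instructlab.eval.mtbench import MT_Bench_Evaluator

# MMLU: evaluate a local model on a list of tasks
mmlu = MMLU_Evaluator(
    model_path="models/example-model",          # hypothetical path
    tasks=["mmlu_anatomy", "mmlu_astronomy"],   # hypothetical task names
    few_shots=2,
    batch_size=5,
)
overall, per_task = mmlu.run()
print(f"MMLU overall: {overall}, per-task: {per_task}")

# MT-Bench: evaluate a model served behind a vLLM endpoint
mt_bench = MT_Bench_Evaluator(
    model_path="models/example-model",
    server_url="http://localhost:8000/v1",      # hypothetical vLLM endpoint
)
score, qa_pairs = mt_bench.run()
print(f"MT-Bench score: {score} ({len(qa_pairs)} Q&A pairs)")
```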
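Likewise, a sketch of how a caller might use the new `ModelNotFoundError`; the path is made up, and nothing in this diff raises the exception yet.

```python
from instructlab.eval.exceptions import ModelNotFoundError

try:
    # A caller would raise this when the model file is missing (hypothetical path).
    raise ModelNotFoundError("models/example-model")
except ModelNotFoundError as err:
    # Prints: Model example-model could not be found at models/example-model
    print(err.message)
```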