Initial skeleton for Evaluator classes and exceptions #6

Merged: 11 commits, Jun 17, 2024
5 changes: 5 additions & 0 deletions README.md
@@ -1,3 +1,8 @@
# eval

![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main)
![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main)
![Release](https://img.shields.io/github/v/release/instructlab/eval)
![License](https://img.shields.io/github/license/instructlab/eval)

Python library for Evaluation
13 changes: 13 additions & 0 deletions src/instructlab/eval/evaluator.py
@@ -0,0 +1,13 @@
# SPDX-License-Identifier: Apache-2.0


class Evaluator:
"""
Parent class for Evaluators

Attributes:
model_path Path to the model to be evaluated
"""

def __init__(self, model_path: str) -> None:
self.model_path = model_path
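As a usage sketch (not part of this PR): concrete evaluators are expected to subclass Evaluator and pass the model path up to it. The TrivialEvaluator subclass and the model path below are hypothetical, for illustration only.

# Illustrative only: TrivialEvaluator and the model path are hypothetical.
from instructlab.eval.evaluator import Evaluator


class TrivialEvaluator(Evaluator):
    def run(self) -> float:
        # A real evaluator would load the model at self.model_path and score it.
        return 0.0


evaluator = TrivialEvaluator("models/granite-7b-lab")
print(evaluator.model_path)  # models/granite-7b-lab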
24 changes: 24 additions & 0 deletions src/instructlab/eval/exceptions.py
@@ -0,0 +1,24 @@
# SPDX-License-Identifier: Apache-2.0


class EvalError(Exception):
"""
Parent class for all of instructlab-eval exceptions
"""


class ModelNotFoundError(EvalError):
"""
Exception raised when a model cannot be found

Attributes:
message error message to be printed on raise
model model that is being operated on
path filepath of model location
"""

def __init__(self, path: str) -> None:
super().__init__()
self.path = path
self.model = path.rsplit("/", 1)[-1]
self.message = f"Model {self.model} could not be found at {self.path}"
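A minimal sketch of how these exceptions might be raised and caught, assuming a caller that checks the filesystem; find_model and the model path below are hypothetical.

# Illustrative only: find_model and the model path are hypothetical.
import os

from instructlab.eval.exceptions import EvalError, ModelNotFoundError


def find_model(path: str) -> str:
    if not os.path.exists(path):
        raise ModelNotFoundError(path)
    return path


try:
    find_model("models/missing-model")
except EvalError as err:
    # Prints: Model missing-model could not be found at models/missing-model
    print(err.message)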
75 changes: 75 additions & 0 deletions src/instructlab/eval/mmlu.py
@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0

# Local
from .evaluator import Evaluator


class MMLU_Evaluator(Evaluator):
"""
Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)

Attributes:
tasks list of tasks for MMLU to test the model with
few_shots number of examples to include in the few-shot prompt
batch_size number of GPUs
"""

def __init__(
self, model_path: str, tasks: list[str], few_shots: int = 2, batch_size: int = 5
) -> None:
super().__init__(model_path)
self.tasks = tasks
self.few_shots = few_shots
self.batch_size = batch_size

def run(self) -> tuple:
"""
Runs MMLU evaluation

Returns:
overall_score MMLU score for the overall model evaluation
individual_scores Individual MMLU score for each task
"""
individual_scores: dict[str, float] = {}
overall_score: float = 0.0
return overall_score, individual_scores


class PR_MMLU_Evaluator(Evaluator):
"""
Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)

Attributes:
sdg_path path where all the PR MMLU tasks are stored
task group name that is shared by all the PR MMLU tasks
few_shots number of examples to include in the few-shot prompt
batch_size number of GPUs
"""

def __init__(
self,
model_path: str,
sdg_path: str,
task: str = "mmlu_pr",
few_shots: int = 2,
batch_size: int = 5,
) -> None:
super().__init__(model_path)
self.sdg_path = sdg_path
self.task = task
self.few_shots = few_shots
self.batch_size = batch_size

def run(self) -> tuple:
"""
Runs PR MMLU evaluation

Returns:
overall_score PR MMLU score for the overall model evaluation
individual_scores Individual PR MMLU scores for each task
qa_pairs Question and answer pairs from the evaluation
"""
individual_scores: dict[str, float] = {}
overall_score: float = 0.0
qa_pairs: list[tuple] = []
return overall_score, individual_scores, qa_pairs
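A hedged usage sketch for the two MMLU evaluators above; the model path, task list, and SDG path are hypothetical, and both run() methods are still stubs that return zeroed scores.

# Illustrative only: all paths and task names are hypothetical.
from instructlab.eval.mmlu import MMLU_Evaluator, PR_MMLU_Evaluator

mmlu = MMLU_Evaluator("models/granite-7b-lab", tasks=["mmlu_abstract_algebra"])
overall_score, individual_scores = mmlu.run()

pr_mmlu = PR_MMLU_Evaluator("models/granite-7b-lab", sdg_path="generated/")
overall_score, individual_scores, qa_pairs = pr_mmlu.run()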
56 changes: 56 additions & 0 deletions src/instructlab/eval/mtbench.py
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0

# Local
from .evaluator import Evaluator


class MT_Bench_Evaluator(Evaluator):
"""
Child class of an Evaluator for Multi-turn Benchmark (MT-Bench)

Attributes:
server_url vLLM server endpoint
"""

def __init__(self, model_path: str, server_url: str) -> None:
super().__init__(model_path)
self.server_url = server_url

def run(self) -> tuple:
"""
Runs MT-Bench evaluation

Returns:
overall_score MT-Bench score for the overall model evaluation
qa_pairs Question and answer pairs from the evaluation
"""
overall_score: float = 0.0
qa_pairs: list[tuple] = []
return overall_score, qa_pairs


class PR_Bench_Evaluator(Evaluator):
"""
Child class of an Evaluator for the PR Benchmark (PR-Bench)

Attributes:
server_url vLLM server endpoint
questions questions to be asked
"""

def __init__(self, model_path: str, server_url: str, questions: str) -> None:
super().__init__(model_path)
self.server_url = server_url
self.questions = questions

def run(self) -> tuple:
"""
Runs PR-Bench evaluation

Returns:
overall_score PR-Bench score for the overall model evaluation
qa_pairs Question and answer pairs from the evaluation
"""
overall_score: float = 0.0
qa_pairs: list[tuple] = []
return overall_score, qa_pairs
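As with the MMLU classes, a hedged usage sketch; the server URL, model path, and questions value are hypothetical, and both run() stubs currently return zeroed results.

# Illustrative only: the server URL, model path, and questions file are hypothetical.
from instructlab.eval.mtbench import MT_Bench_Evaluator, PR_Bench_Evaluator

mt_bench = MT_Bench_Evaluator(
    "models/granite-7b-lab", server_url="http://localhost:8000/v1"
)
overall_score, qa_pairs = mt_bench.run()

pr_bench = PR_Bench_Evaluator(
    "models/granite-7b-lab",
    server_url="http://localhost:8000/v1",
    questions="questions.jsonl",
)
overall_score, qa_pairs = pr_bench.run()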