Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial skeleton for Evaluator classes and exceptions #6

Merged
merged 11 commits into from
Jun 17, 2024
13 changes: 13 additions & 0 deletions src/instructlab/eval/evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# SPDX-License-Identifier: Apache-2.0


class Evaluator:
    """
    Parent class for Evaluators

    Attributes:
        model   The model to be evaluated
    """

    def __init__(self, model: str) -> None:
        # Stored as given; no validation or loading happens here.
        self.model = model
21 changes: 21 additions & 0 deletions src/instructlab/eval/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# SPDX-License-Identifier: Apache-2.0


class EvalError(Exception):
    """
    Parent class for all of instructlab-eval exceptions
    """


class ModelNotFoundError(EvalError):
    """
    Exception raised when model is not able to be found

    Attributes:
        model       model that is being operated on
        message     human-readable description of the failure
    """

    def __init__(self, model) -> None:
        self.model = model
        self.message = f"Model {self.model} could not be found"
        # Hand the message to Exception so str(exc), exc.args and tracebacks
        # carry it; previously the base was initialised with no arguments and
        # the text was only reachable through the .message attribute.
        super().__init__(self.message)
69 changes: 69 additions & 0 deletions src/instructlab/eval/mmlu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0

# Local
from .evaluator import Evaluator


class MMLU_Evaluator(Evaluator):
    """
    Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)

    Attributes:
        tasks       list of tasks for MMLU to test the model with
        fewshots    number of examples
        batchsize   number of GPUs
    """

    def __init__(
        self, model: str, tasks: list[str], fewshots: int = 2, batchsize: int = 5
    ) -> None:
        super().__init__(model)
        self.tasks = tasks
        self.fewshots = fewshots
        self.batchsize = batchsize

    def run(self) -> dict:
        """
        Runs MMLU evaluation

        Skeleton implementation: no evaluation is performed yet, so the
        payload always holds an empty score mapping and an overall score of 0.0.

        Returns:
            payload     dict with keys "individual_scores" (dict of str to
                        float) and "overall_score" (float)
        """
        individual_scores: dict[str, float] = {}
        overall_score: float = 0.0
        payload = {
            "individual_scores": individual_scores,
            "overall_score": overall_score,
        }
        return payload


class PR_MMLU_Evaluator(Evaluator):
    """
    Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)

    Attributes:
        sdg_path    path where all the PR MMLU tasks are stored
        task        group name that is shared by all the PR MMLU tasks
        fewshots    number of examples
        batchsize   number of GPUs
    """

    def __init__(
        self,
        model: str,
        sdg_path: str,
        task: str = "mmlu_pr",
        fewshots: int = 2,
        batchsize: int = 5,
    ) -> None:
        super().__init__(model)
        self.sdg_path = sdg_path
        self.task = task
        self.fewshots = fewshots
        self.batchsize = batchsize

    def run(self) -> dict:
        """
        Runs PR MMLU evaluation

        Skeleton implementation: no evaluation is performed yet, so the
        payload always holds empty containers and an overall score of 0.0.

        Returns:
            payload     dict with keys "individual_scores" (dict of str to
                        float), "overall_score" (float) and "qa_pairs"
                        (list of tuples)
        """
        individual_scores: dict[str, float] = {}
        overall_score: float = 0.0
        qa_pairs: list[tuple] = []
        payload = {
            "individual_scores": individual_scores,
            "overall_score": overall_score,
            "qa_pairs": qa_pairs,
        }
        return payload
44 changes: 44 additions & 0 deletions src/instructlab/eval/mtbench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# SPDX-License-Identifier: Apache-2.0

# Local
from .evaluator import Evaluator


class MT_Bench_Evaluator(Evaluator):
    """
    Child class of an Evaluator for Multi-turn Benchmark (MT-Bench)

    Attributes:
        server_url  vLLM server endpoint
    """

    def __init__(self, model: str, server_url: str) -> None:
        super().__init__(model)
        self.server_url = server_url

    def run(self) -> dict:
        """
        Runs MT-Bench evaluation

        Skeleton implementation: no evaluation is performed yet, so the
        payload always holds an overall score of 0.0 and no question/answer pairs.

        Returns:
            payload     dict with keys "overall_score" (float) and
                        "qa_pairs" (list of tuples)
        """
        overall_score: float = 0.0
        qa_pairs: list[tuple] = []
        payload = {"overall_score": overall_score, "qa_pairs": qa_pairs}
        return payload


class PR_Bench_Evaluator(Evaluator):
    """
    Child class of an Evaluator for PR-Bench Benchmark (PR-Bench)

    Attributes:
        server_url  vLLM server endpoint
        questions   questions to be asked
    """

    def __init__(self, model: str, server_url: str, questions: str) -> None:
        super().__init__(model)
        self.server_url = server_url
        self.questions = questions

    def run(self) -> dict:
        """
        Runs PR-Bench evaluation

        Skeleton implementation: no evaluation is performed yet, so the
        payload always holds an overall score of 0.0 and no question/answer pairs.

        Returns:
            payload     dict with keys "overall_score" (float) and
                        "qa_pairs" (list of tuples)
        """
        # Annotated to match the sibling MT_Bench_Evaluator.run.
        overall_score: float = 0.0
        qa_pairs: list[tuple] = []
        payload = {"overall_score": overall_score, "qa_pairs": qa_pairs}
        return payload