-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from nathan-weinberg/skeleton
Initial skeleton for Evaluator classes and exceptions
- Loading branch information
Showing
5 changed files
with
173 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,8 @@ | ||
# eval | ||
|
||
![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main) | ||
![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main) | ||
![Release](https://img.shields.io/github/v/release/instructlab/eval) | ||
![License](https://img.shields.io/github/license/instructlab/eval) | ||
|
||
Python library for Evaluation |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
|
||
class Evaluator:
    """
    Parent class for Evaluators

    Attributes:
        model_path  Path to the model to be evaluated
    """

    def __init__(self, model_path: str) -> None:
        # Stored as-is; validation of the path is left to concrete evaluators.
        self.model_path = model_path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
|
||
class EvalError(Exception):
    """
    Parent class for all of instructlab-eval exceptions
    """


class ModelNotFoundError(EvalError):
    """
    Exception raised when model is not able to be found

    Attributes
        message     error message to be printed on raise
        model       model that is being operated on
        path        filepath of model location
    """

    def __init__(self, path: str) -> None:
        self.path = path
        # Only the last path component is the model name; split once from
        # the right instead of splitting the whole string.
        self.model = path.rsplit("/", 1)[-1]
        self.message = f"Model {self.model} could not be found at {self.path}"
        # Pass the message to Exception so str(exc) / tracebacks are
        # informative instead of empty.
        super().__init__(self.message)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# Local | ||
from .evaluator import Evaluator | ||
|
||
|
||
class MMLU_Evaluator(Evaluator):
    """
    Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)

    Attributes:
        tasks       list of tasks for MMLU to test the model with
        few_shots   number of examples
        batch_size  number of GPUs
    """

    def __init__(
        self, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
    ) -> None:
        super().__init__(model_path)
        # Evaluation configuration for this run.
        self.batch_size = batch_size
        self.few_shots = few_shots
        self.tasks = tasks

    def run(self) -> tuple:
        """
        Runs MMLU evaluation

        Returns:
            overall_score      MMLU score for the overall model evaluation
            individual_scores  Individual MMLU score for each task
        """
        # Skeleton implementation: real scoring is not wired up yet, so the
        # result is an empty per-task mapping and a zero overall score.
        per_task_scores: dict[str, float] = {}
        total_score: float = 0.0
        return total_score, per_task_scores
|
||
|
||
class PR_MMLU_Evaluator(Evaluator):
    """
    Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)

    Attributes:
        sdg_path    path where all the PR MMLU tasks are stored
        task        group name that is shared by all the PR MMLU tasks
        few_shots   number of examples
        batch_size  number of GPUs
    """

    def __init__(
        self,
        model_path,
        sdg_path: str,
        task: str = "mmlu_pr",
        few_shots: int = 2,
        batch_size: int = 5,
    ) -> None:
        super().__init__(model_path)
        # Where the generated PR MMLU task files live and the group they share.
        self.sdg_path = sdg_path
        self.task = task
        # Evaluation configuration for this run.
        self.batch_size = batch_size
        self.few_shots = few_shots

    def run(self) -> tuple:
        """
        Runs PR MMLU evaluation

        Returns:
            overall_score      PR MMLU score for the overall model evaluation
            individual_scores  Individual PR MMLU scores for each task
            qa_pairs           Question and answer pairs from the evaluation
        """
        # Skeleton implementation: no scoring logic yet, so return empty
        # placeholder results.
        per_task_scores: dict[str, float] = {}
        collected_qa_pairs: list[tuple] = []
        total_score: float = 0.0
        return total_score, per_task_scores, collected_qa_pairs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# Local | ||
from .evaluator import Evaluator | ||
|
||
|
||
class MT_Bench_Evaluator(Evaluator):
    """
    Child class of an Evaluator for Multi-turn Benchmark (MT-Bench)

    Attributes
        server_url  vLLM server endpoint
    """

    def __init__(self, model_path, server_url: str) -> None:
        super().__init__(model_path)
        # Endpoint of the vLLM server that will serve the model under test.
        self.server_url = server_url

    def run(self) -> tuple:
        """
        Runs MT-Bench evaluation

        Returns:
            overall_score  MT-Bench score for the overall model evaluation
            qa_pairs       Question and answer pairs from the evaluation
        """
        # Skeleton implementation: evaluation is not implemented yet, so
        # return an empty result set.
        collected_qa_pairs: list[tuple] = []
        total_score: float = 0.0
        return total_score, collected_qa_pairs
|
||
|
||
class PR_Bench_Evaluator(Evaluator):
    """
    Child class of an Evaluator for PR-Bench Benchmark (PR-Bench)

    Attributes
        server_url  vLLM server endpoint
        questions   questions to be asked
    """

    def __init__(self, model_path, server_url: str, questions: str) -> None:
        super().__init__(model_path)
        # Endpoint of the vLLM server that will serve the model under test.
        self.server_url = server_url
        self.questions = questions

    def run(self) -> tuple:
        """
        Runs PR-Bench evaluation

        Returns:
            overall_score  PR-Bench score for the overall model evaluation
            qa_pairs       Question and answer pairs from the evaluation
        """
        # Skeleton implementation: scoring is not wired up yet.
        overall_score: float = 0.0
        qa_pairs: list[tuple] = []
        return overall_score, qa_pairs