instructlab · nathan-weinberg · Jun 17, 2024 · Jun 13, 2024 · Jun 14, 2024 · Jun 14, 2024
diff --git a/src/instructlab/eval/evaluator.py b/src/instructlab/eval/evaluator.py
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+
+
+class Evaluator:
+    """
+    Parent class for Evaluators; holds the model path handed to the constructor
+
+    Attributes:
+        model_path   Path to the model to be evaluated
+    """
+
+    def __init__(self, model_path: str) -> None:
+        self.model_path = model_path
diff --git a/src/instructlab/eval/exceptions.py b/src/instructlab/eval/exceptions.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+
+
+class EvalError(Exception):
+    """
+    Parent class for all instructlab-eval exceptions
+    """
+
+
+class ModelNotFoundError(EvalError):
+    """
+    Exception raised when model is not able to be found
+
+    Attributes:
+        model   model that is being operated on (its message is kept in `message`)
+    """
+
+    def __init__(self, model) -> None:
+        self.model = model
+        self.message = f"Model {self.model} could not be found"
+        super().__init__(self.message)  # so str(exc) and tracebacks show the text
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Local
+from .evaluator import Evaluator
+
+
+class MMLU_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
+
+    Attributes:
+        tasks        list of tasks for MMLU to test the model with
+        few_shots    number of examples
+        batch_size   number of GPUs (NOTE(review): confirm; name suggests a batch size)
+    """
+
+    def __init__(
+        self, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
+    ) -> None:
+        super().__init__(model_path)  # parent class receives model_path
+        self.tasks = tasks
+        self.few_shots = few_shots
+        self.batch_size = batch_size
+
+    def run(self) -> tuple:  # returns (overall_score, individual_scores)
+        individual_scores: dict[str, float] = {}  # placeholder: nothing is scored yet
+        overall_score: float = 0.0  # placeholder value
+        return overall_score, individual_scores
+
+
+class PR_MMLU_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)
+
+    Attributes:
+        sdg_path    path where all the PR MMLU tasks are stored
+        task        group name that is shared by all the PR MMLU tasks
+        few_shots   number of examples
+        batch_size  number of GPUs (NOTE(review): confirm; name suggests a batch size)
+    """
+
+    def __init__(
+        self,
+        model_path,
+        sdg_path: str,
+        task: str = "mmlu_pr",
+        few_shots: int = 2,
+        batch_size: int = 5,
+    ) -> None:
+        super().__init__(model_path)  # parent class receives model_path
+        self.sdg_path = sdg_path
+        self.task = task
+        self.few_shots = few_shots
+        self.batch_size = batch_size
+
+    def run(self) -> tuple:  # returns (overall_score, individual_scores, qa_pairs)
+        individual_scores: dict[str, float] = {}  # placeholder: nothing is scored yet
+        overall_score: float = 0.0  # placeholder value
+        qa_pairs: list[tuple] = []  # placeholder: always empty for now
+        return overall_score, individual_scores, qa_pairs
diff --git a/src/instructlab/eval/mtbench.py b/src/instructlab/eval/mtbench.py
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Local
+from .evaluator import Evaluator
+
+
+class MT_Bench_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for Multi-turn Benchmark (MT-Bench)
+
+    Attributes:
+        server_url  vLLM server endpoint
+    """
+
+    def __init__(self, model_path, server_url: str) -> None:
+        super().__init__(model_path)  # parent class receives model_path
+        self.server_url = server_url
+
+    def run(self) -> tuple:  # returns (overall_score, qa_pairs)
+        overall_score: float = 0.0  # placeholder value
+        qa_pairs: list[tuple] = []  # placeholder: always empty for now
+        return overall_score, qa_pairs
+
+
+class PR_Bench_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for PR-Bench Benchmark (PR-Bench)
+
+    Attributes:
+        server_url  vLLM server endpoint
+        questions   questions to be asked
+    """
+
+    def __init__(self, model_path, server_url: str, questions: str) -> None:
+        super().__init__(model_path)  # parent class receives model_path
+        self.server_url = server_url
+        self.questions = questions
+
+    def run(self) -> tuple:  # returns (overall_score, qa_pairs)
+        overall_score = 0.0  # placeholder value
+        qa_pairs: list[tuple] = []  # placeholder: always empty for now
+        return overall_score, qa_pairs