From 3443ffacb8cf0183bb1b7ee0036f656afe6e167b Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Fri, 6 Dec 2024 11:22:15 -0500
Subject: [PATCH 1/6] adds basic ragas eval

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 requirements.txt              |  1 +
 src/instructlab/eval/ragas.py | 80 +++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 src/instructlab/eval/ragas.py

diff --git a/requirements.txt b/requirements.txt
index a3e6e7d..0853899 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ pandas
 pandas-stubs
 lm-eval>=0.4.4
 httpx
+ragas
diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py
new file mode 100644
index 0000000..3579289
--- /dev/null
+++ b/src/instructlab/eval/ragas.py
@@ -0,0 +1,80 @@
+# Standard
+from typing import List, TypedDict
+
+# Third Party
+from langchain_community.chat_models import ChatOpenAI
+from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
+from ragas.metrics import RubricsScore
+from ragas.metrics._domain_specific_rubrics import DEFAULT_WITH_REFERENCE_RUBRICS
+
+# Local
+from .evaluator import Evaluator
+
+
+class Sample(TypedDict):
+    # question
+    user_input: str
+
+    # model answer
+    response: str
+
+    # golden answer
+    reference: str
+
+
+class RagasEvaluator(Evaluator):
+    # most basic implementation, we just assume that the user will bring the existing model responses
+    name = "ragas"
+
+    def __init__(self):
+        pass
+
+    def run(
+        self, dataset: List[Sample], run_config: RunConfig | None = None
+    ) -> EvaluationResult:
+        """
+        Evaluates the quality of model responses against a graded rubric.
+
+        Args:
+            dataset (List[Sample]):
+                List of model questions and answers
+            run_config (RunConfig | None, optional):
+                Configuration to use when running evaluations. If none is provided, then
+                a default one is created containing extremely permissive settings when handling
+                timeouts. This is because by default, OpenAI tier-1 usage accounts have very low
+                rate limits resulting in heavy throttling during evaluations.
+
+        Returns:
+            EvaluationResult: The results of all evaluations performed by Ragas
+        """
+        if not run_config:
+            # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
+            # are horrible and will result in half of our evaluation results being NaN or 0
+            run_config = RunConfig(
+                max_retries=120,
+                max_wait=7200,
+                seed=42,
+                timeout=3600,
+            )
+
+        # we will be using gpt-4o for the foreseeable future, we hardcode this
+        # for consistency of answers
+        input_ds = EvaluationDataset.from_list(dataset)
+
+        # default set of metrics
+        metrics = [
+            RubricsScore(
+                rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
+            )
+        ]
+
+        critic_lm = ChatOpenAI(model="gpt-4o")
+        results = evaluate(
+            dataset=input_ds,
+            batch_size=4,
+            run_config=run_config,
+            llm=critic_lm,
+            metrics=metrics,
+            show_progress=True,
+        )
+        return results

From 8568b139df0601b3feff563c0e85d3fecfda4c14 Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Fri, 6 Dec 2024 17:07:15 -0500
Subject: [PATCH 2/6] feat: add ability for ragas to read from a file

We want ragas to be able to read from both a file path as well as a list of samples.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 src/instructlab/eval/ragas.py | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py
index 3579289..91686d3 100644
--- a/src/instructlab/eval/ragas.py
+++ b/src/instructlab/eval/ragas.py
@@ -1,11 +1,15 @@
 # Standard
+from pathlib import Path
 from typing import List, TypedDict
 
 # Third Party
 from langchain_community.chat_models import ChatOpenAI
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
-from ragas.metrics import RubricsScore
-from ragas.metrics._domain_specific_rubrics import DEFAULT_WITH_REFERENCE_RUBRICS
+from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
+    DEFAULT_WITH_REFERENCE_RUBRICS,
+    RubricsScore,
+)
+import pandas as pd
 
 # Local
 from .evaluator import Evaluator
@@ -30,13 +34,13 @@ def __init__(self):
         pass
 
     def run(
-        self, dataset: List[Sample], run_config: RunConfig | None = None
+        self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.
 
         Args:
-            dataset (List[Sample]):
+            dataset (List[Sample] | Path):
                 List of model questions and answers
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations.
If none is provided, then @@ -47,6 +51,19 @@ def run( Returns: EvaluationResult: The results of all evaluations performed by Ragas """ + if not dataset: + raise ValueError( + "no dataset was provided, please specify the `dataset` argument" + ) + if isinstance(dataset, Path): + input_ds = EvaluationDataset.from_pandas( + pd.read_json(dataset, lines=True, orient="records") + ) + elif isinstance(dataset, list): + input_ds = EvaluationDataset.from_list(dataset) + else: + raise TypeError(f"invalid type passed for dataset: {type(dataset)}") + if not run_config: # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits # are horrible and will result in half of our evaluation results being NaN or 0 @@ -57,10 +74,6 @@ def run( timeout=3600, ) - # we will be using gpt-4o for the foreseeable future, we hardcode this - # for consistency of answers - input_ds = EvaluationDataset.from_list(dataset) - # default set of metrics metrics = [ RubricsScore( @@ -68,6 +81,8 @@ def run( ) ] + # we will be using gpt-4o for the foreseeable future, we hardcode this + # for consistency of answers critic_lm = ChatOpenAI(model="gpt-4o") results = evaluate( dataset=input_ds, From 58880c359025614ecab7cd209eeacf6968e7b465 Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Fri, 6 Dec 2024 22:20:58 -0500 Subject: [PATCH 3/6] feat: add ability for answers to be generated from user questions When a dataset is provided and is missing the `response` field, we will need to generate these responses. This commit ensures that when this case happens, we will error out when a student model is not configured. Otherwise, we will always generate these responses if the student model exists, regardless if `response` is in the dataframe or not. Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/eval/ragas.py | 166 +++++++++++++++++++++++++++++----- 1 file changed, 143 insertions(+), 23 deletions(-) diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py index 91686d3..df7943c 100644 --- a/src/instructlab/eval/ragas.py +++ b/src/instructlab/eval/ragas.py @@ -1,47 +1,114 @@ # Standard from pathlib import Path -from typing import List, TypedDict +from typing import List, Optional, TypedDict # Third Party from langchain_community.chat_models import ChatOpenAI +from pandas import DataFrame, read_json +from pydantic import BaseModel, ConfigDict, field_validator from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate +from ragas.metrics import Metric from ragas.metrics._domain_specific_rubrics import ( # the rubrics we must instantiate are located inside of a file marked as private DEFAULT_WITH_REFERENCE_RUBRICS, RubricsScore, ) -import pandas as pd # Local from .evaluator import Evaluator +from .mt_bench_common import get_openai_client class Sample(TypedDict): + """ + TypedDict of a sample that we accept when doing eval with Ragas. + We specifically use TypedDict here to be flexible with the input data we accept. + """ + # question user_input: str # model answer - response: str + response: Optional[str] # golden answer reference: str +# default system prompt we'll use when none is provided. Make it private as we don't intend this to be a public object +_DEFAULT_SYSTEM_PROMPT = """You are an advanced AI assistant designed to provide precise and accurate information. +Your primary goal is to answer queries with the most up-to-date and factual information available. 
+Focus on delivering clear, concise, and correct responses. +If you're uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can. +Your responses should prioritize accuracy over all other considerations.""" + +DEFAULT_SEED = 1337 +DEFAULT_JUDGE_MODEL = "gpt-4o" + + +class ModelConfig(BaseModel): + model_config = ConfigDict(protected_namespaces=()) + + # URL of the OpenAI server where the model shall be hosted. + base_url: str + + # name of the model to use. + model_name: str + + # The system prompt to be used when applying the chat template. + system_prompt: str = _DEFAULT_SYSTEM_PROMPT + + # We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client) + # To provide an OpenAI key, you must set it here; else the default is used. + api_key: str = "no-api-key" + + # "model randomness" aka likelihood of sampling something other than the likeliest token + temperature: float = 0.0 + + # Max amount of tokens to generate. + max_tokens: int = 768 + + # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes. + seed: int = DEFAULT_SEED + + @field_validator("temperature") + @classmethod + def check_temperature(cls, v: float) -> float: + if not 0.0 <= v <= 1.0: + raise ValueError("temperature must be between 0.0 and 1.0") + return v + + class RagasEvaluator(Evaluator): # most basic implementation, we just assume that the user will bring the existing model responses name = "ragas" - def __init__(self): - pass + def __init__( + self, + student_model: ModelConfig | None = None, + run_config: RunConfig | None = None, + ): + self.student_model = student_model + self.run_config = run_config def run( - self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None + self, + dataset: List[Sample] | Path, + student_model: ModelConfig | None = None, + run_config: RunConfig | None = None, ) -> EvaluationResult: """ Evaluates the quality of model responses against a graded rubric. + When the `dataset` lacks the `response` field, then `student_model` must be provided + in order to generate the answers. + Args: dataset (List[Sample] | Path): - List of model questions and answers + Can be either a list of `Sample` objects or a path to a jsonl file containing + records matching `Sample`. + student_model: (StudentModelConfig): + When this parameter is provided, we'll attempt to use the described model in order to + generate the responses from the given list of questions. run_config (RunConfig | None, optional): Configuration to use when running evaluations. 
If none is provided, then a default one is created containing extremely permissive settings when handling @@ -51,18 +118,36 @@ def run( Returns: EvaluationResult: The results of all evaluations performed by Ragas """ + student_model = student_model if student_model else self.student_model + run_config = run_config if run_config else self.run_config + if not dataset: raise ValueError( "no dataset was provided, please specify the `dataset` argument" ) - if isinstance(dataset, Path): - input_ds = EvaluationDataset.from_pandas( - pd.read_json(dataset, lines=True, orient="records") + + if type(dataset) not in (list, Path): + raise TypeError(f"invalid type of dataset: {type(dataset)}") + + # ensure we are in the dataframe format + input_df = None + if isinstance(dataset, list): + input_df = DataFrame(dataset) + elif isinstance(dataset, Path): + input_df = read_json(dataset, orient="records", lines=True) + + # this should never happen, but pylint is not smart enough to detect it + assert input_df is not None + + need_to_generate_questions = "response" not in input_df.columns + if need_to_generate_questions and not student_model: + raise ValueError( + "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference" ) - elif isinstance(dataset, list): - input_ds = EvaluationDataset.from_list(dataset) - else: - raise TypeError(f"invalid type passed for dataset: {type(dataset)}") + + # if the student model was provided then we always generate regardless + if student_model: + input_df = self._generate_answers_from_model(input_df, student_model) if not run_config: # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits @@ -70,22 +155,18 @@ def run( run_config = RunConfig( max_retries=120, max_wait=7200, - seed=42, + seed=DEFAULT_SEED, timeout=3600, ) - # default set of metrics - metrics = [ - RubricsScore( - rubrics=DEFAULT_WITH_REFERENCE_RUBRICS, - ) - ] + metrics = self._get_metrics() + evaluation_ds = EvaluationDataset.from_pandas(input_df) # we will be using gpt-4o for the foreseeable future, we hardcode this # for consistency of answers - critic_lm = ChatOpenAI(model="gpt-4o") + critic_lm = ChatOpenAI(model=DEFAULT_JUDGE_MODEL) results = evaluate( - dataset=input_ds, + dataset=evaluation_ds, batch_size=4, run_config=run_config, llm=critic_lm, @@ -93,3 +174,42 @@ def run( show_progress=True, ) return results + + def _generate_answers_from_model( + self, questions: DataFrame, student_model: ModelConfig + ) -> DataFrame: + """ + Given a DataFrame containing `user_input` columns, generates responses from the given model + and returns a new DataFrame containing its answers in the `response` column. 
+ """ + client = get_openai_client( + model_api_base=student_model.base_url, api_key=student_model.api_key + ) + + # initialize response to write into + updated_df = questions.copy() + updated_df["response"] = "" + + for i, qna in updated_df.iterrows(): + messages = [ + student_model.system_prompt, + qna["user_input"], + ] + response = client.chat.completions.create( + messages=messages, + model=student_model.model_name, + # specify the seed so we can at least try to have some reproducibility when the clients support it + seed=42, + max_tokens=student_model.max_tokens, + temperature=student_model.temperature, + ) + updated_df.at[i, "response"] = response.choices[0].message.content + return updated_df + + def _get_metrics(self) -> List[Metric]: + # default set of metrics + return [ + RubricsScore( + rubrics=DEFAULT_WITH_REFERENCE_RUBRICS, + ) + ] From 04117dd3487a3934fef189348c0825a3b8458b68 Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:04:37 -0500 Subject: [PATCH 4/6] chore: add unit tests for ragas evaluator Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/eval/ragas.py | 10 +-- tests/test_ragas.py | 161 ++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 5 deletions(-) create mode 100644 tests/test_ragas.py diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py index df7943c..c7ac148 100644 --- a/src/instructlab/eval/ragas.py +++ b/src/instructlab/eval/ragas.py @@ -1,3 +1,4 @@ +# # SPDX-License-Identifier: Apache-2.0 # Standard from pathlib import Path from typing import List, Optional, TypedDict @@ -53,7 +54,7 @@ class ModelConfig(BaseModel): # name of the model to use. model_name: str - + # The system prompt to be used when applying the chat template. system_prompt: str = _DEFAULT_SYSTEM_PROMPT @@ -67,7 +68,7 @@ class ModelConfig(BaseModel): # Max amount of tokens to generate. max_tokens: int = 768 - # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes. + # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes. 
seed: int = DEFAULT_SEED @field_validator("temperature") @@ -126,15 +127,14 @@ def run( "no dataset was provided, please specify the `dataset` argument" ) - if type(dataset) not in (list, Path): - raise TypeError(f"invalid type of dataset: {type(dataset)}") - # ensure we are in the dataframe format input_df = None if isinstance(dataset, list): input_df = DataFrame(dataset) elif isinstance(dataset, Path): input_df = read_json(dataset, orient="records", lines=True) + else: + raise TypeError(f"invalid type of dataset: {type(dataset)}") # this should never happen, but pylint is not smart enough to detect it assert input_df is not None diff --git a/tests/test_ragas.py b/tests/test_ragas.py new file mode 100644 index 0000000..e2667a3 --- /dev/null +++ b/tests/test_ragas.py @@ -0,0 +1,161 @@ +# # SPDX-License-Identifier: Apache-2.0 +# Standard +from pathlib import Path +from unittest.mock import MagicMock, patch +import unittest + +# Third Party +from pandas import DataFrame +from ragas.callbacks import ChainRun +from ragas.dataset_schema import EvaluationDataset, EvaluationResult +import pandas as pd + +# First Party +from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig, Sample + + +class TestRagasEvaluator(unittest.TestCase): + @patch("instructlab.eval.ragas.get_openai_client") + def test_generate_answers_from_model(self, mock_get_openai_client): + # mock the OpenAI client to always return "london" for chat completions + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.choices[0].message.content = "London" + mock_client.chat.completions.create.return_value = mock_response + mock_get_openai_client.return_value = mock_client + + # get answers + questions = pd.DataFrame({"user_input": ["What is the capital of France?"]}) + student_model = ModelConfig( + base_url="https://your.model.endpoint.com", + model_name="jeeves-512B", + api_key="test-api-key", + ) + evaluator = RagasEvaluator() + result_df = evaluator._generate_answers_from_model(questions, student_model) + + # what we expect to see + expected_df = questions.copy() + expected_df["response"] = ["London"] + + # perform the assertions + pd.testing.assert_frame_equal(result_df, expected_df) + mock_get_openai_client.assert_called_once_with( + model_api_base=student_model.base_url, api_key=student_model.api_key + ) + mock_client.chat.completions.create.assert_called_once_with( + messages=[student_model.system_prompt, "What is the capital of France?"], + model=student_model.model_name, + seed=42, + max_tokens=student_model.max_tokens, + temperature=student_model.temperature, + ) + + @patch("instructlab.eval.ragas.read_json") + @patch("instructlab.eval.ragas.evaluate") + @patch("instructlab.eval.ragas.ChatOpenAI") + @patch.object(RagasEvaluator, "_generate_answers_from_model") + @patch.object(RagasEvaluator, "_get_metrics") + def test_run( + self, + mock_get_metrics: MagicMock, + mock_generate_answers_from_model: MagicMock, + mock_ChatOpenAI: MagicMock, + mock_evaluate: MagicMock, + mock_read_json: MagicMock, + ): + ######################################################################## + # SETUP EVERYTHING WE NEED FOR THE TESTS + ######################################################################## + + # These are the variables which will control the flow of the test. + # Since we have to re-construct some Ragas components under the hood, + + student_model_response = "Paris" + user_question = "What is the capital of France?" + golden_answer = "The capital of France is Paris." 
+ base_ds = [{"user_input": user_question, "reference": golden_answer}] + mocked_metric = "mocked-metric" + mocked_metric_score = 4.0 + + # The following section takes care of mocking function return calls. + # Ragas is tricky because it has some complex data structures under the hood, + # so what we have to do is configure the intermediate outputs that we expect + # to receive from Ragas. + + mock_get_metrics.return_value = [mocked_metric] + interim_df = DataFrame( + { + "user_input": [user_question], + "response": [student_model_response], + "reference": [golden_answer], + } + ) + mock_generate_answers_from_model.return_value = interim_df.copy() + mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) + mock_ChatOpenAI.return_value = MagicMock() + + # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it. + # It isn't functionally used for our purposes though. + + _unimportant_ragas_traces = { + "default": ChainRun( + run_id="42", + parent_run_id=None, + name="root", + inputs={"system": "null", "user": "null"}, + outputs={"assistant": "null"}, + metadata={"user_id": 1337}, + ) + } + mock_evaluate.return_value = EvaluationResult( + scores=[{mocked_metric: mocked_metric_score}], + dataset=mocked_evaluation_ds, + ragas_traces=_unimportant_ragas_traces, + ) + + ######################################################################## + # Run the tests + ######################################################################## + + # Configure all other inputs that Ragas does not depend on for proper mocking + student_model = ModelConfig( + base_url="https://api.openai.com", + model_name="pt-3.5-turbo", + api_key="test-api-key", + ) + run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) + evaluator = RagasEvaluator() + + ######################################################################## + # Test case: directly passing a dataset + ######################################################################## + result = evaluator.run( + dataset=base_ds, student_model=student_model, run_config=run_config + ) + + self.assertIsInstance(result, EvaluationResult) + mock_generate_answers_from_model.assert_called_once() + mock_evaluate.assert_called_once() + mock_ChatOpenAI.assert_called_once_with(model="gpt-4o") + + ######################################################################## + # Test case: passing a dataset in via Path to JSONL file + ######################################################################## + mock_read_json.return_value = DataFrame(base_ds) + result = evaluator.run( + dataset=Path("dummy_path.jsonl"), + student_model=student_model, + run_config=run_config, + ) + + self.assertIsInstance(result, EvaluationResult) + mock_read_json.assert_called_once_with( + Path("dummy_path.jsonl"), orient="records", lines=True + ) + mock_generate_answers_from_model.assert_called() + mock_evaluate.assert_called() + + +if __name__ == "__main__": + unittest.main() From c6b5a70cbb5804f3f145782e0f84d88ea28e774b Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Fri, 13 Dec 2024 21:54:08 +0000 Subject: [PATCH 5/6] feat: update the Ragas evaluator to have the OpenAI client as something that gets passed in to __init__ Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/eval/ragas.py | 39 ++++++++------- tests/test_ragas.py | 89 +++++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 53 deletions(-) diff --git a/src/instructlab/eval/ragas.py 
b/src/instructlab/eval/ragas.py index c7ac148..9515a95 100644 --- a/src/instructlab/eval/ragas.py +++ b/src/instructlab/eval/ragas.py @@ -5,6 +5,7 @@ # Third Party from langchain_community.chat_models import ChatOpenAI +from openai import Client as OpenAIClient from pandas import DataFrame, read_json from pydantic import BaseModel, ConfigDict, field_validator from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate @@ -16,7 +17,6 @@ # Local from .evaluator import Evaluator -from .mt_bench_common import get_openai_client class Sample(TypedDict): @@ -49,19 +49,12 @@ class Sample(TypedDict): class ModelConfig(BaseModel): model_config = ConfigDict(protected_namespaces=()) - # URL of the OpenAI server where the model shall be hosted. - base_url: str - # name of the model to use. model_name: str # The system prompt to be used when applying the chat template. system_prompt: str = _DEFAULT_SYSTEM_PROMPT - # We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client) - # To provide an OpenAI key, you must set it here; else the default is used. - api_key: str = "no-api-key" - # "model randomness" aka likelihood of sampling something other than the likeliest token temperature: float = 0.0 @@ -87,15 +80,18 @@ def __init__( self, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, + openai_client: OpenAIClient | None = None, ): self.student_model = student_model self.run_config = run_config + self.openai_client = openai_client def run( self, dataset: List[Sample] | Path, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, + openai_client: OpenAIClient | None = None, ) -> EvaluationResult: """ Evaluates the quality of model responses against a graded rubric. @@ -115,12 +111,16 @@ def run( a default one is created containing extremely permissive settings when handling timeouts. This is because by default, OpenAI tier-1 usage accounts have very high rate limits resulting in heavy throttling during evaluations. + openai_client (openai.Client | None, optional): + The client to use when generating questions from the student model, must be compatible with the OpenAI API. + This field is required when `student_model` is provided. 
Returns: EvaluationResult: The results of all evaluations performed by Ragas """ student_model = student_model if student_model else self.student_model run_config = run_config if run_config else self.run_config + openai_client = openai_client if openai_client else self.openai_client if not dataset: raise ValueError( @@ -140,14 +140,20 @@ def run( assert input_df is not None need_to_generate_questions = "response" not in input_df.columns - if need_to_generate_questions and not student_model: + if need_to_generate_questions and (not student_model or not openai_client): raise ValueError( - "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference" + "provided dataset doesn't contain the model `response`, but either `student_model` or `openai_client` wasn't provided for inference" ) # if the student model was provided then we always generate regardless if student_model: - input_df = self._generate_answers_from_model(input_df, student_model) + if not openai_client: + raise ValueError( + "`student_model` was specified but `openai_client` was not provided" + ) + input_df = self._generate_answers_from_model( + input_df, student_model, openai_client + ) if not run_config: # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits @@ -176,16 +182,15 @@ def run( return results def _generate_answers_from_model( - self, questions: DataFrame, student_model: ModelConfig + self, + questions: DataFrame, + student_model: ModelConfig, + openai_client: OpenAIClient, ) -> DataFrame: """ Given a DataFrame containing `user_input` columns, generates responses from the given model and returns a new DataFrame containing its answers in the `response` column. """ - client = get_openai_client( - model_api_base=student_model.base_url, api_key=student_model.api_key - ) - # initialize response to write into updated_df = questions.copy() updated_df["response"] = "" @@ -195,7 +200,7 @@ def _generate_answers_from_model( student_model.system_prompt, qna["user_input"], ] - response = client.chat.completions.create( + response = openai_client.chat.completions.create( messages=messages, model=student_model.model_name, # specify the seed so we can at least try to have some reproducibility when the clients support it diff --git a/tests/test_ragas.py b/tests/test_ragas.py index e2667a3..ebabb2b 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -11,58 +11,55 @@ import pandas as pd # First Party -from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig, Sample +from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig class TestRagasEvaluator(unittest.TestCase): - @patch("instructlab.eval.ragas.get_openai_client") - def test_generate_answers_from_model(self, mock_get_openai_client): + def test_generate_answers_from_model(self): # mock the OpenAI client to always return "london" for chat completions + user_input = "What is the capital of France?" 
+ model_response = "London" mock_client = MagicMock() mock_response = MagicMock() - mock_response.choices[0].message.content = "London" + mock_response.choices = [MagicMock(message=MagicMock(content=model_response))] mock_client.chat.completions.create.return_value = mock_response - mock_get_openai_client.return_value = mock_client # get answers - questions = pd.DataFrame({"user_input": ["What is the capital of France?"]}) + questions = pd.DataFrame({"user_input": [user_input]}) student_model = ModelConfig( - base_url="https://your.model.endpoint.com", - model_name="jeeves-512B", - api_key="test-api-key", + model_name="super-jeeves-8x700B", ) evaluator = RagasEvaluator() - result_df = evaluator._generate_answers_from_model(questions, student_model) + result_df = evaluator._generate_answers_from_model( + questions, student_model, mock_client + ) # what we expect to see expected_df = questions.copy() - expected_df["response"] = ["London"] + expected_df["response"] = [model_response] # perform the assertions pd.testing.assert_frame_equal(result_df, expected_df) - mock_get_openai_client.assert_called_once_with( - model_api_base=student_model.base_url, api_key=student_model.api_key - ) mock_client.chat.completions.create.assert_called_once_with( - messages=[student_model.system_prompt, "What is the capital of France?"], + messages=[student_model.system_prompt, user_input], model=student_model.model_name, seed=42, max_tokens=student_model.max_tokens, temperature=student_model.temperature, ) + @patch("instructlab.eval.ragas.ChatOpenAI") @patch("instructlab.eval.ragas.read_json") @patch("instructlab.eval.ragas.evaluate") - @patch("instructlab.eval.ragas.ChatOpenAI") @patch.object(RagasEvaluator, "_generate_answers_from_model") @patch.object(RagasEvaluator, "_get_metrics") def test_run( self, mock_get_metrics: MagicMock, mock_generate_answers_from_model: MagicMock, - mock_ChatOpenAI: MagicMock, mock_evaluate: MagicMock, mock_read_json: MagicMock, + mock_ChatOpenAI: MagicMock, ): ######################################################################## # SETUP EVERYTHING WE NEED FOR THE TESTS @@ -74,16 +71,20 @@ def test_run( student_model_response = "Paris" user_question = "What is the capital of France?" golden_answer = "The capital of France is Paris." + metric = "mocked-metric" + metric_score = 4.0 base_ds = [{"user_input": user_question, "reference": golden_answer}] - mocked_metric = "mocked-metric" - mocked_metric_score = 4.0 + student_model = ModelConfig( + model_name="super-jeeves-8x700B", + ) + run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) # The following section takes care of mocking function return calls. # Ragas is tricky because it has some complex data structures under the hood, # so what we have to do is configure the intermediate outputs that we expect # to receive from Ragas. - mock_get_metrics.return_value = [mocked_metric] + mock_get_metrics.return_value = [metric] interim_df = DataFrame( { "user_input": [user_question], @@ -93,7 +94,12 @@ def test_run( ) mock_generate_answers_from_model.return_value = interim_df.copy() mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) - mock_ChatOpenAI.return_value = MagicMock() + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.choices = [ + MagicMock(message=MagicMock(content=student_model_response)) + ] + mock_client.chat.completions.create.return_value = mock_response # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it. 
# It isn't functionally used for our purposes though. @@ -109,29 +115,20 @@ def test_run( ) } mock_evaluate.return_value = EvaluationResult( - scores=[{mocked_metric: mocked_metric_score}], + scores=[{metric: metric_score}], dataset=mocked_evaluation_ds, ragas_traces=_unimportant_ragas_traces, ) - ######################################################################## - # Run the tests - ######################################################################## - - # Configure all other inputs that Ragas does not depend on for proper mocking - student_model = ModelConfig( - base_url="https://api.openai.com", - model_name="pt-3.5-turbo", - api_key="test-api-key", - ) - run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) - evaluator = RagasEvaluator() - ######################################################################## # Test case: directly passing a dataset ######################################################################## + evaluator = RagasEvaluator() result = evaluator.run( - dataset=base_ds, student_model=student_model, run_config=run_config + dataset=base_ds, + student_model=student_model, + run_config=run_config, + openai_client=mock_client, ) self.assertIsInstance(result, EvaluationResult) @@ -142,11 +139,13 @@ def test_run( ######################################################################## # Test case: passing a dataset in via Path to JSONL file ######################################################################## + evaluator = RagasEvaluator() mock_read_json.return_value = DataFrame(base_ds) result = evaluator.run( dataset=Path("dummy_path.jsonl"), student_model=student_model, run_config=run_config, + openai_client=mock_client, ) self.assertIsInstance(result, EvaluationResult) @@ -156,6 +155,24 @@ def test_run( mock_generate_answers_from_model.assert_called() mock_evaluate.assert_called() + ######################################################################## + # Test case: using the instance attributes + ######################################################################## + evaluator = RagasEvaluator( + student_model=student_model, + openai_client=mock_client, + run_config=run_config, + ) + mock_read_json.return_value = DataFrame(base_ds) + result = evaluator.run(dataset=Path("dummy_path.jsonl")) + + self.assertIsInstance(result, EvaluationResult) + mock_read_json.assert_called_with( + Path("dummy_path.jsonl"), orient="records", lines=True + ) + mock_generate_answers_from_model.assert_called() + mock_evaluate.assert_called() + if __name__ == "__main__": unittest.main() From ab3d168d434f0098af1be8ebf6f4fa8ce2e4a40a Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Tue, 7 Jan 2025 16:07:43 -0500 Subject: [PATCH 6/6] chore: decouple tests into more atomic units Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/eval/ragas.py | 112 +++++++++++----- tests/test_ragas.py | 238 ++++++++++++++++++++-------------- 2 files changed, 220 insertions(+), 130 deletions(-) diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py index 9515a95..f0445da 100644 --- a/src/instructlab/eval/ragas.py +++ b/src/instructlab/eval/ragas.py @@ -1,13 +1,14 @@ # # SPDX-License-Identifier: Apache-2.0 # Standard from pathlib import Path -from typing import List, Optional, TypedDict +from typing import TYPE_CHECKING, List, Optional, TypedDict # Third Party from langchain_community.chat_models import ChatOpenAI from openai import Client as OpenAIClient +from openai.types.chat 
import ChatCompletionMessageParam from pandas import DataFrame, read_json -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict, Field from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate from ragas.metrics import Metric from ragas.metrics._domain_specific_rubrics import ( # the rubrics we must instantiate are located inside of a file marked as private @@ -17,6 +18,9 @@ # Local from .evaluator import Evaluator +from .logger_config import setup_logger + +logger = setup_logger(__name__) class Sample(TypedDict): @@ -56,7 +60,7 @@ class ModelConfig(BaseModel): system_prompt: str = _DEFAULT_SYSTEM_PROMPT # "model randomness" aka likelihood of sampling something other than the likeliest token - temperature: float = 0.0 + temperature: float = Field(default=0.0, le=1.0, ge=0.0) # Max amount of tokens to generate. max_tokens: int = 768 @@ -64,13 +68,6 @@ class ModelConfig(BaseModel): # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes. seed: int = DEFAULT_SEED - @field_validator("temperature") - @classmethod - def check_temperature(cls, v: float) -> float: - if not 0.0 <= v <= 1.0: - raise ValueError("temperature must be between 0.0 and 1.0") - return v - class RagasEvaluator(Evaluator): # most basic implementation, we just assume that the user will bring the existing model responses @@ -80,18 +77,42 @@ def __init__( self, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, - openai_client: OpenAIClient | None = None, + student_openai_client: OpenAIClient | None = None, + judge_model_name: str = DEFAULT_JUDGE_MODEL, + judge_openai_api_key: str | None = None, ): self.student_model = student_model self.run_config = run_config - self.openai_client = openai_client + self.student_openai_client = student_openai_client + self.judge_model_name = judge_model_name + self.judge_openai_api_key = judge_openai_api_key + + @staticmethod + def _validate_dataset(df: DataFrame): + """ + Validates whether or not the given `df` is a valid dataset of `Sample` objects. + + Args: + df (DataFrame): DataFrame containing the dataset to be evaluated. + """ + # We have to hardcode these fields because the automated way of resolving the required fields from a TypedDict + # is only included by default in Python3.11+. For earlier versions, the `typing_extensions` package is required. + # See: https://docs.python.org/3/whatsnew/3.11.html#pep-655-marking-individual-typeddict-items-as-required-or-not-required + required_keys = {"user_input", "reference"} + missing_keys = required_keys - set(df.columns) + if missing_keys: + raise ValueError( + f"invalid dataset provided, missing the following keys: {', '.join(missing_keys)}" + ) def run( self, dataset: List[Sample] | Path, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, - openai_client: OpenAIClient | None = None, + student_openai_client: OpenAIClient | None = None, + judge_model_name: str | None = None, + judge_openai_api_key: str | None = None, ) -> EvaluationResult: """ Evaluates the quality of model responses against a graded rubric. @@ -111,21 +132,31 @@ def run( a default one is created containing extremely permissive settings when handling timeouts. This is because by default, OpenAI tier-1 usage accounts have very high rate limits resulting in heavy throttling during evaluations. 
- openai_client (openai.Client | None, optional): + student_openai_client (openai.Client | None, optional): The client to use when generating questions from the student model, must be compatible with the OpenAI API. This field is required when `student_model` is provided. + judge_model_name (str | None, optional): + Name of the OpenAI model to use as the judge model. Defaults to "gpt-4o" when none is specified. + judge_openai_api_key (str | None, optional): + The API key to use for evaluating the given dataset. When this isn't provided, `OPENAI_API_KEY` is read instead. + Returns: EvaluationResult: The results of all evaluations performed by Ragas """ + judge_model_name = ( + judge_model_name if judge_model_name else self.judge_model_name + ) + judge_openai_api_key = ( + judge_openai_api_key if judge_openai_api_key else self.judge_openai_api_key + ) student_model = student_model if student_model else self.student_model run_config = run_config if run_config else self.run_config - openai_client = openai_client if openai_client else self.openai_client - - if not dataset: - raise ValueError( - "no dataset was provided, please specify the `dataset` argument" - ) + student_openai_client = ( + student_openai_client + if student_openai_client + else self.student_openai_client + ) # ensure we are in the dataframe format input_df = None @@ -137,22 +168,30 @@ def run( raise TypeError(f"invalid type of dataset: {type(dataset)}") # this should never happen, but pylint is not smart enough to detect it - assert input_df is not None + if TYPE_CHECKING: + assert input_df is not None + + # ensure the dataset is in the format we expect it + self._validate_dataset(input_df) need_to_generate_questions = "response" not in input_df.columns - if need_to_generate_questions and (not student_model or not openai_client): - raise ValueError( - "provided dataset doesn't contain the model `response`, but either `student_model` or `openai_client` wasn't provided for inference" + if need_to_generate_questions: + logger.debug( + "`response` is missing in the input dataframe columns, generating questions from the model is required." 
) + if not student_model or not student_openai_client: + raise ValueError( + "provided dataset doesn't contain the model `response`, but either `student_model` or `student_openai_client` wasn't provided for inference" + ) # if the student model was provided then we always generate regardless if student_model: - if not openai_client: + if not student_openai_client: raise ValueError( - "`student_model` was specified but `openai_client` was not provided" + "`student_model` was specified but `student_openai_client` was not provided" ) input_df = self._generate_answers_from_model( - input_df, student_model, openai_client + input_df, student_model, student_openai_client ) if not run_config: @@ -170,7 +209,8 @@ def run( # we will be using gpt-4o for the foreseeable future, we hardcode this # for consistency of answers - critic_lm = ChatOpenAI(model=DEFAULT_JUDGE_MODEL) + + critic_lm = ChatOpenAI(model=judge_model_name, api_key=judge_openai_api_key) results = evaluate( dataset=evaluation_ds, batch_size=4, @@ -185,7 +225,7 @@ def _generate_answers_from_model( self, questions: DataFrame, student_model: ModelConfig, - openai_client: OpenAIClient, + student_openai_client: OpenAIClient, ) -> DataFrame: """ Given a DataFrame containing `user_input` columns, generates responses from the given model @@ -196,11 +236,14 @@ def _generate_answers_from_model( updated_df["response"] = "" for i, qna in updated_df.iterrows(): - messages = [ - student_model.system_prompt, - qna["user_input"], + messages: List[ChatCompletionMessageParam] = [ + { + "role": "system", + "content": student_model.system_prompt, + }, + {"role": "user", "content": qna["user_input"]}, ] - response = openai_client.chat.completions.create( + response = student_openai_client.chat.completions.create( messages=messages, model=student_model.model_name, # specify the seed so we can at least try to have some reproducibility when the clients support it @@ -211,7 +254,8 @@ def _generate_answers_from_model( updated_df.at[i, "response"] = response.choices[0].message.content return updated_df - def _get_metrics(self) -> List[Metric]: + @staticmethod + def _get_metrics() -> List[Metric]: # default set of metrics return [ RubricsScore( diff --git a/tests/test_ragas.py b/tests/test_ragas.py index ebabb2b..1d3bb8f 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -1,4 +1,4 @@ -# # SPDX-License-Identifier: Apache-2.0 +# SPDX-License-Identifier: Apache-2.0 # Standard from pathlib import Path from unittest.mock import MagicMock, patch @@ -8,102 +8,55 @@ from pandas import DataFrame from ragas.callbacks import ChainRun from ragas.dataset_schema import EvaluationDataset, EvaluationResult -import pandas as pd # First Party from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig class TestRagasEvaluator(unittest.TestCase): - def test_generate_answers_from_model(self): - # mock the OpenAI client to always return "london" for chat completions - user_input = "What is the capital of France?" - model_response = "London" - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.choices = [MagicMock(message=MagicMock(content=model_response))] - mock_client.chat.completions.create.return_value = mock_response - - # get answers - questions = pd.DataFrame({"user_input": [user_input]}) - student_model = ModelConfig( + def setUp(self): + # Common setup data for all tests + self.student_model_response = "Paris" + self.user_question = "What is the capital of France?" + self.golden_answer = "The capital of France is Paris." 
+ self.metric = "mocked-metric" + self.metric_score = 4.0 + self.base_ds = [ + { + "user_input": self.user_question, + "reference": self.golden_answer, + } + ] + self.student_model = ModelConfig( model_name="super-jeeves-8x700B", ) - evaluator = RagasEvaluator() - result_df = evaluator._generate_answers_from_model( - questions, student_model, mock_client - ) - - # what we expect to see - expected_df = questions.copy() - expected_df["response"] = [model_response] - - # perform the assertions - pd.testing.assert_frame_equal(result_df, expected_df) - mock_client.chat.completions.create.assert_called_once_with( - messages=[student_model.system_prompt, user_input], - model=student_model.model_name, - seed=42, - max_tokens=student_model.max_tokens, - temperature=student_model.temperature, - ) + self.run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) @patch("instructlab.eval.ragas.ChatOpenAI") - @patch("instructlab.eval.ragas.read_json") @patch("instructlab.eval.ragas.evaluate") @patch.object(RagasEvaluator, "_generate_answers_from_model") @patch.object(RagasEvaluator, "_get_metrics") - def test_run( + def test_run_with_dataset( self, mock_get_metrics: MagicMock, mock_generate_answers_from_model: MagicMock, mock_evaluate: MagicMock, - mock_read_json: MagicMock, mock_ChatOpenAI: MagicMock, ): - ######################################################################## - # SETUP EVERYTHING WE NEED FOR THE TESTS - ######################################################################## - - # These are the variables which will control the flow of the test. - # Since we have to re-construct some Ragas components under the hood, - - student_model_response = "Paris" - user_question = "What is the capital of France?" - golden_answer = "The capital of France is Paris." - metric = "mocked-metric" - metric_score = 4.0 - base_ds = [{"user_input": user_question, "reference": golden_answer}] - student_model = ModelConfig( - model_name="super-jeeves-8x700B", - ) - run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) - - # The following section takes care of mocking function return calls. - # Ragas is tricky because it has some complex data structures under the hood, - # so what we have to do is configure the intermediate outputs that we expect - # to receive from Ragas. - - mock_get_metrics.return_value = [metric] + """ + Test case 1: Directly passing a Python list/dict dataset to `RagasEvaluator.run()`. + """ + # Prepare mocks + mock_get_metrics.return_value = [self.metric] interim_df = DataFrame( { - "user_input": [user_question], - "response": [student_model_response], - "reference": [golden_answer], + "user_input": [self.user_question], + "response": [self.student_model_response], + "reference": [self.golden_answer], } ) - mock_generate_answers_from_model.return_value = interim_df.copy() + mock_generate_answers_from_model.return_value = interim_df mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.choices = [ - MagicMock(message=MagicMock(content=student_model_response)) - ] - mock_client.chat.completions.create.return_value = mock_response - - # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it. - # It isn't functionally used for our purposes though. 
- _unimportant_ragas_traces = { "default": ChainRun( run_id="42", @@ -115,39 +68,86 @@ def test_run( ) } mock_evaluate.return_value = EvaluationResult( - scores=[{metric: metric_score}], + scores=[{self.metric: self.metric_score}], dataset=mocked_evaluation_ds, ragas_traces=_unimportant_ragas_traces, ) - ######################################################################## - # Test case: directly passing a dataset - ######################################################################## + # Instantiate evaluator evaluator = RagasEvaluator() + + # Run test result = evaluator.run( - dataset=base_ds, - student_model=student_model, - run_config=run_config, - openai_client=mock_client, + dataset=self.base_ds, + student_model=self.student_model, + run_config=self.run_config, + student_openai_client=MagicMock(), # We pass a mock client ) + # Assertions self.assertIsInstance(result, EvaluationResult) mock_generate_answers_from_model.assert_called_once() mock_evaluate.assert_called_once() - mock_ChatOpenAI.assert_called_once_with(model="gpt-4o") + # we didn't provide an API key, so it expects to get `api_key=None` + mock_ChatOpenAI.assert_called_once_with(model="gpt-4o", api_key=None) - ######################################################################## - # Test case: passing a dataset in via Path to JSONL file - ######################################################################## + @patch("instructlab.eval.ragas.ChatOpenAI") + @patch("instructlab.eval.ragas.read_json") + @patch("instructlab.eval.ragas.evaluate") + @patch.object(RagasEvaluator, "_generate_answers_from_model") + @patch.object(RagasEvaluator, "_get_metrics") + def test_run_with_dataset_via_path( + self, + mock_get_metrics: MagicMock, + mock_generate_answers_from_model: MagicMock, + mock_evaluate: MagicMock, + mock_read_json: MagicMock, + mock_ChatOpenAI: MagicMock, + ): + """ + Test case 2: Passing a Path to a JSONL file (containing the dataset) to `RagasEvaluator.run()`. 
+ """ + # Prepare mocks + mock_get_metrics.return_value = [self.metric] + interim_df = DataFrame( + { + "user_input": [self.user_question], + "response": [self.student_model_response], + "reference": [self.golden_answer], + } + ) + mock_generate_answers_from_model.return_value = interim_df + mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) + _unimportant_ragas_traces = { + "default": ChainRun( + run_id="42", + parent_run_id=None, + name="root", + inputs={"system": "null", "user": "null"}, + outputs={"assistant": "null"}, + metadata={"user_id": 1337}, + ) + } + mock_evaluate.return_value = EvaluationResult( + scores=[{self.metric: self.metric_score}], + dataset=mocked_evaluation_ds, + ragas_traces=_unimportant_ragas_traces, + ) + + mock_read_json.return_value = DataFrame(self.base_ds) + + # Instantiate evaluator evaluator = RagasEvaluator() - mock_read_json.return_value = DataFrame(base_ds) + + # Run test result = evaluator.run( dataset=Path("dummy_path.jsonl"), - student_model=student_model, - run_config=run_config, - openai_client=mock_client, + student_model=self.student_model, + run_config=self.run_config, + student_openai_client=MagicMock(), ) + # Assertions self.assertIsInstance(result, EvaluationResult) mock_read_json.assert_called_once_with( Path("dummy_path.jsonl"), orient="records", lines=True @@ -155,17 +155,63 @@ def test_run( mock_generate_answers_from_model.assert_called() mock_evaluate.assert_called() - ######################################################################## - # Test case: using the instance attributes - ######################################################################## + @patch("instructlab.eval.ragas.ChatOpenAI") + @patch("instructlab.eval.ragas.read_json") + @patch("instructlab.eval.ragas.evaluate") + @patch.object(RagasEvaluator, "_generate_answers_from_model") + @patch.object(RagasEvaluator, "_get_metrics") + def test_run_with_instance_attributes( + self, + mock_get_metrics: MagicMock, + mock_generate_answers_from_model: MagicMock, + mock_evaluate: MagicMock, + mock_read_json: MagicMock, + mock_ChatOpenAI: MagicMock, + ): + """ + Test case 3: Using `RagasEvaluator` instance attributes for `student_model`, `run_config`, + and `student_openai_client` instead of passing them explicitly. 
+ """ + # Prepare mocks + mock_get_metrics.return_value = [self.metric] + interim_df = DataFrame( + { + "user_input": [self.user_question], + "response": [self.student_model_response], + "reference": [self.golden_answer], + } + ) + mock_generate_answers_from_model.return_value = interim_df + mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) + _unimportant_ragas_traces = { + "default": ChainRun( + run_id="42", + parent_run_id=None, + name="root", + inputs={"system": "null", "user": "null"}, + outputs={"assistant": "null"}, + metadata={"user_id": 1337}, + ) + } + mock_evaluate.return_value = EvaluationResult( + scores=[{self.metric: self.metric_score}], + dataset=mocked_evaluation_ds, + ragas_traces=_unimportant_ragas_traces, + ) + + mock_read_json.return_value = DataFrame(self.base_ds) + + # Instantiate evaluator with instance-level configs evaluator = RagasEvaluator( - student_model=student_model, - openai_client=mock_client, - run_config=run_config, + student_model=self.student_model, + student_openai_client=MagicMock(), + run_config=self.run_config, ) - mock_read_json.return_value = DataFrame(base_ds) + + # Run test result = evaluator.run(dataset=Path("dummy_path.jsonl")) + # Assertions self.assertIsInstance(result, EvaluationResult) mock_read_json.assert_called_with( Path("dummy_path.jsonl"), orient="records", lines=True