chore: add unit tests for ragas evaluator
Signed-off-by: Oleg S <[email protected]>
RobotSail committed Jan 2, 2025
1 parent 3a9e3f2 commit d581e7d
Showing 2 changed files with 168 additions and 7 deletions.
14 changes: 7 additions & 7 deletions src/instructlab/eval/ragas.py
@@ -1,3 +1,4 @@
# # SPDX-License-Identifier: Apache-2.0
# Standard
from pathlib import Path
from typing import List, Optional, TypedDict
@@ -53,7 +54,7 @@ class ModelConfig(BaseModel):

    # name of the model to use.
    model_name: str

    # The system prompt to be used when applying the chat template.
    system_prompt: str = _DEFAULT_SYSTEM_PROMPT

@@ -67,7 +68,7 @@ class ModelConfig(BaseModel):
    # Max amount of tokens to generate.
    max_tokens: int = 768

    # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes.
    seed: int = DEFAULT_SEED

    @field_validator("temperature")
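For reference, the new tests construct this config directly. A minimal illustrative sketch, assuming the same placeholder values used in tests/test_ragas.py (they are not meaningful defaults):

    # illustrative only: placeholder values mirror tests/test_ragas.py
    student_model = ModelConfig(
        base_url="https://api.openai.com",
        model_name="gpt-3.5-turbo",
        api_key="test-api-key",
    )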
@@ -126,15 +127,14 @@ def run(
"no dataset was provided, please specify the `dataset` argument"
)

if type(dataset) not in (list, Path):
raise TypeError(f"invalid type of dataset: {type(dataset)}")

# ensure we are in the dataframe format
input_df = None
if isinstance(dataset, list):
input_df = DataFrame(dataset)
elif isinstance(dataset, Path):
input_df = read_json(dataset, orient="records", lines=True)
else:
raise TypeError(f"invalid type of dataset: {type(dataset)}")

# this should never happen, but pylint is not smart enough to detect it
assert input_df is not None
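The run() logic above accepts the dataset either as an in-memory list of samples or as a Path to a JSONL file loaded with pandas. A minimal sketch of the two equivalent forms, assuming the user_input/reference record shape exercised by the new tests:

    # illustrative only: record shape mirrors tests/test_ragas.py
    from pathlib import Path
    from pandas import DataFrame, read_json

    records = [
        {"user_input": "What is the capital of France?",
         "reference": "The capital of France is Paris."}
    ]
    df_from_list = DataFrame(records)

    # equivalent JSONL file: one JSON object per line with the same keys
    df_from_path = read_json(Path("dataset.jsonl"), orient="records", lines=True)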
@@ -192,8 +192,8 @@ def _generate_answers_from_model(

        for i, qna in updated_df.iterrows():
            messages = [
                student_model.system_prompt,
                qna["user_input"],
                {"role": "system", "content": student_model.system_prompt},
                {"role": "user", "content": qna["user_input"]},
            ]
            response = client.chat.completions.create(
                messages=messages,
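The hunk above concerns how the chat messages are assembled before calling the student model; the role/content dict form is the standard message shape for an OpenAI-compatible chat completions endpoint. A minimal sketch of that call, assuming a client like the one returned by get_openai_client (the names here are illustrative, not taken verbatim from the file):

    # illustrative only: standard chat-completions message shape
    messages = [
        {"role": "system", "content": student_model.system_prompt},
        {"role": "user", "content": "What is the capital of France?"},
    ]
    response = client.chat.completions.create(
        messages=messages,
        model=student_model.model_name,
        seed=42,
        max_tokens=student_model.max_tokens,
        temperature=student_model.temperature,
    )
    answer = response.choices[0].message.content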
161 changes: 161 additions & 0 deletions tests/test_ragas.py
@@ -0,0 +1,161 @@
# # SPDX-License-Identifier: Apache-2.0
# Standard
from pathlib import Path
from unittest.mock import MagicMock, patch
import unittest

# Third Party
from pandas import DataFrame
from ragas.callbacks import ChainRun
from ragas.dataset_schema import EvaluationDataset, EvaluationResult
import pandas as pd

# First Party
from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig, Sample


class TestRagasEvaluator(unittest.TestCase):
@patch("instructlab.eval.ragas.get_openai_client")
def test_generate_answers_from_model(self, mock_get_openai_client):
# mock the OpenAI client to always return "london" for chat completions
mock_client = MagicMock()
mock_response = MagicMock()
mock_response.choices[0].message.content = "London"
mock_client.chat.completions.create.return_value = mock_response
mock_get_openai_client.return_value = mock_client

# get answers
questions = pd.DataFrame({"user_input": ["What is the capital of France?"]})
student_model = ModelConfig(
base_url="https://api.openai.com",
model_name="gpt-3.5-turbo",
api_key="test-api-key",
)
evaluator = RagasEvaluator()
result_df = evaluator._generate_answers_from_model(questions, student_model)

# what we expect to see
expected_df = questions.copy()
expected_df["response"] = ["London"]

# perform the assertions
pd.testing.assert_frame_equal(result_df, expected_df)
mock_get_openai_client.assert_called_once_with(
model_api_base=student_model.base_url, api_key=student_model.api_key
)
mock_client.chat.completions.create.assert_called_once_with(
messages=[student_model.system_prompt, "What is the capital of France?"],
model=student_model.model_name,
seed=42,
max_tokens=student_model.max_tokens,
temperature=student_model.temperature,
)

@patch("instructlab.eval.ragas.read_json")
@patch("instructlab.eval.ragas.evaluate")
@patch("instructlab.eval.ragas.ChatOpenAI")
@patch.object(RagasEvaluator, "_generate_answers_from_model")
@patch.object(RagasEvaluator, "_get_metrics")
def test_run(
self,
mock_get_metrics: MagicMock,
mock_generate_answers_from_model: MagicMock,
mock_ChatOpenAI: MagicMock,
mock_evaluate: MagicMock,
mock_read_json: MagicMock,
):
########################################################################
# SETUP EVERYTHING WE NEED FOR THE TESTS
########################################################################

# These are the variables which will control the flow of the test.
# Since we have to re-construct some Ragas components under the hood,

student_model_response = "Paris"
user_question = "What is the capital of France?"
golden_answer = "The capital of France is Paris."
base_ds = [{"user_input": user_question, "reference": golden_answer}]
mocked_metric = "mocked-metric"
mocked_metric_score = 4.0

# The following section takes care of mocking function return calls.
# Ragas is tricky because it has some complex data structures under the hood,
# so what we have to do is configure the intermediate outputs that we expect
# to receive from Ragas.

mock_get_metrics.return_value = [mocked_metric]
interim_df = DataFrame(
{
"user_input": [user_question],
"response": [student_model_response],
"reference": [golden_answer],
}
)
mock_generate_answers_from_model.return_value = interim_df.copy()
mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df)
mock_ChatOpenAI.return_value = MagicMock()

# Ragas requires this value to instantiate an EvaluationResult object, so we must provide it.
# It isn't functionally used for our purposes though.

_unimportant_ragas_traces = {
"default": ChainRun(
run_id="42",
parent_run_id=None,
name="root",
inputs={"system": "null", "user": "null"},
outputs={"assistant": "null"},
metadata={"user_id": 1337},
)
}
mock_evaluate.return_value = EvaluationResult(
scores=[{mocked_metric: mocked_metric_score}],
dataset=mocked_evaluation_ds,
ragas_traces=_unimportant_ragas_traces,
)

########################################################################
# Run the tests
########################################################################

# Configure all other inputs that Ragas does not depend on for proper mocking
student_model = ModelConfig(
base_url="https://api.openai.com",
model_name="pt-3.5-turbo",
api_key="test-api-key",
)
run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)
evaluator = RagasEvaluator()

########################################################################
# Test case: directly passing a dataset
########################################################################
result = evaluator.run(
dataset=base_ds, student_model=student_model, run_config=run_config
)

self.assertIsInstance(result, EvaluationResult)
mock_generate_answers_from_model.assert_called_once()
mock_evaluate.assert_called_once()
mock_ChatOpenAI.assert_called_once_with(model="gpt-4o")

########################################################################
# Test case: passing a dataset in via Path to JSONL file
########################################################################
mock_read_json.return_value = DataFrame(base_ds)
result = evaluator.run(
dataset=Path("dummy_path.jsonl"),
student_model=student_model,
run_config=run_config,
)

self.assertIsInstance(result, EvaluationResult)
mock_read_json.assert_called_once_with(
Path("dummy_path.jsonl"), orient="records", lines=True
)
mock_generate_answers_from_model.assert_called()
mock_evaluate.assert_called()


if __name__ == "__main__":
unittest.main()
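To run the new tests locally, the file can be invoked directly (the __main__ guard calls unittest.main()), or through a test runner such as pytest if it is installed:

    python tests/test_ragas.py
    python -m pytest tests/test_ragas.py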
