diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py
index c7ac148..9515a95 100644
--- a/src/instructlab/eval/ragas.py
+++ b/src/instructlab/eval/ragas.py
@@ -5,6 +5,7 @@
 
 # Third Party
 from langchain_community.chat_models import ChatOpenAI
+from openai import Client as OpenAIClient
 from pandas import DataFrame, read_json
 from pydantic import BaseModel, ConfigDict, field_validator
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
@@ -16,7 +17,6 @@
 
 # Local
 from .evaluator import Evaluator
-from .mt_bench_common import get_openai_client
 
 
 class Sample(TypedDict):
@@ -49,19 +49,12 @@ class Sample(TypedDict):
 class ModelConfig(BaseModel):
     model_config = ConfigDict(protected_namespaces=())
 
-    # URL of the OpenAI server where the model shall be hosted.
-    base_url: str
-
     # name of the model to use.
     model_name: str
 
     # The system prompt to be used when applying the chat template.
     system_prompt: str = _DEFAULT_SYSTEM_PROMPT
 
-    # We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client)
-    # To provide an OpenAI key, you must set it here; else the default is used.
-    api_key: str = "no-api-key"
-
     # "model randomness" aka likelihood of sampling something other than the likeliest token
     temperature: float = 0.0
 
@@ -87,15 +80,18 @@ def __init__(
         self,
         student_model: ModelConfig | None = None,
         run_config: RunConfig | None = None,
+        openai_client: OpenAIClient | None = None,
     ):
         self.student_model = student_model
         self.run_config = run_config
+        self.openai_client = openai_client
 
     def run(
         self,
         dataset: List[Sample] | Path,
         student_model: ModelConfig | None = None,
         run_config: RunConfig | None = None,
+        openai_client: OpenAIClient | None = None,
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.
@@ -115,12 +111,16 @@ def run(
                 a default one is created containing extremely permissive settings when handling
                 timeouts. This is because by default, OpenAI tier-1 usage accounts have very low
                 rate limits, resulting in heavy throttling during evaluations.
+            openai_client (openai.Client | None, optional):
+                The client to use when generating responses from the student model; it must be compatible with the OpenAI API.
+                This field is required when `student_model` is provided.
 
         Returns:
             EvaluationResult: The results of all evaluations performed by Ragas
         """
         student_model = student_model if student_model else self.student_model
         run_config = run_config if run_config else self.run_config
+        openai_client = openai_client if openai_client else self.openai_client
 
         if not dataset:
             raise ValueError(
@@ -140,14 +140,20 @@ def run(
         assert input_df is not None
 
         need_to_generate_questions = "response" not in input_df.columns
-        if need_to_generate_questions and not student_model:
+        if need_to_generate_questions and (not student_model or not openai_client):
             raise ValueError(
-                "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference"
+                "provided dataset doesn't contain the model `response`, but either `student_model` or `openai_client` wasn't provided for inference"
             )
 
         # if the student model was provided then we always generate regardless
         if student_model:
-            input_df = self._generate_answers_from_model(input_df, student_model)
+            if not openai_client:
+                raise ValueError(
+                    "`student_model` was specified but `openai_client` was not provided"
+                )
+            input_df = self._generate_answers_from_model(
+                input_df, student_model, openai_client
+            )
 
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
@@ -176,16 +182,15 @@
         return results
 
     def _generate_answers_from_model(
-        self, questions: DataFrame, student_model: ModelConfig
+        self,
+        questions: DataFrame,
+        student_model: ModelConfig,
+        openai_client: OpenAIClient,
     ) -> DataFrame:
         """
         Given a DataFrame containing `user_input` columns, generates responses from the given model
         and returns a new DataFrame containing its answers in the `response` column.
         """
-        client = get_openai_client(
-            model_api_base=student_model.base_url, api_key=student_model.api_key
-        )
-
         # initialize response to write into
         updated_df = questions.copy()
         updated_df["response"] = ""
@@ -195,7 +200,7 @@
                 student_model.system_prompt,
                 qna["user_input"],
             ]
-            response = client.chat.completions.create(
+            response = openai_client.chat.completions.create(
                 messages=messages,
                 model=student_model.model_name,
                 # specify the seed so we can at least try to have some reproducibility when the clients support it
diff --git a/tests/test_ragas.py b/tests/test_ragas.py
index e2667a3..ebabb2b 100644
--- a/tests/test_ragas.py
+++ b/tests/test_ragas.py
@@ -11,58 +11,55 @@
 import pandas as pd
 
 # First Party
-from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig, Sample
+from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig
 
 
 class TestRagasEvaluator(unittest.TestCase):
-    @patch("instructlab.eval.ragas.get_openai_client")
-    def test_generate_answers_from_model(self, mock_get_openai_client):
+    def test_generate_answers_from_model(self):
         # mock the OpenAI client to always return "london" for chat completions
+        user_input = "What is the capital of France?"
+        model_response = "London"
         mock_client = MagicMock()
         mock_response = MagicMock()
-        mock_response.choices[0].message.content = "London"
+        mock_response.choices = [MagicMock(message=MagicMock(content=model_response))]
         mock_client.chat.completions.create.return_value = mock_response
-        mock_get_openai_client.return_value = mock_client
 
         # get answers
-        questions = pd.DataFrame({"user_input": ["What is the capital of France?"]})
+        questions = pd.DataFrame({"user_input": [user_input]})
         student_model = ModelConfig(
-            base_url="https://your.model.endpoint.com",
-            model_name="jeeves-512B",
-            api_key="test-api-key",
+            model_name="super-jeeves-8x700B",
         )
         evaluator = RagasEvaluator()
-        result_df = evaluator._generate_answers_from_model(questions, student_model)
+        result_df = evaluator._generate_answers_from_model(
+            questions, student_model, mock_client
+        )
 
         # what we expect to see
         expected_df = questions.copy()
-        expected_df["response"] = ["London"]
+        expected_df["response"] = [model_response]
 
         # perform the assertions
         pd.testing.assert_frame_equal(result_df, expected_df)
-        mock_get_openai_client.assert_called_once_with(
-            model_api_base=student_model.base_url, api_key=student_model.api_key
-        )
         mock_client.chat.completions.create.assert_called_once_with(
-            messages=[student_model.system_prompt, "What is the capital of France?"],
+            messages=[student_model.system_prompt, user_input],
             model=student_model.model_name,
             seed=42,
             max_tokens=student_model.max_tokens,
             temperature=student_model.temperature,
         )
 
+    @patch("instructlab.eval.ragas.ChatOpenAI")
     @patch("instructlab.eval.ragas.read_json")
     @patch("instructlab.eval.ragas.evaluate")
-    @patch("instructlab.eval.ragas.ChatOpenAI")
     @patch.object(RagasEvaluator, "_generate_answers_from_model")
     @patch.object(RagasEvaluator, "_get_metrics")
     def test_run(
         self,
         mock_get_metrics: MagicMock,
         mock_generate_answers_from_model: MagicMock,
-        mock_ChatOpenAI: MagicMock,
         mock_evaluate: MagicMock,
         mock_read_json: MagicMock,
+        mock_ChatOpenAI: MagicMock,
     ):
         ########################################################################
         # SETUP EVERYTHING WE NEED FOR THE TESTS
@@ -74,16 +71,20 @@
         student_model_response = "Paris"
         user_question = "What is the capital of France?"
         golden_answer = "The capital of France is Paris."
+        metric = "mocked-metric"
+        metric_score = 4.0
         base_ds = [{"user_input": user_question, "reference": golden_answer}]
 
-        mocked_metric = "mocked-metric"
-        mocked_metric_score = 4.0
+        student_model = ModelConfig(
+            model_name="super-jeeves-8x700B",
+        )
+        run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)
 
         # The following section takes care of mocking function return calls.
         # Ragas is tricky because it has some complex data structures under the hood,
         # so what we have to do is configure the intermediate outputs that we expect
         # to receive from Ragas.
-        mock_get_metrics.return_value = [mocked_metric]
+        mock_get_metrics.return_value = [metric]
         interim_df = DataFrame(
             {
                 "user_input": [user_question],
@@ -93,7 +94,12 @@
         )
         mock_generate_answers_from_model.return_value = interim_df.copy()
         mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df)
-        mock_ChatOpenAI.return_value = MagicMock()
+        mock_client = MagicMock()
+        mock_response = MagicMock()
+        mock_response.choices = [
+            MagicMock(message=MagicMock(content=student_model_response))
+        ]
+        mock_client.chat.completions.create.return_value = mock_response
 
         # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it.
         # It isn't functionally used for our purposes though.
@@ -109,29 +115,20 @@
             )
         }
         mock_evaluate.return_value = EvaluationResult(
-            scores=[{mocked_metric: mocked_metric_score}],
+            scores=[{metric: metric_score}],
             dataset=mocked_evaluation_ds,
             ragas_traces=_unimportant_ragas_traces,
         )
 
-        ########################################################################
-        # Run the tests
-        ########################################################################
-
-        # Configure all other inputs that Ragas does not depend on for proper mocking
-        student_model = ModelConfig(
-            base_url="https://api.openai.com",
-            model_name="pt-3.5-turbo",
-            api_key="test-api-key",
-        )
-        run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)
-        evaluator = RagasEvaluator()
-
         ########################################################################
         # Test case: directly passing a dataset
         ########################################################################
+        evaluator = RagasEvaluator()
         result = evaluator.run(
-            dataset=base_ds, student_model=student_model, run_config=run_config
+            dataset=base_ds,
+            student_model=student_model,
+            run_config=run_config,
+            openai_client=mock_client,
         )
 
         self.assertIsInstance(result, EvaluationResult)
@@ -142,11 +139,13 @@
         ########################################################################
         # Test case: passing a dataset in via Path to JSONL file
         ########################################################################
+        evaluator = RagasEvaluator()
         mock_read_json.return_value = DataFrame(base_ds)
         result = evaluator.run(
             dataset=Path("dummy_path.jsonl"),
             student_model=student_model,
             run_config=run_config,
+            openai_client=mock_client,
         )
 
         self.assertIsInstance(result, EvaluationResult)
@@ -156,6 +155,24 @@
         mock_generate_answers_from_model.assert_called()
         mock_evaluate.assert_called()
 
+        ########################################################################
+        # Test case: using the instance attributes
+        ########################################################################
+        evaluator = RagasEvaluator(
+            student_model=student_model,
+            openai_client=mock_client,
+            run_config=run_config,
+        )
+        mock_read_json.return_value = DataFrame(base_ds)
+        result = evaluator.run(dataset=Path("dummy_path.jsonl"))
+
+        self.assertIsInstance(result, EvaluationResult)
+        mock_read_json.assert_called_with(
+            Path("dummy_path.jsonl"), orient="records", lines=True
+        )
+        mock_generate_answers_from_model.assert_called()
+        mock_evaluate.assert_called()
+
 
 if __name__ == "__main__":
     unittest.main()
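
For reviewers, a minimal usage sketch of the reworked API follows (it is not part of the patch). It assumes an OpenAI-compatible server is already serving the student model; the base URL, API key, model name, and dataset path are hypothetical placeholders, and the judge model still needs its own OpenAI credentials as before.

# Usage sketch only, not part of the patch; endpoint, model name, and path are placeholders.
from pathlib import Path

from openai import Client as OpenAIClient

from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig

# Any OpenAI-compatible endpoint that serves the student model will do here.
client = OpenAIClient(base_url="http://localhost:8000/v1", api_key="no-api-key")
student_model = ModelConfig(model_name="my-student-model")
run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)

# The client, model config, and run config can be bound at construction time,
# so run() only needs the dataset (a JSONL file of Sample records or a list of Samples).
evaluator = RagasEvaluator(
    student_model=student_model,
    openai_client=client,
    run_config=run_config,
)
result = evaluator.run(dataset=Path("qna_dataset.jsonl"))
print(result.scores)

The design choice this sketch exercises is the one the diff makes: callers construct and own the OpenAI-compatible client themselves instead of passing base_url/api_key through ModelConfig, which keeps credentials out of the evaluator's configuration object.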