From 3443ffacb8cf0183bb1b7ee0036f656afe6e167b Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Fri, 6 Dec 2024 11:22:15 -0500
Subject: [PATCH 1/6] adds basic ragas eval

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 requirements.txt              |  1 +
 src/instructlab/eval/ragas.py | 80 +++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 src/instructlab/eval/ragas.py

diff --git a/requirements.txt b/requirements.txt
index a3e6e7d..0853899 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ pandas
 pandas-stubs
 lm-eval>=0.4.4
 httpx
+ragas
diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py
new file mode 100644
index 0000000..3579289
--- /dev/null
+++ b/src/instructlab/eval/ragas.py
@@ -0,0 +1,80 @@
+# Standard
+from typing import List, TypedDict
+
+# Third Party
+from langchain_community.chat_models import ChatOpenAI
+from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
+from ragas.metrics import RubricsScore
+from ragas.metrics._domain_specific_rubrics import DEFAULT_WITH_REFERENCE_RUBRICS
+
+# Local
+from .evaluator import Evaluator
+
+
+class Sample(TypedDict):
+    # question
+    user_input: str
+
+    # model answer
+    response: str
+
+    # golden answer
+    reference: str
+
+
+class RagasEvaluator(Evaluator):
+    # most basic implementation, we just assume that the user will bring the existing model responses
+    name = "ragas"
+
+    def __init__(self):
+        pass
+
+    def run(
+        self, dataset: List[Sample], run_config: RunConfig | None = None
+    ) -> EvaluationResult:
+        """
+        Evaluates the quality of model responses against a graded rubric.
+
+        Args:
+            dataset (List[Sample]):
+                List of model questions and answers
+            run_config (RunConfig | None, optional):
+                Configuration to use when running evaluations. If none is provided, then
+                a default one is created containing extremely permissive settings when handling
+                timeouts. This is because by default, OpenAI tier-1 usage accounts have very low
+                rate limits resulting in heavy throttling during evaluations.
+
+        Returns:
+            EvaluationResult: The results of all evaluations performed by Ragas
+        """
+        if not run_config:
+            # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
+            # are horrible and will result in half of our evaluation results being NaN or 0
+            run_config = RunConfig(
+                max_retries=120,
+                max_wait=7200,
+                seed=42,
+                timeout=3600,
+            )
+
+        # we will be using gpt-4o for the foreseeable future, we hardcode this
+        # for consistency of answers
+        input_ds = EvaluationDataset.from_list(dataset)
+
+        # default set of metrics
+        metrics = [
+            RubricsScore(
+                rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
+            )
+        ]
+
+        critic_lm = ChatOpenAI(model="gpt-4o")
+        results = evaluate(
+            dataset=input_ds,
+            batch_size=4,
+            run_config=run_config,
+            llm=critic_lm,
+            metrics=metrics,
+            show_progress=True,
+        )
+        return results

From 8568b139df0601b3feff563c0e85d3fecfda4c14 Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Fri, 6 Dec 2024 17:07:15 -0500
Subject: [PATCH 2/6] feat: add ability for ragas to read from a file

We want ragas to be able to read from both a file path as well as a list of samples.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 src/instructlab/eval/ragas.py | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py
index 3579289..91686d3 100644
--- a/src/instructlab/eval/ragas.py
+++ b/src/instructlab/eval/ragas.py
@@ -1,11 +1,15 @@
 # Standard
+from pathlib import Path
 from typing import List, TypedDict
 
 # Third Party
 from langchain_community.chat_models import ChatOpenAI
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
-from ragas.metrics import RubricsScore
-from ragas.metrics._domain_specific_rubrics import DEFAULT_WITH_REFERENCE_RUBRICS
+from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
+    DEFAULT_WITH_REFERENCE_RUBRICS,
+    RubricsScore,
+)
+import pandas as pd
 
 # Local
 from .evaluator import Evaluator
@@ -30,13 +34,13 @@ def __init__(self):
         pass
 
     def run(
-        self, dataset: List[Sample], run_config: RunConfig | None = None
+        self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.
 
         Args:
-            dataset (List[Sample]):
+            dataset (List[Sample] | Path):
                 List of model questions and answers
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations.
If none is provided, then @@ -47,6 +51,19 @@ def run( Returns: EvaluationResult: The results of all evaluations performed by Ragas """ + if not dataset: + raise ValueError( + "no dataset was provided, please specify the `dataset` argument" + ) + if isinstance(dataset, Path): + input_ds = EvaluationDataset.from_pandas( + pd.read_json(dataset, lines=True, orient="records") + ) + elif isinstance(dataset, list): + input_ds = EvaluationDataset.from_list(dataset) + else: + raise TypeError(f"invalid type passed for dataset: {type(dataset)}") + if not run_config: # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits # are horrible and will result in half of our evaluation results being NaN or 0 @@ -57,10 +74,6 @@ def run( timeout=3600, ) - # we will be using gpt-4o for the foreseeable future, we hardcode this - # for consistency of answers - input_ds = EvaluationDataset.from_list(dataset) - # default set of metrics metrics = [ RubricsScore( @@ -68,6 +81,8 @@ def run( ) ] + # we will be using gpt-4o for the foreseeable future, we hardcode this + # for consistency of answers critic_lm = ChatOpenAI(model="gpt-4o") results = evaluate( dataset=input_ds, From 58880c359025614ecab7cd209eeacf6968e7b465 Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Fri, 6 Dec 2024 22:20:58 -0500 Subject: [PATCH 3/6] feat: add ability for answers to be generated from user questions When a dataset is provided and is missing the `response` field, we will need to generate these responses. This commit ensures that when this case happens, we will error out when a student model is not configured. Otherwise, we will always generate these responses if the student model exists, regardless if `response` is in the dataframe or not. Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/eval/ragas.py | 166 +++++++++++++++++++++++++++++----- 1 file changed, 143 insertions(+), 23 deletions(-) diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py index 91686d3..df7943c 100644 --- a/src/instructlab/eval/ragas.py +++ b/src/instructlab/eval/ragas.py @@ -1,47 +1,114 @@ # Standard from pathlib import Path -from typing import List, TypedDict +from typing import List, Optional, TypedDict # Third Party from langchain_community.chat_models import ChatOpenAI +from pandas import DataFrame, read_json +from pydantic import BaseModel, ConfigDict, field_validator from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate +from ragas.metrics import Metric from ragas.metrics._domain_specific_rubrics import ( # the rubrics we must instantiate are located inside of a file marked as private DEFAULT_WITH_REFERENCE_RUBRICS, RubricsScore, ) -import pandas as pd # Local from .evaluator import Evaluator +from .mt_bench_common import get_openai_client class Sample(TypedDict): + """ + TypedDict of a sample that we accept when doing eval with Ragas. + We specifically use TypedDict here to be flexible with the input data we accept. + """ + # question user_input: str # model answer - response: str + response: Optional[str] # golden answer reference: str +# default system prompt we'll use when none is provided. Make it private as we don't intend this to be a public object +_DEFAULT_SYSTEM_PROMPT = """You are an advanced AI assistant designed to provide precise and accurate information. +Your primary goal is to answer queries with the most up-to-date and factual information available. 
+Focus on delivering clear, concise, and correct responses. +If you're uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can. +Your responses should prioritize accuracy over all other considerations.""" + +DEFAULT_SEED = 1337 +DEFAULT_JUDGE_MODEL = "gpt-4o" + + +class ModelConfig(BaseModel): + model_config = ConfigDict(protected_namespaces=()) + + # URL of the OpenAI server where the model shall be hosted. + base_url: str + + # name of the model to use. + model_name: str + + # The system prompt to be used when applying the chat template. + system_prompt: str = _DEFAULT_SYSTEM_PROMPT + + # We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client) + # To provide an OpenAI key, you must set it here; else the default is used. + api_key: str = "no-api-key" + + # "model randomness" aka likelihood of sampling something other than the likeliest token + temperature: float = 0.0 + + # Max amount of tokens to generate. + max_tokens: int = 768 + + # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes. + seed: int = DEFAULT_SEED + + @field_validator("temperature") + @classmethod + def check_temperature(cls, v: float) -> float: + if not 0.0 <= v <= 1.0: + raise ValueError("temperature must be between 0.0 and 1.0") + return v + + class RagasEvaluator(Evaluator): # most basic implementation, we just assume that the user will bring the existing model responses name = "ragas" - def __init__(self): - pass + def __init__( + self, + student_model: ModelConfig | None = None, + run_config: RunConfig | None = None, + ): + self.student_model = student_model + self.run_config = run_config def run( - self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None + self, + dataset: List[Sample] | Path, + student_model: ModelConfig | None = None, + run_config: RunConfig | None = None, ) -> EvaluationResult: """ Evaluates the quality of model responses against a graded rubric. + When the `dataset` lacks the `response` field, then `student_model` must be provided + in order to generate the answers. + Args: dataset (List[Sample] | Path): - List of model questions and answers + Can be either a list of `Sample` objects or a path to a jsonl file containing + records matching `Sample`. + student_model: (StudentModelConfig): + When this parameter is provided, we'll attempt to use the described model in order to + generate the responses from the given list of questions. run_config (RunConfig | None, optional): Configuration to use when running evaluations. 
If none is provided, then a default one is created containing extremely permissive settings when handling @@ -51,18 +118,36 @@ def run( Returns: EvaluationResult: The results of all evaluations performed by Ragas """ + student_model = student_model if student_model else self.student_model + run_config = run_config if run_config else self.run_config + if not dataset: raise ValueError( "no dataset was provided, please specify the `dataset` argument" ) - if isinstance(dataset, Path): - input_ds = EvaluationDataset.from_pandas( - pd.read_json(dataset, lines=True, orient="records") + + if type(dataset) not in (list, Path): + raise TypeError(f"invalid type of dataset: {type(dataset)}") + + # ensure we are in the dataframe format + input_df = None + if isinstance(dataset, list): + input_df = DataFrame(dataset) + elif isinstance(dataset, Path): + input_df = read_json(dataset, orient="records", lines=True) + + # this should never happen, but pylint is not smart enough to detect it + assert input_df is not None + + need_to_generate_questions = "response" not in input_df.columns + if need_to_generate_questions and not student_model: + raise ValueError( + "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference" ) - elif isinstance(dataset, list): - input_ds = EvaluationDataset.from_list(dataset) - else: - raise TypeError(f"invalid type passed for dataset: {type(dataset)}") + + # if the student model was provided then we always generate regardless + if student_model: + input_df = self._generate_answers_from_model(input_df, student_model) if not run_config: # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits @@ -70,22 +155,18 @@ def run( run_config = RunConfig( max_retries=120, max_wait=7200, - seed=42, + seed=DEFAULT_SEED, timeout=3600, ) - # default set of metrics - metrics = [ - RubricsScore( - rubrics=DEFAULT_WITH_REFERENCE_RUBRICS, - ) - ] + metrics = self._get_metrics() + evaluation_ds = EvaluationDataset.from_pandas(input_df) # we will be using gpt-4o for the foreseeable future, we hardcode this # for consistency of answers - critic_lm = ChatOpenAI(model="gpt-4o") + critic_lm = ChatOpenAI(model=DEFAULT_JUDGE_MODEL) results = evaluate( - dataset=input_ds, + dataset=evaluation_ds, batch_size=4, run_config=run_config, llm=critic_lm, @@ -93,3 +174,42 @@ def run( show_progress=True, ) return results + + def _generate_answers_from_model( + self, questions: DataFrame, student_model: ModelConfig + ) -> DataFrame: + """ + Given a DataFrame containing `user_input` columns, generates responses from the given model + and returns a new DataFrame containing its answers in the `response` column. 
+ """ + client = get_openai_client( + model_api_base=student_model.base_url, api_key=student_model.api_key + ) + + # initialize response to write into + updated_df = questions.copy() + updated_df["response"] = "" + + for i, qna in updated_df.iterrows(): + messages = [ + student_model.system_prompt, + qna["user_input"], + ] + response = client.chat.completions.create( + messages=messages, + model=student_model.model_name, + # specify the seed so we can at least try to have some reproducibility when the clients support it + seed=42, + max_tokens=student_model.max_tokens, + temperature=student_model.temperature, + ) + updated_df.at[i, "response"] = response.choices[0].message.content + return updated_df + + def _get_metrics(self) -> List[Metric]: + # default set of metrics + return [ + RubricsScore( + rubrics=DEFAULT_WITH_REFERENCE_RUBRICS, + ) + ] From 04117dd3487a3934fef189348c0825a3b8458b68 Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:04:37 -0500 Subject: [PATCH 4/6] chore: add unit tests for ragas evaluator Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/eval/ragas.py | 10 +-- tests/test_ragas.py | 161 ++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 5 deletions(-) create mode 100644 tests/test_ragas.py diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py index df7943c..c7ac148 100644 --- a/src/instructlab/eval/ragas.py +++ b/src/instructlab/eval/ragas.py @@ -1,3 +1,4 @@ +# # SPDX-License-Identifier: Apache-2.0 # Standard from pathlib import Path from typing import List, Optional, TypedDict @@ -53,7 +54,7 @@ class ModelConfig(BaseModel): # name of the model to use. model_name: str - + # The system prompt to be used when applying the chat template. system_prompt: str = _DEFAULT_SYSTEM_PROMPT @@ -67,7 +68,7 @@ class ModelConfig(BaseModel): # Max amount of tokens to generate. max_tokens: int = 768 - # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes. + # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes. 
seed: int = DEFAULT_SEED @field_validator("temperature") @@ -126,15 +127,14 @@ def run( "no dataset was provided, please specify the `dataset` argument" ) - if type(dataset) not in (list, Path): - raise TypeError(f"invalid type of dataset: {type(dataset)}") - # ensure we are in the dataframe format input_df = None if isinstance(dataset, list): input_df = DataFrame(dataset) elif isinstance(dataset, Path): input_df = read_json(dataset, orient="records", lines=True) + else: + raise TypeError(f"invalid type of dataset: {type(dataset)}") # this should never happen, but pylint is not smart enough to detect it assert input_df is not None diff --git a/tests/test_ragas.py b/tests/test_ragas.py new file mode 100644 index 0000000..e2667a3 --- /dev/null +++ b/tests/test_ragas.py @@ -0,0 +1,161 @@ +# # SPDX-License-Identifier: Apache-2.0 +# Standard +from pathlib import Path +from unittest.mock import MagicMock, patch +import unittest + +# Third Party +from pandas import DataFrame +from ragas.callbacks import ChainRun +from ragas.dataset_schema import EvaluationDataset, EvaluationResult +import pandas as pd + +# First Party +from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig, Sample + + +class TestRagasEvaluator(unittest.TestCase): + @patch("instructlab.eval.ragas.get_openai_client") + def test_generate_answers_from_model(self, mock_get_openai_client): + # mock the OpenAI client to always return "london" for chat completions + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.choices[0].message.content = "London" + mock_client.chat.completions.create.return_value = mock_response + mock_get_openai_client.return_value = mock_client + + # get answers + questions = pd.DataFrame({"user_input": ["What is the capital of France?"]}) + student_model = ModelConfig( + base_url="https://your.model.endpoint.com", + model_name="jeeves-512B", + api_key="test-api-key", + ) + evaluator = RagasEvaluator() + result_df = evaluator._generate_answers_from_model(questions, student_model) + + # what we expect to see + expected_df = questions.copy() + expected_df["response"] = ["London"] + + # perform the assertions + pd.testing.assert_frame_equal(result_df, expected_df) + mock_get_openai_client.assert_called_once_with( + model_api_base=student_model.base_url, api_key=student_model.api_key + ) + mock_client.chat.completions.create.assert_called_once_with( + messages=[student_model.system_prompt, "What is the capital of France?"], + model=student_model.model_name, + seed=42, + max_tokens=student_model.max_tokens, + temperature=student_model.temperature, + ) + + @patch("instructlab.eval.ragas.read_json") + @patch("instructlab.eval.ragas.evaluate") + @patch("instructlab.eval.ragas.ChatOpenAI") + @patch.object(RagasEvaluator, "_generate_answers_from_model") + @patch.object(RagasEvaluator, "_get_metrics") + def test_run( + self, + mock_get_metrics: MagicMock, + mock_generate_answers_from_model: MagicMock, + mock_ChatOpenAI: MagicMock, + mock_evaluate: MagicMock, + mock_read_json: MagicMock, + ): + ######################################################################## + # SETUP EVERYTHING WE NEED FOR THE TESTS + ######################################################################## + + # These are the variables which will control the flow of the test. + # Since we have to re-construct some Ragas components under the hood, + + student_model_response = "Paris" + user_question = "What is the capital of France?" + golden_answer = "The capital of France is Paris." 
+ base_ds = [{"user_input": user_question, "reference": golden_answer}] + mocked_metric = "mocked-metric" + mocked_metric_score = 4.0 + + # The following section takes care of mocking function return calls. + # Ragas is tricky because it has some complex data structures under the hood, + # so what we have to do is configure the intermediate outputs that we expect + # to receive from Ragas. + + mock_get_metrics.return_value = [mocked_metric] + interim_df = DataFrame( + { + "user_input": [user_question], + "response": [student_model_response], + "reference": [golden_answer], + } + ) + mock_generate_answers_from_model.return_value = interim_df.copy() + mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) + mock_ChatOpenAI.return_value = MagicMock() + + # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it. + # It isn't functionally used for our purposes though. + + _unimportant_ragas_traces = { + "default": ChainRun( + run_id="42", + parent_run_id=None, + name="root", + inputs={"system": "null", "user": "null"}, + outputs={"assistant": "null"}, + metadata={"user_id": 1337}, + ) + } + mock_evaluate.return_value = EvaluationResult( + scores=[{mocked_metric: mocked_metric_score}], + dataset=mocked_evaluation_ds, + ragas_traces=_unimportant_ragas_traces, + ) + + ######################################################################## + # Run the tests + ######################################################################## + + # Configure all other inputs that Ragas does not depend on for proper mocking + student_model = ModelConfig( + base_url="https://api.openai.com", + model_name="pt-3.5-turbo", + api_key="test-api-key", + ) + run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) + evaluator = RagasEvaluator() + + ######################################################################## + # Test case: directly passing a dataset + ######################################################################## + result = evaluator.run( + dataset=base_ds, student_model=student_model, run_config=run_config + ) + + self.assertIsInstance(result, EvaluationResult) + mock_generate_answers_from_model.assert_called_once() + mock_evaluate.assert_called_once() + mock_ChatOpenAI.assert_called_once_with(model="gpt-4o") + + ######################################################################## + # Test case: passing a dataset in via Path to JSONL file + ######################################################################## + mock_read_json.return_value = DataFrame(base_ds) + result = evaluator.run( + dataset=Path("dummy_path.jsonl"), + student_model=student_model, + run_config=run_config, + ) + + self.assertIsInstance(result, EvaluationResult) + mock_read_json.assert_called_once_with( + Path("dummy_path.jsonl"), orient="records", lines=True + ) + mock_generate_answers_from_model.assert_called() + mock_evaluate.assert_called() + + +if __name__ == "__main__": + unittest.main() From c6b5a70cbb5804f3f145782e0f84d88ea28e774b Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Fri, 13 Dec 2024 21:54:08 +0000 Subject: [PATCH 5/6] feat: update the Ragas evaluator to have the OpenAI client as something that gets passed in to __init__ Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/eval/ragas.py | 39 ++++++++------- tests/test_ragas.py | 89 +++++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 53 deletions(-) diff --git a/src/instructlab/eval/ragas.py 
b/src/instructlab/eval/ragas.py index c7ac148..9515a95 100644 --- a/src/instructlab/eval/ragas.py +++ b/src/instructlab/eval/ragas.py @@ -5,6 +5,7 @@ # Third Party from langchain_community.chat_models import ChatOpenAI +from openai import Client as OpenAIClient from pandas import DataFrame, read_json from pydantic import BaseModel, ConfigDict, field_validator from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate @@ -16,7 +17,6 @@ # Local from .evaluator import Evaluator -from .mt_bench_common import get_openai_client class Sample(TypedDict): @@ -49,19 +49,12 @@ class Sample(TypedDict): class ModelConfig(BaseModel): model_config = ConfigDict(protected_namespaces=()) - # URL of the OpenAI server where the model shall be hosted. - base_url: str - # name of the model to use. model_name: str # The system prompt to be used when applying the chat template. system_prompt: str = _DEFAULT_SYSTEM_PROMPT - # We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client) - # To provide an OpenAI key, you must set it here; else the default is used. - api_key: str = "no-api-key" - # "model randomness" aka likelihood of sampling something other than the likeliest token temperature: float = 0.0 @@ -87,15 +80,18 @@ def __init__( self, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, + openai_client: OpenAIClient | None = None, ): self.student_model = student_model self.run_config = run_config + self.openai_client = openai_client def run( self, dataset: List[Sample] | Path, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, + openai_client: OpenAIClient | None = None, ) -> EvaluationResult: """ Evaluates the quality of model responses against a graded rubric. @@ -115,12 +111,16 @@ def run( a default one is created containing extremely permissive settings when handling timeouts. This is because by default, OpenAI tier-1 usage accounts have very high rate limits resulting in heavy throttling during evaluations. + openai_client (openai.Client | None, optional): + The client to use when generating questions from the student model, must be compatible with the OpenAI API. + This field is required when `student_model` is provided. 
Returns: EvaluationResult: The results of all evaluations performed by Ragas """ student_model = student_model if student_model else self.student_model run_config = run_config if run_config else self.run_config + openai_client = openai_client if openai_client else self.openai_client if not dataset: raise ValueError( @@ -140,14 +140,20 @@ def run( assert input_df is not None need_to_generate_questions = "response" not in input_df.columns - if need_to_generate_questions and not student_model: + if need_to_generate_questions and (not student_model or not openai_client): raise ValueError( - "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference" + "provided dataset doesn't contain the model `response`, but either `student_model` or `openai_client` wasn't provided for inference" ) # if the student model was provided then we always generate regardless if student_model: - input_df = self._generate_answers_from_model(input_df, student_model) + if not openai_client: + raise ValueError( + "`student_model` was specified but `openai_client` was not provided" + ) + input_df = self._generate_answers_from_model( + input_df, student_model, openai_client + ) if not run_config: # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits @@ -176,16 +182,15 @@ def run( return results def _generate_answers_from_model( - self, questions: DataFrame, student_model: ModelConfig + self, + questions: DataFrame, + student_model: ModelConfig, + openai_client: OpenAIClient, ) -> DataFrame: """ Given a DataFrame containing `user_input` columns, generates responses from the given model and returns a new DataFrame containing its answers in the `response` column. """ - client = get_openai_client( - model_api_base=student_model.base_url, api_key=student_model.api_key - ) - # initialize response to write into updated_df = questions.copy() updated_df["response"] = "" @@ -195,7 +200,7 @@ def _generate_answers_from_model( student_model.system_prompt, qna["user_input"], ] - response = client.chat.completions.create( + response = openai_client.chat.completions.create( messages=messages, model=student_model.model_name, # specify the seed so we can at least try to have some reproducibility when the clients support it diff --git a/tests/test_ragas.py b/tests/test_ragas.py index e2667a3..ebabb2b 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -11,58 +11,55 @@ import pandas as pd # First Party -from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig, Sample +from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig class TestRagasEvaluator(unittest.TestCase): - @patch("instructlab.eval.ragas.get_openai_client") - def test_generate_answers_from_model(self, mock_get_openai_client): + def test_generate_answers_from_model(self): # mock the OpenAI client to always return "london" for chat completions + user_input = "What is the capital of France?" 
+ model_response = "London" mock_client = MagicMock() mock_response = MagicMock() - mock_response.choices[0].message.content = "London" + mock_response.choices = [MagicMock(message=MagicMock(content=model_response))] mock_client.chat.completions.create.return_value = mock_response - mock_get_openai_client.return_value = mock_client # get answers - questions = pd.DataFrame({"user_input": ["What is the capital of France?"]}) + questions = pd.DataFrame({"user_input": [user_input]}) student_model = ModelConfig( - base_url="https://your.model.endpoint.com", - model_name="jeeves-512B", - api_key="test-api-key", + model_name="super-jeeves-8x700B", ) evaluator = RagasEvaluator() - result_df = evaluator._generate_answers_from_model(questions, student_model) + result_df = evaluator._generate_answers_from_model( + questions, student_model, mock_client + ) # what we expect to see expected_df = questions.copy() - expected_df["response"] = ["London"] + expected_df["response"] = [model_response] # perform the assertions pd.testing.assert_frame_equal(result_df, expected_df) - mock_get_openai_client.assert_called_once_with( - model_api_base=student_model.base_url, api_key=student_model.api_key - ) mock_client.chat.completions.create.assert_called_once_with( - messages=[student_model.system_prompt, "What is the capital of France?"], + messages=[student_model.system_prompt, user_input], model=student_model.model_name, seed=42, max_tokens=student_model.max_tokens, temperature=student_model.temperature, ) + @patch("instructlab.eval.ragas.ChatOpenAI") @patch("instructlab.eval.ragas.read_json") @patch("instructlab.eval.ragas.evaluate") - @patch("instructlab.eval.ragas.ChatOpenAI") @patch.object(RagasEvaluator, "_generate_answers_from_model") @patch.object(RagasEvaluator, "_get_metrics") def test_run( self, mock_get_metrics: MagicMock, mock_generate_answers_from_model: MagicMock, - mock_ChatOpenAI: MagicMock, mock_evaluate: MagicMock, mock_read_json: MagicMock, + mock_ChatOpenAI: MagicMock, ): ######################################################################## # SETUP EVERYTHING WE NEED FOR THE TESTS @@ -74,16 +71,20 @@ def test_run( student_model_response = "Paris" user_question = "What is the capital of France?" golden_answer = "The capital of France is Paris." + metric = "mocked-metric" + metric_score = 4.0 base_ds = [{"user_input": user_question, "reference": golden_answer}] - mocked_metric = "mocked-metric" - mocked_metric_score = 4.0 + student_model = ModelConfig( + model_name="super-jeeves-8x700B", + ) + run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) # The following section takes care of mocking function return calls. # Ragas is tricky because it has some complex data structures under the hood, # so what we have to do is configure the intermediate outputs that we expect # to receive from Ragas. - mock_get_metrics.return_value = [mocked_metric] + mock_get_metrics.return_value = [metric] interim_df = DataFrame( { "user_input": [user_question], @@ -93,7 +94,12 @@ def test_run( ) mock_generate_answers_from_model.return_value = interim_df.copy() mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) - mock_ChatOpenAI.return_value = MagicMock() + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.choices = [ + MagicMock(message=MagicMock(content=student_model_response)) + ] + mock_client.chat.completions.create.return_value = mock_response # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it. 
# It isn't functionally used for our purposes though. @@ -109,29 +115,20 @@ def test_run( ) } mock_evaluate.return_value = EvaluationResult( - scores=[{mocked_metric: mocked_metric_score}], + scores=[{metric: metric_score}], dataset=mocked_evaluation_ds, ragas_traces=_unimportant_ragas_traces, ) - ######################################################################## - # Run the tests - ######################################################################## - - # Configure all other inputs that Ragas does not depend on for proper mocking - student_model = ModelConfig( - base_url="https://api.openai.com", - model_name="pt-3.5-turbo", - api_key="test-api-key", - ) - run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) - evaluator = RagasEvaluator() - ######################################################################## # Test case: directly passing a dataset ######################################################################## + evaluator = RagasEvaluator() result = evaluator.run( - dataset=base_ds, student_model=student_model, run_config=run_config + dataset=base_ds, + student_model=student_model, + run_config=run_config, + openai_client=mock_client, ) self.assertIsInstance(result, EvaluationResult) @@ -142,11 +139,13 @@ def test_run( ######################################################################## # Test case: passing a dataset in via Path to JSONL file ######################################################################## + evaluator = RagasEvaluator() mock_read_json.return_value = DataFrame(base_ds) result = evaluator.run( dataset=Path("dummy_path.jsonl"), student_model=student_model, run_config=run_config, + openai_client=mock_client, ) self.assertIsInstance(result, EvaluationResult) @@ -156,6 +155,24 @@ def test_run( mock_generate_answers_from_model.assert_called() mock_evaluate.assert_called() + ######################################################################## + # Test case: using the instance attributes + ######################################################################## + evaluator = RagasEvaluator( + student_model=student_model, + openai_client=mock_client, + run_config=run_config, + ) + mock_read_json.return_value = DataFrame(base_ds) + result = evaluator.run(dataset=Path("dummy_path.jsonl")) + + self.assertIsInstance(result, EvaluationResult) + mock_read_json.assert_called_with( + Path("dummy_path.jsonl"), orient="records", lines=True + ) + mock_generate_answers_from_model.assert_called() + mock_evaluate.assert_called() + if __name__ == "__main__": unittest.main() From ab3d168d434f0098af1be8ebf6f4fa8ce2e4a40a Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Tue, 7 Jan 2025 16:07:43 -0500 Subject: [PATCH 6/6] chore: decouple tests into more atomic units Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/eval/ragas.py | 112 +++++++++++----- tests/test_ragas.py | 238 ++++++++++++++++++++-------------- 2 files changed, 220 insertions(+), 130 deletions(-) diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py index 9515a95..f0445da 100644 --- a/src/instructlab/eval/ragas.py +++ b/src/instructlab/eval/ragas.py @@ -1,13 +1,14 @@ # # SPDX-License-Identifier: Apache-2.0 # Standard from pathlib import Path -from typing import List, Optional, TypedDict +from typing import TYPE_CHECKING, List, Optional, TypedDict # Third Party from langchain_community.chat_models import ChatOpenAI from openai import Client as OpenAIClient +from openai.types.chat 
import ChatCompletionMessageParam from pandas import DataFrame, read_json -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict, Field from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate from ragas.metrics import Metric from ragas.metrics._domain_specific_rubrics import ( # the rubrics we must instantiate are located inside of a file marked as private @@ -17,6 +18,9 @@ # Local from .evaluator import Evaluator +from .logger_config import setup_logger + +logger = setup_logger(__name__) class Sample(TypedDict): @@ -56,7 +60,7 @@ class ModelConfig(BaseModel): system_prompt: str = _DEFAULT_SYSTEM_PROMPT # "model randomness" aka likelihood of sampling something other than the likeliest token - temperature: float = 0.0 + temperature: float = Field(default=0.0, le=1.0, ge=0.0) # Max amount of tokens to generate. max_tokens: int = 768 @@ -64,13 +68,6 @@ class ModelConfig(BaseModel): # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes. seed: int = DEFAULT_SEED - @field_validator("temperature") - @classmethod - def check_temperature(cls, v: float) -> float: - if not 0.0 <= v <= 1.0: - raise ValueError("temperature must be between 0.0 and 1.0") - return v - class RagasEvaluator(Evaluator): # most basic implementation, we just assume that the user will bring the existing model responses @@ -80,18 +77,42 @@ def __init__( self, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, - openai_client: OpenAIClient | None = None, + student_openai_client: OpenAIClient | None = None, + judge_model_name: str = DEFAULT_JUDGE_MODEL, + judge_openai_api_key: str | None = None, ): self.student_model = student_model self.run_config = run_config - self.openai_client = openai_client + self.student_openai_client = student_openai_client + self.judge_model_name = judge_model_name + self.judge_openai_api_key = judge_openai_api_key + + @staticmethod + def _validate_dataset(df: DataFrame): + """ + Validates whether or not the given `df` is a valid dataset of `Sample` objects. + + Args: + df (DataFrame): DataFrame containing the dataset to be evaluated. + """ + # We have to hardcode these fields because the automated way of resolving the required fields from a TypedDict + # is only included by default in Python3.11+. For earlier versions, the `typing_extensions` package is required. + # See: https://docs.python.org/3/whatsnew/3.11.html#pep-655-marking-individual-typeddict-items-as-required-or-not-required + required_keys = {"user_input", "reference"} + missing_keys = required_keys - set(df.columns) + if missing_keys: + raise ValueError( + f"invalid dataset provided, missing the following keys: {', '.join(missing_keys)}" + ) def run( self, dataset: List[Sample] | Path, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, - openai_client: OpenAIClient | None = None, + student_openai_client: OpenAIClient | None = None, + judge_model_name: str | None = None, + judge_openai_api_key: str | None = None, ) -> EvaluationResult: """ Evaluates the quality of model responses against a graded rubric. @@ -111,21 +132,31 @@ def run( a default one is created containing extremely permissive settings when handling timeouts. This is because by default, OpenAI tier-1 usage accounts have very high rate limits resulting in heavy throttling during evaluations. 
- openai_client (openai.Client | None, optional): + student_openai_client (openai.Client | None, optional): The client to use when generating questions from the student model, must be compatible with the OpenAI API. This field is required when `student_model` is provided. + judge_model_name (str | None, optional): + Name of the OpenAI model to use as the judge model. Defaults to "gpt-4o" when none is specified. + judge_openai_api_key (str | None, optional): + The API key to use for evaluating the given dataset. When this isn't provided, `OPENAI_API_KEY` is read instead. + Returns: EvaluationResult: The results of all evaluations performed by Ragas """ + judge_model_name = ( + judge_model_name if judge_model_name else self.judge_model_name + ) + judge_openai_api_key = ( + judge_openai_api_key if judge_openai_api_key else self.judge_openai_api_key + ) student_model = student_model if student_model else self.student_model run_config = run_config if run_config else self.run_config - openai_client = openai_client if openai_client else self.openai_client - - if not dataset: - raise ValueError( - "no dataset was provided, please specify the `dataset` argument" - ) + student_openai_client = ( + student_openai_client + if student_openai_client + else self.student_openai_client + ) # ensure we are in the dataframe format input_df = None @@ -137,22 +168,30 @@ def run( raise TypeError(f"invalid type of dataset: {type(dataset)}") # this should never happen, but pylint is not smart enough to detect it - assert input_df is not None + if TYPE_CHECKING: + assert input_df is not None + + # ensure the dataset is in the format we expect it + self._validate_dataset(input_df) need_to_generate_questions = "response" not in input_df.columns - if need_to_generate_questions and (not student_model or not openai_client): - raise ValueError( - "provided dataset doesn't contain the model `response`, but either `student_model` or `openai_client` wasn't provided for inference" + if need_to_generate_questions: + logger.debug( + "`response` is missing in the input dataframe columns, generating questions from the model is required." 
) + if not student_model or not student_openai_client: + raise ValueError( + "provided dataset doesn't contain the model `response`, but either `student_model` or `student_openai_client` wasn't provided for inference" + ) # if the student model was provided then we always generate regardless if student_model: - if not openai_client: + if not student_openai_client: raise ValueError( - "`student_model` was specified but `openai_client` was not provided" + "`student_model` was specified but `student_openai_client` was not provided" ) input_df = self._generate_answers_from_model( - input_df, student_model, openai_client + input_df, student_model, student_openai_client ) if not run_config: @@ -170,7 +209,8 @@ def run( # we will be using gpt-4o for the foreseeable future, we hardcode this # for consistency of answers - critic_lm = ChatOpenAI(model=DEFAULT_JUDGE_MODEL) + + critic_lm = ChatOpenAI(model=judge_model_name, api_key=judge_openai_api_key) results = evaluate( dataset=evaluation_ds, batch_size=4, @@ -185,7 +225,7 @@ def _generate_answers_from_model( self, questions: DataFrame, student_model: ModelConfig, - openai_client: OpenAIClient, + student_openai_client: OpenAIClient, ) -> DataFrame: """ Given a DataFrame containing `user_input` columns, generates responses from the given model @@ -196,11 +236,14 @@ def _generate_answers_from_model( updated_df["response"] = "" for i, qna in updated_df.iterrows(): - messages = [ - student_model.system_prompt, - qna["user_input"], + messages: List[ChatCompletionMessageParam] = [ + { + "role": "system", + "content": student_model.system_prompt, + }, + {"role": "user", "content": qna["user_input"]}, ] - response = openai_client.chat.completions.create( + response = student_openai_client.chat.completions.create( messages=messages, model=student_model.model_name, # specify the seed so we can at least try to have some reproducibility when the clients support it @@ -211,7 +254,8 @@ def _generate_answers_from_model( updated_df.at[i, "response"] = response.choices[0].message.content return updated_df - def _get_metrics(self) -> List[Metric]: + @staticmethod + def _get_metrics() -> List[Metric]: # default set of metrics return [ RubricsScore( diff --git a/tests/test_ragas.py b/tests/test_ragas.py index ebabb2b..1d3bb8f 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -1,4 +1,4 @@ -# # SPDX-License-Identifier: Apache-2.0 +# SPDX-License-Identifier: Apache-2.0 # Standard from pathlib import Path from unittest.mock import MagicMock, patch @@ -8,102 +8,55 @@ from pandas import DataFrame from ragas.callbacks import ChainRun from ragas.dataset_schema import EvaluationDataset, EvaluationResult -import pandas as pd # First Party from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig class TestRagasEvaluator(unittest.TestCase): - def test_generate_answers_from_model(self): - # mock the OpenAI client to always return "london" for chat completions - user_input = "What is the capital of France?" - model_response = "London" - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.choices = [MagicMock(message=MagicMock(content=model_response))] - mock_client.chat.completions.create.return_value = mock_response - - # get answers - questions = pd.DataFrame({"user_input": [user_input]}) - student_model = ModelConfig( + def setUp(self): + # Common setup data for all tests + self.student_model_response = "Paris" + self.user_question = "What is the capital of France?" + self.golden_answer = "The capital of France is Paris." 
+ self.metric = "mocked-metric" + self.metric_score = 4.0 + self.base_ds = [ + { + "user_input": self.user_question, + "reference": self.golden_answer, + } + ] + self.student_model = ModelConfig( model_name="super-jeeves-8x700B", ) - evaluator = RagasEvaluator() - result_df = evaluator._generate_answers_from_model( - questions, student_model, mock_client - ) - - # what we expect to see - expected_df = questions.copy() - expected_df["response"] = [model_response] - - # perform the assertions - pd.testing.assert_frame_equal(result_df, expected_df) - mock_client.chat.completions.create.assert_called_once_with( - messages=[student_model.system_prompt, user_input], - model=student_model.model_name, - seed=42, - max_tokens=student_model.max_tokens, - temperature=student_model.temperature, - ) + self.run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) @patch("instructlab.eval.ragas.ChatOpenAI") - @patch("instructlab.eval.ragas.read_json") @patch("instructlab.eval.ragas.evaluate") @patch.object(RagasEvaluator, "_generate_answers_from_model") @patch.object(RagasEvaluator, "_get_metrics") - def test_run( + def test_run_with_dataset( self, mock_get_metrics: MagicMock, mock_generate_answers_from_model: MagicMock, mock_evaluate: MagicMock, - mock_read_json: MagicMock, mock_ChatOpenAI: MagicMock, ): - ######################################################################## - # SETUP EVERYTHING WE NEED FOR THE TESTS - ######################################################################## - - # These are the variables which will control the flow of the test. - # Since we have to re-construct some Ragas components under the hood, - - student_model_response = "Paris" - user_question = "What is the capital of France?" - golden_answer = "The capital of France is Paris." - metric = "mocked-metric" - metric_score = 4.0 - base_ds = [{"user_input": user_question, "reference": golden_answer}] - student_model = ModelConfig( - model_name="super-jeeves-8x700B", - ) - run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) - - # The following section takes care of mocking function return calls. - # Ragas is tricky because it has some complex data structures under the hood, - # so what we have to do is configure the intermediate outputs that we expect - # to receive from Ragas. - - mock_get_metrics.return_value = [metric] + """ + Test case 1: Directly passing a Python list/dict dataset to `RagasEvaluator.run()`. + """ + # Prepare mocks + mock_get_metrics.return_value = [self.metric] interim_df = DataFrame( { - "user_input": [user_question], - "response": [student_model_response], - "reference": [golden_answer], + "user_input": [self.user_question], + "response": [self.student_model_response], + "reference": [self.golden_answer], } ) - mock_generate_answers_from_model.return_value = interim_df.copy() + mock_generate_answers_from_model.return_value = interim_df mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.choices = [ - MagicMock(message=MagicMock(content=student_model_response)) - ] - mock_client.chat.completions.create.return_value = mock_response - - # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it. - # It isn't functionally used for our purposes though. 
- _unimportant_ragas_traces = { "default": ChainRun( run_id="42", @@ -115,39 +68,86 @@ def test_run( ) } mock_evaluate.return_value = EvaluationResult( - scores=[{metric: metric_score}], + scores=[{self.metric: self.metric_score}], dataset=mocked_evaluation_ds, ragas_traces=_unimportant_ragas_traces, ) - ######################################################################## - # Test case: directly passing a dataset - ######################################################################## + # Instantiate evaluator evaluator = RagasEvaluator() + + # Run test result = evaluator.run( - dataset=base_ds, - student_model=student_model, - run_config=run_config, - openai_client=mock_client, + dataset=self.base_ds, + student_model=self.student_model, + run_config=self.run_config, + student_openai_client=MagicMock(), # We pass a mock client ) + # Assertions self.assertIsInstance(result, EvaluationResult) mock_generate_answers_from_model.assert_called_once() mock_evaluate.assert_called_once() - mock_ChatOpenAI.assert_called_once_with(model="gpt-4o") + # we didn't provide an API key, so it expects to get `api_key=None` + mock_ChatOpenAI.assert_called_once_with(model="gpt-4o", api_key=None) - ######################################################################## - # Test case: passing a dataset in via Path to JSONL file - ######################################################################## + @patch("instructlab.eval.ragas.ChatOpenAI") + @patch("instructlab.eval.ragas.read_json") + @patch("instructlab.eval.ragas.evaluate") + @patch.object(RagasEvaluator, "_generate_answers_from_model") + @patch.object(RagasEvaluator, "_get_metrics") + def test_run_with_dataset_via_path( + self, + mock_get_metrics: MagicMock, + mock_generate_answers_from_model: MagicMock, + mock_evaluate: MagicMock, + mock_read_json: MagicMock, + mock_ChatOpenAI: MagicMock, + ): + """ + Test case 2: Passing a Path to a JSONL file (containing the dataset) to `RagasEvaluator.run()`. 
+ """ + # Prepare mocks + mock_get_metrics.return_value = [self.metric] + interim_df = DataFrame( + { + "user_input": [self.user_question], + "response": [self.student_model_response], + "reference": [self.golden_answer], + } + ) + mock_generate_answers_from_model.return_value = interim_df + mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) + _unimportant_ragas_traces = { + "default": ChainRun( + run_id="42", + parent_run_id=None, + name="root", + inputs={"system": "null", "user": "null"}, + outputs={"assistant": "null"}, + metadata={"user_id": 1337}, + ) + } + mock_evaluate.return_value = EvaluationResult( + scores=[{self.metric: self.metric_score}], + dataset=mocked_evaluation_ds, + ragas_traces=_unimportant_ragas_traces, + ) + + mock_read_json.return_value = DataFrame(self.base_ds) + + # Instantiate evaluator evaluator = RagasEvaluator() - mock_read_json.return_value = DataFrame(base_ds) + + # Run test result = evaluator.run( dataset=Path("dummy_path.jsonl"), - student_model=student_model, - run_config=run_config, - openai_client=mock_client, + student_model=self.student_model, + run_config=self.run_config, + student_openai_client=MagicMock(), ) + # Assertions self.assertIsInstance(result, EvaluationResult) mock_read_json.assert_called_once_with( Path("dummy_path.jsonl"), orient="records", lines=True @@ -155,17 +155,63 @@ def test_run( mock_generate_answers_from_model.assert_called() mock_evaluate.assert_called() - ######################################################################## - # Test case: using the instance attributes - ######################################################################## + @patch("instructlab.eval.ragas.ChatOpenAI") + @patch("instructlab.eval.ragas.read_json") + @patch("instructlab.eval.ragas.evaluate") + @patch.object(RagasEvaluator, "_generate_answers_from_model") + @patch.object(RagasEvaluator, "_get_metrics") + def test_run_with_instance_attributes( + self, + mock_get_metrics: MagicMock, + mock_generate_answers_from_model: MagicMock, + mock_evaluate: MagicMock, + mock_read_json: MagicMock, + mock_ChatOpenAI: MagicMock, + ): + """ + Test case 3: Using `RagasEvaluator` instance attributes for `student_model`, `run_config`, + and `student_openai_client` instead of passing them explicitly. 
+ """ + # Prepare mocks + mock_get_metrics.return_value = [self.metric] + interim_df = DataFrame( + { + "user_input": [self.user_question], + "response": [self.student_model_response], + "reference": [self.golden_answer], + } + ) + mock_generate_answers_from_model.return_value = interim_df + mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) + _unimportant_ragas_traces = { + "default": ChainRun( + run_id="42", + parent_run_id=None, + name="root", + inputs={"system": "null", "user": "null"}, + outputs={"assistant": "null"}, + metadata={"user_id": 1337}, + ) + } + mock_evaluate.return_value = EvaluationResult( + scores=[{self.metric: self.metric_score}], + dataset=mocked_evaluation_ds, + ragas_traces=_unimportant_ragas_traces, + ) + + mock_read_json.return_value = DataFrame(self.base_ds) + + # Instantiate evaluator with instance-level configs evaluator = RagasEvaluator( - student_model=student_model, - openai_client=mock_client, - run_config=run_config, + student_model=self.student_model, + student_openai_client=MagicMock(), + run_config=self.run_config, ) - mock_read_json.return_value = DataFrame(base_ds) + + # Run test result = evaluator.run(dataset=Path("dummy_path.jsonl")) + # Assertions self.assertIsInstance(result, EvaluationResult) mock_read_json.assert_called_with( Path("dummy_path.jsonl"), orient="records", lines=True