feat: update the Ragas evaluator to have the OpenAI client as something that gets passed in to __init__

Signed-off-by: Oleg S <[email protected]>
RobotSail committed Dec 13, 2024
1 parent e00d881 commit a157203
Showing 2 changed files with 75 additions and 53 deletions.
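
For context, a minimal usage sketch of the pattern this commit introduces: the caller now constructs the OpenAI-compatible client and hands it to `RagasEvaluator`, either at construction time or per `run()` call. This is an assumption-laden illustration, not code from the repository; the endpoint, API key, and model name are placeholders.

```python
# Hypothetical usage sketch for the new `openai_client` parameter.
# The base_url, api_key, and model name below are placeholders.
from pathlib import Path

from openai import Client as OpenAIClient

from instructlab.eval.ragas import ModelConfig, RagasEvaluator

# The caller now owns client construction (and therefore credential handling).
client = OpenAIClient(base_url="http://localhost:8000/v1", api_key="no-api-key")
student_model = ModelConfig(model_name="my-student-model")

# Option 1: bind the client when constructing the evaluator.
evaluator = RagasEvaluator(student_model=student_model, openai_client=client)

# Option 2: provide it per call instead.
# results = RagasEvaluator().run(
#     dataset=Path("dataset.jsonl"),
#     student_model=student_model,
#     openai_client=client,
# )
```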
39 changes: 22 additions & 17 deletions src/instructlab/eval/ragas.py
@@ -5,6 +5,7 @@

# Third Party
from langchain_community.chat_models import ChatOpenAI
from openai import Client as OpenAIClient
from pandas import DataFrame, read_json
from pydantic import BaseModel, ConfigDict, field_validator
from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
@@ -16,7 +17,6 @@

# Local
from .evaluator import Evaluator
from .mt_bench_common import get_openai_client


class Sample(TypedDict):
@@ -49,19 +49,12 @@ class Sample(TypedDict):
class ModelConfig(BaseModel):
model_config = ConfigDict(protected_namespaces=())

# URL of the OpenAI server where the model shall be hosted.
base_url: str

# name of the model to use.
model_name: str

# The system prompt to be used when applying the chat template.
system_prompt: str = _DEFAULT_SYSTEM_PROMPT

# We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client)
# To provide an OpenAI key, you must set it here; else the default is used.
api_key: str = "no-api-key"

# "model randomness" aka likelihood of sampling something other than the likeliest token
temperature: float = 0.0

@@ -87,15 +80,18 @@ def __init__(
self,
student_model: ModelConfig | None = None,
run_config: RunConfig | None = None,
openai_client: OpenAIClient | None = None,
):
self.student_model = student_model
self.run_config = run_config
self.openai_client = openai_client

def run(
self,
dataset: List[Sample] | Path,
student_model: ModelConfig | None = None,
run_config: RunConfig | None = None,
openai_client: OpenAIClient | None = None,
) -> EvaluationResult:
"""
Evaluates the quality of model responses against a graded rubric.
@@ -115,12 +111,16 @@ def run(
a default one is created containing extremely permissive settings when handling
timeouts. This is because by default, OpenAI tier-1 usage accounts have very high
rate limits resulting in heavy throttling during evaluations.
openai_client (openai.Client | None, optional):
The client to use when generating questions from the student model, must be compatible with the OpenAI API.
This field is required when `student_model` is provided.
Returns:
EvaluationResult: The results of all evaluations performed by Ragas
"""
student_model = student_model if student_model else self.student_model
run_config = run_config if run_config else self.run_config
openai_client = openai_client if openai_client else self.openai_client

if not dataset:
raise ValueError(
@@ -140,14 +140,20 @@ def run(
assert input_df is not None

need_to_generate_questions = "response" not in input_df.columns
if need_to_generate_questions and not student_model:
if need_to_generate_questions and (not student_model or not openai_client):
raise ValueError(
"provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference"
"provided dataset doesn't contain the model `response`, but either `student_model` or `openai_client` wasn't provided for inference"
)

# if the student model was provided then we always generate regardless
if student_model:
input_df = self._generate_answers_from_model(input_df, student_model)
if not openai_client:
raise ValueError(
"`student_model` was specified but `openai_client` was not provided"
)
input_df = self._generate_answers_from_model(
input_df, student_model, openai_client
)

if not run_config:
# we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
@@ -176,16 +182,15 @@ def run(
return results

def _generate_answers_from_model(
self, questions: DataFrame, student_model: ModelConfig
self,
questions: DataFrame,
student_model: ModelConfig,
openai_client: OpenAIClient,
) -> DataFrame:
"""
Given a DataFrame containing `user_input` columns, generates responses from the given model
and returns a new DataFrame containing its answers in the `response` column.
"""
client = get_openai_client(
model_api_base=student_model.base_url, api_key=student_model.api_key
)

# initialize response to write into
updated_df = questions.copy()
updated_df["response"] = ""
Expand All @@ -195,7 +200,7 @@ def _generate_answers_from_model(
student_model.system_prompt,
qna["user_input"],
]
response = client.chat.completions.create(
response = openai_client.chat.completions.create(
messages=messages,
model=student_model.model_name,
# specify the seed so we can at least try to have some reproducibility when the clients support it
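
The tightened validation above means that a dataset lacking a `response` column now requires both `student_model` and `openai_client` before inference can run. A rough illustration of the new failure mode, under the assumption that `run()` accepts a list of sample dicts as documented; the sample data and model name are made up.

```python
# Hypothetical illustration of the stricter validation in run();
# the sample data and model name are made up.
from instructlab.eval.ragas import ModelConfig, RagasEvaluator

dataset = [
    {
        "user_input": "What is the capital of France?",
        "reference": "The capital of France is Paris.",
    }
]

try:
    # No openai_client is supplied, so answers cannot be generated from the student model.
    RagasEvaluator().run(
        dataset=dataset,
        student_model=ModelConfig(model_name="my-student-model"),
    )
except ValueError as err:
    print(err)  # reports that `student_model` or `openai_client` was not provided
```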
89 changes: 53 additions & 36 deletions tests/test_ragas.py
@@ -11,58 +11,55 @@
import pandas as pd

# First Party
from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig, Sample
from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig


class TestRagasEvaluator(unittest.TestCase):
@patch("instructlab.eval.ragas.get_openai_client")
def test_generate_answers_from_model(self, mock_get_openai_client):
def test_generate_answers_from_model(self):
# mock the OpenAI client to always return "london" for chat completions
user_input = "What is the capital of France?"
model_response = "London"
mock_client = MagicMock()
mock_response = MagicMock()
mock_response.choices[0].message.content = "London"
mock_response.choices = [MagicMock(message=MagicMock(content=model_response))]
mock_client.chat.completions.create.return_value = mock_response
mock_get_openai_client.return_value = mock_client

# get answers
questions = pd.DataFrame({"user_input": ["What is the capital of France?"]})
questions = pd.DataFrame({"user_input": [user_input]})
student_model = ModelConfig(
base_url="https://your.model.endpoint.com",
model_name="jeeves-512B",
api_key="test-api-key",
model_name="super-jeeves-8x700B",
)
evaluator = RagasEvaluator()
result_df = evaluator._generate_answers_from_model(questions, student_model)
result_df = evaluator._generate_answers_from_model(
questions, student_model, mock_client
)

# what we expect to see
expected_df = questions.copy()
expected_df["response"] = ["London"]
expected_df["response"] = [model_response]

# perform the assertions
pd.testing.assert_frame_equal(result_df, expected_df)
mock_get_openai_client.assert_called_once_with(
model_api_base=student_model.base_url, api_key=student_model.api_key
)
mock_client.chat.completions.create.assert_called_once_with(
messages=[student_model.system_prompt, "What is the capital of France?"],
messages=[student_model.system_prompt, user_input],
model=student_model.model_name,
seed=42,
max_tokens=student_model.max_tokens,
temperature=student_model.temperature,
)

@patch("instructlab.eval.ragas.ChatOpenAI")
@patch("instructlab.eval.ragas.read_json")
@patch("instructlab.eval.ragas.evaluate")
@patch("instructlab.eval.ragas.ChatOpenAI")
@patch.object(RagasEvaluator, "_generate_answers_from_model")
@patch.object(RagasEvaluator, "_get_metrics")
def test_run(
self,
mock_get_metrics: MagicMock,
mock_generate_answers_from_model: MagicMock,
mock_ChatOpenAI: MagicMock,
mock_evaluate: MagicMock,
mock_read_json: MagicMock,
mock_ChatOpenAI: MagicMock,
):
########################################################################
# SETUP EVERYTHING WE NEED FOR THE TESTS
@@ -74,16 +71,20 @@ def test_run(
student_model_response = "Paris"
user_question = "What is the capital of France?"
golden_answer = "The capital of France is Paris."
metric = "mocked-metric"
metric_score = 4.0
base_ds = [{"user_input": user_question, "reference": golden_answer}]
mocked_metric = "mocked-metric"
mocked_metric_score = 4.0
student_model = ModelConfig(
model_name="super-jeeves-8x700B",
)
run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)

# The following section takes care of mocking function return calls.
# Ragas is tricky because it has some complex data structures under the hood,
# so what we have to do is configure the intermediate outputs that we expect
# to receive from Ragas.

mock_get_metrics.return_value = [mocked_metric]
mock_get_metrics.return_value = [metric]
interim_df = DataFrame(
{
"user_input": [user_question],
@@ -93,7 +94,12 @@ def test_run(
)
mock_generate_answers_from_model.return_value = interim_df.copy()
mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df)
mock_ChatOpenAI.return_value = MagicMock()
mock_client = MagicMock()
mock_response = MagicMock()
mock_response.choices = [
MagicMock(message=MagicMock(content=student_model_response))
]
mock_client.chat.completions.create.return_value = mock_response

# Ragas requires this value to instantiate an EvaluationResult object, so we must provide it.
# It isn't functionally used for our purposes though.
@@ -109,29 +115,20 @@ def test_run(
)
}
mock_evaluate.return_value = EvaluationResult(
scores=[{mocked_metric: mocked_metric_score}],
scores=[{metric: metric_score}],
dataset=mocked_evaluation_ds,
ragas_traces=_unimportant_ragas_traces,
)

########################################################################
# Run the tests
########################################################################

# Configure all other inputs that Ragas does not depend on for proper mocking
student_model = ModelConfig(
base_url="https://api.openai.com",
model_name="pt-3.5-turbo",
api_key="test-api-key",
)
run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)
evaluator = RagasEvaluator()

########################################################################
# Test case: directly passing a dataset
########################################################################
evaluator = RagasEvaluator()
result = evaluator.run(
dataset=base_ds, student_model=student_model, run_config=run_config
dataset=base_ds,
student_model=student_model,
run_config=run_config,
openai_client=mock_client,
)

self.assertIsInstance(result, EvaluationResult)
@@ -142,11 +139,13 @@ def test_run(
########################################################################
# Test case: passing a dataset in via Path to JSONL file
########################################################################
evaluator = RagasEvaluator()
mock_read_json.return_value = DataFrame(base_ds)
result = evaluator.run(
dataset=Path("dummy_path.jsonl"),
student_model=student_model,
run_config=run_config,
openai_client=mock_client,
)

self.assertIsInstance(result, EvaluationResult)
@@ -156,6 +155,24 @@ def test_run(
mock_generate_answers_from_model.assert_called()
mock_evaluate.assert_called()

########################################################################
# Test case: using the instance attributes
########################################################################
evaluator = RagasEvaluator(
student_model=student_model,
openai_client=mock_client,
run_config=run_config,
)
mock_read_json.return_value = DataFrame(base_ds)
result = evaluator.run(dataset=Path("dummy_path.jsonl"))

self.assertIsInstance(result, EvaluationResult)
mock_read_json.assert_called_with(
Path("dummy_path.jsonl"), orient="records", lines=True
)
mock_generate_answers_from_model.assert_called()
mock_evaluate.assert_called()


if __name__ == "__main__":
unittest.main()
