adds basic ragas eval #193

Merged (6 commits) on Jan 9, 2025
1 change: 1 addition & 0 deletions requirements.txt
@@ -10,3 +10,4 @@ pandas
pandas-stubs
lm-eval>=0.4.4
httpx
ragas
264 changes: 264 additions & 0 deletions src/instructlab/eval/ragas.py
@@ -0,0 +1,264 @@
# SPDX-License-Identifier: Apache-2.0
# Standard
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, TypedDict

# Third Party
from langchain_community.chat_models import ChatOpenAI
from openai import Client as OpenAIClient
from openai.types.chat import ChatCompletionMessageParam
from pandas import DataFrame, read_json
from pydantic import BaseModel, ConfigDict, Field
from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
from ragas.metrics import Metric
from ragas.metrics._domain_specific_rubrics import ( # the rubrics we must instantiate are located inside of a file marked as private
DEFAULT_WITH_REFERENCE_RUBRICS,
RubricsScore,
)

# Local
from .evaluator import Evaluator
from .logger_config import setup_logger

logger = setup_logger(__name__)


class Sample(TypedDict):
"""
TypedDict of a sample that we accept when doing eval with Ragas.
We specifically use TypedDict here to be flexible with the input data we accept.
"""

# question
user_input: str

# model answer
response: Optional[str]

# golden answer
reference: str


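For illustration (this snippet is not part of the diff, and the question and answers are invented), a single Sample could look like this:

example_sample: Sample = {
    "user_input": "What is the capital of France?",  # the question posed to the student model
    "response": "The capital of France is Paris.",   # the student model's answer (optional; generated if missing)
    "reference": "Paris is the capital of France.",  # the golden answer used for grading
}
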
# Default system prompt used when none is provided. Kept private since we don't intend it to be a public object.
_DEFAULT_SYSTEM_PROMPT = """You are an advanced AI assistant designed to provide precise and accurate information.
Your primary goal is to answer queries with the most up-to-date and factual information available.
Focus on delivering clear, concise, and correct responses.
If you're uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.
Your responses should prioritize accuracy over all other considerations."""

DEFAULT_SEED = 1337
DEFAULT_JUDGE_MODEL = "gpt-4o"


class ModelConfig(BaseModel):
model_config = ConfigDict(protected_namespaces=())

# name of the model to use.
model_name: str

# The system prompt to be used when applying the chat template.
system_prompt: str = _DEFAULT_SYSTEM_PROMPT

# "model randomness" aka likelihood of sampling something other than the likeliest token
temperature: float = Field(default=0.0, le=1.0, ge=0.0)

# Maximum number of tokens to generate.
max_tokens: int = 768

# Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes.
seed: int = DEFAULT_SEED


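As a minimal sketch (the model name here is hypothetical, not something the diff prescribes), a ModelConfig for a student model served behind an OpenAI-compatible endpoint could be built like this:

student_config = ModelConfig(
    model_name="my-student-model",  # hypothetical name of the served student model
    temperature=0.0,
    max_tokens=768,
    seed=DEFAULT_SEED,
)
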
class RagasEvaluator(Evaluator):
# most basic implementation: we assume that the user will bring the existing model responses
name = "ragas"

def __init__(
self,
student_model: ModelConfig | None = None,
Contributor: can any of these actually be None?

Member Author: Yes, because the user isn't required to pass them in at initialization time.

Contributor: It might streamline the code if these are required in the constructor so you don't have to do checks below. Although you might be implementing to the standard of the Evaluator class, in which case a refactor might make sense in a subsequent PR.

Member Author: I see your point but it'd be fine to leave it as-is. This is also how a lot of other libraries, including Ragas, implement similar functionality.

run_config: RunConfig | None = None,
student_openai_client: OpenAIClient | None = None,
judge_model_name: str = DEFAULT_JUDGE_MODEL,
judge_openai_api_key: str | None = None,
):
self.student_model = student_model
self.run_config = run_config
self.student_openai_client = student_openai_client
self.judge_model_name = judge_model_name
self.judge_openai_api_key = judge_openai_api_key

@staticmethod
def _validate_dataset(df: DataFrame):
"""
Validates whether or not the given `df` is a valid dataset of `Sample` objects.

Args:
df (DataFrame): DataFrame containing the dataset to be evaluated.
"""
# We have to hardcode these fields because the automated way of resolving the required fields from a TypedDict
# is only included by default in Python 3.11+. For earlier versions, the `typing_extensions` package is required.
# See: https://docs.python.org/3/whatsnew/3.11.html#pep-655-marking-individual-typeddict-items-as-required-or-not-required
required_keys = {"user_input", "reference"}
missing_keys = required_keys - set(df.columns)
if missing_keys:
raise ValueError(
f"invalid dataset provided, missing the following keys: {', '.join(missing_keys)}"
)

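For example (illustrative only, not part of the patch), a DataFrame missing the required `reference` column is rejected by this check:

incomplete_df = DataFrame([{"user_input": "What is 2 + 2?"}])
# raises ValueError: invalid dataset provided, missing the following keys: reference
RagasEvaluator._validate_dataset(incomplete_df)
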
def run(

Contributor: So for this a user is expected to bring a list of Sample objects, which hold the input, prediction, and ground truth? Are we going to provide a way to build this list of Samples from given files or lists of each category, or is this more so just for use with self-built scripts that import the Sample object and build?

Member Author: Updated it such that dataset is now either a pathlib.Path object or a list of samples, and we read what we need accordingly.

self,
dataset: List[Sample] | Path,
student_model: ModelConfig | None = None,
run_config: RunConfig | None = None,
student_openai_client: OpenAIClient | None = None,
judge_model_name: str | None = None,
judge_openai_api_key: str | None = None,
) -> EvaluationResult:
"""
Evaluates the quality of model responses against a graded rubric.

If the `dataset` lacks the `response` field, `student_model` must be provided
in order to generate the answers.

Args:
dataset (List[Sample] | Path):
Can be either a list of `Sample` objects or a path to a jsonl file containing
records matching `Sample`.
student_model (ModelConfig | None, optional):
When this parameter is provided, we'll attempt to use the described model in order to
generate the responses from the given list of questions.
run_config (RunConfig | None, optional):
Configuration to use when running evaluations. If none is provided, then
a default one is created containing extremely permissive settings when handling
timeouts. This is because, by default, OpenAI tier-1 usage accounts have very low
rate limits, resulting in heavy throttling during evaluations.
student_openai_client (openai.Client | None, optional):
The client to use when generating responses from the student model; it must be compatible with the OpenAI API.
Contributor: Is this the client for the student model or the judge model?

Member Author: This is for the student model. In this PR we are making the opinionated stance that the judge model needs to be GPT-4o for consistent results.

Contributor: That makes sense. Could consider renaming API variables so the intention is clear:
student_model -> student_model_config
openai_client -> student_openai_client

Contributor: So the user doesn't accidentally create a client to 4o-mini thinking that they literally need a client pointing to ChatGPT.

Member Author: Yes, I think that's a good suggestion now that we've decoupled the OpenAI config from being in the student model. Good catch!

This field is required when `student_model` is provided.
judge_model_name (str | None, optional):
Name of the OpenAI model to use as the judge model. Defaults to "gpt-4o" when none is specified.
judge_openai_api_key (str | None, optional):
The API key to use for evaluating the given dataset. When this isn't provided, the `OPENAI_API_KEY` environment variable is read instead.


Returns:
EvaluationResult: The results of all evaluations performed by Ragas
"""
judge_model_name = (
judge_model_name if judge_model_name else self.judge_model_name
)
judge_openai_api_key = (
judge_openai_api_key if judge_openai_api_key else self.judge_openai_api_key
)
student_model = student_model if student_model else self.student_model
run_config = run_config if run_config else self.run_config
student_openai_client = (
student_openai_client
if student_openai_client
else self.student_openai_client
)

# ensure we are in the dataframe format
input_df = None
if isinstance(dataset, list):
input_df = DataFrame(dataset)
elif isinstance(dataset, Path):
input_df = read_json(dataset, orient="records", lines=True)
Contributor: I think there's an implicit requirement here that the dataset referred to by the path is well-formed (shaped like list[Sample]). Could consider doing a quick check to make sure the required columns are present in the df and failing here if they aren't.

Member Author: Sure, I don't see a reason not to.

else:
raise TypeError(f"invalid type of dataset: {type(dataset)}")

# this should never happen, but pylint is not smart enough to detect it
if TYPE_CHECKING:
assert input_df is not None

# ensure the dataset is in the format we expect
self._validate_dataset(input_df)

need_to_generate_responses = "response" not in input_df.columns
if need_to_generate_responses:
logger.debug(
"`response` is missing in the input dataframe columns, generating questions from the model is required."
)
if not student_model or not student_openai_client:
raise ValueError(
"provided dataset doesn't contain the model `response`, but either `student_model` or `student_openai_client` wasn't provided for inference"
)

# if the student model was provided then we always generate regardless
if student_model:
if not student_openai_client:
raise ValueError(
"`student_model` was specified but `student_openai_client` was not provided"
)
input_df = self._generate_answers_from_model(
input_df, student_model, student_openai_client
)

if not run_config:
# we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
# are horrible and will result in half of our evaluation results being NaN or 0
run_config = RunConfig(
max_retries=120,
max_wait=7200,
seed=DEFAULT_SEED,
timeout=3600,
)

metrics = self._get_metrics()
evaluation_ds = EvaluationDataset.from_pandas(input_df)

# we default to gpt-4o as the judge model for the foreseeable future,
# for consistency of answers

critic_lm = ChatOpenAI(model=judge_model_name, api_key=judge_openai_api_key)
results = evaluate(
dataset=evaluation_ds,
batch_size=4,
run_config=run_config,
llm=critic_lm,
metrics=metrics,
show_progress=True,
)
return results

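Tying the pieces together, here is a hedged usage sketch for the case where the dataset has no `response` column; the endpoint, API key, and model name are placeholders rather than values the diff mandates:

student_client = OpenAIClient(
    base_url="http://localhost:8000/v1",  # placeholder: any OpenAI-compatible server
    api_key="placeholder-key",            # placeholder credential for that server
)
result = RagasEvaluator().run(
    dataset=[{"user_input": "What is the capital of France?", "reference": "Paris"}],
    student_model=ModelConfig(model_name="my-student-model"),  # hypothetical model name
    student_openai_client=student_client,
    judge_openai_api_key="sk-placeholder",  # falls back to OPENAI_API_KEY when omitted
)
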
def _generate_answers_from_model(
self,
questions: DataFrame,
student_model: ModelConfig,
student_openai_client: OpenAIClient,
) -> DataFrame:
"""
Given a DataFrame containing a `user_input` column, generates responses from the given model
and returns a new DataFrame containing its answers in the `response` column.
"""
# initialize response to write into
updated_df = questions.copy()
updated_df["response"] = ""

for i, qna in updated_df.iterrows():
messages: List[ChatCompletionMessageParam] = [
{
"role": "system",
"content": student_model.system_prompt,
},
{"role": "user", "content": qna["user_input"]},
]
response = student_openai_client.chat.completions.create(
messages=messages,
model=student_model.model_name,
# specify the seed so we can at least try to have some reproducibility when the clients support it
seed=student_model.seed,
max_tokens=student_model.max_tokens,
temperature=student_model.temperature,
)
updated_df.at[i, "response"] = response.choices[0].message.content
return updated_df

@staticmethod
def _get_metrics() -> List[Metric]:
# default set of metrics
return [
RubricsScore(
rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
)
]
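When the responses are already stored in a jsonl file (one record per line with `user_input`, `response`, and `reference`), the evaluator can be pointed at the file directly, and a custom RunConfig can replace the permissive default; the file name and limits below are illustrative:

result = RagasEvaluator().run(
    dataset=Path("eval_samples.jsonl"),  # illustrative path; records must match Sample
    run_config=RunConfig(max_retries=10, max_wait=600, seed=DEFAULT_SEED, timeout=900),
)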