Add BLEU Smoothing Function #742

Draft · wants to merge 4 commits into main
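This PR adds an optional `bleu_smoothing_function` parameter to `EvaluationParameters` and threads it through the API schema, the backend BLEU computation, and the Python client. A minimal sketch of the schema-level usage, mirroring the unit test in this diff; the import locations are assumptions:

```python
# Sketch only: import locations assumed from the test file in this diff.
from pydantic import ValidationError
from valor_api import enums, schemas
from valor_api.enums import MetricType

# Valid: "method1" names one of nltk.translate.bleu_score.SmoothingFunction's methods.
schemas.EvaluationParameters(
    task_type=enums.TaskType.TEXT_GENERATION,
    metrics_to_return=[MetricType.BLEU],
    bleu_smoothing_function="method1",
    bleu_weights=[0.5, 0.25, 0.25, 0],
)

# Invalid: an unrecognized method name is rejected at validation time.
try:
    schemas.EvaluationParameters(
        task_type=enums.TaskType.TEXT_GENERATION,
        metrics_to_return=[MetricType.BLEU],
        bleu_smoothing_function="invalid_smoothing_function_name",
    )
except ValidationError as e:
    print(e)
```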
@@ -2126,6 +2126,7 @@ def test__calculate_bleu_scores():
output = _calculate_sentence_bleu(
predictions=example["prediction"],
references=example["references"],
smoothing_function=example.get("smoothing_function", None),
weights=example["weights"],
)
assert (
@@ -2137,5 +2138,6 @@ def test__calculate_bleu_scores():
_calculate_sentence_bleu(
predictions=example["prediction"],
references=example["references"],
smoothing_function=example.get("smoothing_function", None),
weights=example["weights"],
)
11 changes: 11 additions & 0 deletions api/tests/unit-tests/schemas/test_evaluation.py
@@ -91,6 +91,7 @@ def test_EvaluationParameters(llm_api_params):
MetricType.Toxicity,
],
llm_api_params=llm_api_params,
bleu_smoothing_function="method1",
bleu_weights=[0.5, 0.25, 0.25, 0],
rouge_types=[ROUGEType.ROUGE1, ROUGEType.ROUGELSUM],
rouge_use_stemmer=True,
@@ -185,6 +186,16 @@ def test_EvaluationParameters(llm_api_params):
],
)

# BLEU smoothing function name must be a valid option. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options.
with pytest.raises(ValidationError):
schemas.EvaluationParameters(
task_type=enums.TaskType.TEXT_GENERATION,
metrics_to_return=[
MetricType.BLEU,
],
bleu_smoothing_function="invalid_smoothing_function_name",
)

# BLEU weights must be 0 <= weight <= 1.
with pytest.raises(ValidationError):
schemas.EvaluationParameters(
19 changes: 19 additions & 0 deletions api/valor_api/backend/metrics/text_generation.py
@@ -133,6 +133,7 @@ def _calculate_rouge_scores(
def _calculate_sentence_bleu(
predictions: str | list[str],
references: list[str] | list[list[str]],
smoothing_function: str | None = None,
weights: list[float] = [0.25, 0.25, 0.25, 0.25],
) -> list[dict[str, float]]:
"""
@@ -144,6 +145,8 @@ def _calculate_sentence_bleu(
The predictions to score. Each prediction should be a string with tokens separated by spaces.
references: list[str] | list[list[str]]
A list containing one reference per prediction, or a list of several references per prediction. Each reference should be a string with tokens separated by spaces.
smoothing_function: str, optional
The method name of the smoothing function to use. Defaults to None. If None, then no smoothing will be used. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options.
weights: list[float]
The default BLEU calculates a score for up to 4-grams using uniform
weights (this is called BLEU-4). To evaluate your translations with
@@ -176,6 +179,11 @@ def _calculate_sentence_bleu(
"prediction should be a str or list[str]. If prediction is a list[str], then references must be a list of lists."
)

if smoothing_function is not None:
smoothing_function = getattr(
bleu_score.SmoothingFunction(), smoothing_function
)

output = defaultdict(float)
tokenizer = RegexpTokenizer(
r"\w+|\$[\d]+|[^\s\.]+"
@@ -192,6 +200,7 @@ def _calculate_sentence_bleu(
bleu_score.sentence_bleu(
references=tokenized_references,
hypothesis=tokenized_prediction,
smoothing_function=smoothing_function,
weights=weights,
), # type: ignore
),
@@ -436,10 +445,14 @@ def _compute_text_generation_metrics(
bleu_params = metric_params.get("BLEU", {})
if not isinstance(bleu_params, dict):
raise ValueError("BLEU parameters must be a dictionary.")
smoothing_function = bleu_params.get(
"smoothing_function", None
)
weights = bleu_params.get("weights", [0.25, 0.25, 0.25, 0.25])
bleu_metrics = _calculate_sentence_bleu(
predictions=predictions,
references=references,
smoothing_function=smoothing_function,
weights=weights,
)

@@ -731,6 +744,12 @@ def compute_text_generation_metrics(
)

metric_params = {}
if parameters.bleu_smoothing_function is not None:
if "BLEU" not in metric_params:
metric_params["BLEU"] = {}
metric_params["BLEU"][
"smoothing_function"
] = parameters.bleu_smoothing_function
if parameters.bleu_weights is not None:
if "BLEU" not in metric_params:
metric_params["BLEU"] = {}
13 changes: 13 additions & 0 deletions api/valor_api/schemas/evaluation.py
@@ -1,5 +1,6 @@
import datetime

from nltk.translate import bleu_score
from pydantic import BaseModel, ConfigDict, field_validator, model_validator

from valor_api.enums import (
@@ -43,6 +44,8 @@ class EvaluationParameters(BaseModel):
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5.
pr_curve_max_examples: int
The maximum number of datum examples to store when calculating PR curves.
bleu_smoothing_function: str, optional
The method name of the smoothing function to use when calculating BLEU scores. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options.
bleu_weights: list[float], optional
The weights to use when calculating BLEU scores.
rouge_types: list[ROUGEType]
@@ -62,6 +65,7 @@ class EvaluationParameters(BaseModel):
recall_score_threshold: float | None = 0
pr_curve_iou_threshold: float = 0.5
pr_curve_max_examples: int = 1
bleu_smoothing_function: str | None = None
bleu_weights: list[float] | None = None
rouge_types: list[ROUGEType] | None = None
rouge_use_stemmer: bool | None = None
@@ -184,6 +188,15 @@ def _validate_parameters(cls, values):
"`llm_api_params` must be provided for LLM guided evaluations."
)

if values.bleu_smoothing_function is not None:
if not hasattr(
bleu_score.SmoothingFunction,
values.bleu_smoothing_function,
):
raise ValueError(
f"BLEU smoothing function `{values.bleu_smoothing_function}` is not supported."
)

if values.bleu_weights is not None:
if not all(
isinstance(weight, (int, float)) and 0 <= weight
5 changes: 5 additions & 0 deletions client/valor/coretypes.py
@@ -1115,6 +1115,7 @@ def evaluate_text_generation(
if api_key is not None:
llm_api_params["api_key"] = api_key

bleu_smoothing_function = None
bleu_weights = None
rouge_types = None
rouge_use_stemmer = None
@@ -1127,6 +1128,9 @@
)

if MetricType.BLEU in metric_params:
bleu_smoothing_function = metric_params[MetricType.BLEU].get(
"smoothing_function"
)
bleu_weights = metric_params[MetricType.BLEU].get("weights")

if MetricType.ROUGE in metric_params:
@@ -1148,6 +1152,7 @@
task_type=TaskType.TEXT_GENERATION,
metrics_to_return=metrics_to_return,
llm_api_params=llm_api_params,
bleu_smoothing_function=bleu_smoothing_function,
bleu_weights=bleu_weights,
rouge_types=rouge_types,
rouge_use_stemmer=rouge_use_stemmer,
3 changes: 3 additions & 0 deletions client/valor/schemas/evaluation.py
@@ -32,6 +32,8 @@ class EvaluationParameters:
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5.
pr_curve_max_examples: int
The maximum number of datum examples to store when calculating PR curves.
bleu_smoothing_function: str, optional
The method name of the smoothing function to use when calculating BLEU scores. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options.
bleu_weights: list[float], optional
The weights to use when calculating BLEU scores.
rouge_types: list[ROUGEType]
@@ -51,6 +53,7 @@ class EvaluationParameters:
recall_score_threshold: float = 0
pr_curve_iou_threshold: float = 0.5
pr_curve_max_examples: int = 1
bleu_smoothing_function: Optional[str] = None
bleu_weights: Optional[List[float]] = None
rouge_types: Optional[List[ROUGEType]] = None
rouge_use_stemmer: Optional[bool] = None
2 changes: 1 addition & 1 deletion docs/metrics.md
@@ -576,4 +576,4 @@ Uses

BLEU (BiLingual Evaluation Understudy) is an algorithm for evaluating automatic summarization and machine translation software in natural language processing. BLEU's output is always a number between 0 and 1, where a score near 1 indicates that the hypothesis text is very similar to one or more of the reference texts.

Behind the scenes, we use [nltk.translate.bleu_score](https://www.nltk.org/_modules/nltk/translate/bleu_score.html) to calculate these scores. The default BLEU metric calculates a score for up to 4-grams using uniform weights (i.e., `weights=[.25, .25, .25, .25]`; also called BLEU-4). Users can pass their own `bleu_weights` to EvaluationParameters in order to change this default behavior and calculate other BLEU scores.
Behind the scenes, we use [nltk.translate.bleu_score](https://www.nltk.org/_modules/nltk/translate/bleu_score.html) to calculate these scores. The default BLEU metric calculates a score for up to 4-grams using uniform weights (i.e., `weights=[.25, .25, .25, .25]`; also called BLEU-4). Users can pass their own `bleu_weights` to EvaluationParameters in order to change this default behavior and calculate other BLEU scores. Users can also set `bleu_smoothing_function` to the name of one of NLTK's `SmoothingFunction` methods (e.g., `"method1"`); by default, no smoothing is applied.
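
For reference, a minimal sketch of roughly what the backend does when a smoothing function is configured, using NLTK directly; the prediction/reference strings and the choice of `method1` are illustrative only:

```python
from nltk.tokenize import RegexpTokenizer
from nltk.translate import bleu_score

# Illustrative prediction/reference pair (not from any real dataset).
prediction = "the cat sat on the mat"
references = ["a cat sat on the mat", "the cat was sitting on the mat"]

# Tokenize with the same pattern the backend uses.
tokenizer = RegexpTokenizer(r"\w+|\$[\d]+|[^\s\.]+")
tokenized_prediction = tokenizer.tokenize(prediction)
tokenized_references = [tokenizer.tokenize(ref) for ref in references]

# Resolve the configured method name to the bound SmoothingFunction method.
smoothing = getattr(bleu_score.SmoothingFunction(), "method1")

score = bleu_score.sentence_bleu(
    references=tokenized_references,
    hypothesis=tokenized_prediction,
    smoothing_function=smoothing,
    weights=[0.25, 0.25, 0.25, 0.25],
)
print(score)
```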
@@ -419,6 +419,7 @@ def test_generate_prediction_data(client: Client):
],
"pr_curve_iou_threshold": 0.5,
"pr_curve_max_examples": 1,
"bleu_smoothing_function": None,
"bleu_weights": None,
"rouge_types": None,
"rouge_use_stemmer": None,
6 changes: 6 additions & 0 deletions integration_tests/client/metrics/test_detection.py
@@ -168,6 +168,7 @@ def test_evaluate_detection(
],
"pr_curve_iou_threshold": 0.5,
"pr_curve_max_examples": 1,
"bleu_smoothing_function": None,
"bleu_weights": None,
"rouge_types": None,
"rouge_use_stemmer": None,
@@ -331,6 +332,7 @@ def test_evaluate_detection(
],
"pr_curve_iou_threshold": 0.5,
"pr_curve_max_examples": 1,
"bleu_smoothing_function": None,
"bleu_weights": None,
"rouge_types": None,
"rouge_use_stemmer": None,
@@ -413,6 +415,7 @@ def test_evaluate_detection(
],
"pr_curve_iou_threshold": 0.5,
"pr_curve_max_examples": 1,
"bleu_smoothing_function": None,
"bleu_weights": None,
"rouge_types": None,
"rouge_use_stemmer": None,
@@ -527,6 +530,7 @@ def test_evaluate_detection(
],
"pr_curve_iou_threshold": 0.5,
"pr_curve_max_examples": 1,
"bleu_smoothing_function": None,
"bleu_weights": None,
"rouge_types": None,
"rouge_use_stemmer": None,
@@ -755,6 +759,7 @@ def test_evaluate_detection_with_json_filters(
],
"pr_curve_iou_threshold": 0.5,
"pr_curve_max_examples": 1,
"bleu_smoothing_function": None,
"bleu_weights": None,
"rouge_types": None,
"rouge_use_stemmer": None,
@@ -2036,6 +2041,7 @@ def test_evaluate_detection_with_label_maps(
],
"pr_curve_iou_threshold": 0.5,
"pr_curve_max_examples": 1,
"bleu_smoothing_function": None,
"bleu_weights": None,
"rouge_types": None,
"rouge_use_stemmer": None,
@@ -249,6 +249,7 @@ def test_llm_evaluation_rag_with_mock_client(
},
metric_params={
MetricType.BLEU: {
"smoothing_function": "method0",
"weights": [0.25, 0.25, 0.25, 0.25],
},
MetricType.ROUGE: {
@@ -355,6 +356,31 @@ def test_llm_evaluation_rag_with_mock_client(
expected_metrics[uid][metric_name] == m["value"]
), f"Failed for {uid} and {metric_name}"

# # Test different settings for metric params
# eval_job = model.evaluate_text_generation(
# datasets=dataset,
# metrics_to_return=metrics_to_return,
# llm_api_params={
# "client": "mock",
# "data": {
# "model": "model",
# },
# },
# metric_params={
# MetricType.BLEU: {
# "smoothing_function": "method3",
# },
# MetricType.ROUGE: {
# "rouge_types": [
# ROUGEType.ROUGE1,
# ROUGEType.ROUGE2,
# ROUGEType.ROUGEL,
# ],
# "use_stemmer": True,
# },
# },
# )

# Must only specify text generation metrics.
with pytest.raises(ValueError):
eval_job = model.evaluate_text_generation(