From a65c3aaaca1d6f72b004372aff38289737d9b40f Mon Sep 17 00:00:00 2001 From: "b.nativi" Date: Wed, 4 Sep 2024 23:00:27 +0000 Subject: [PATCH] Add bleu_smoothing_function to evaluation parameters and text generation functions --- .../backend/metrics/test_text_generation.py | 2 ++ .../unit-tests/schemas/test_evaluation.py | 11 ++++++++ .../backend/metrics/text_generation.py | 19 ++++++++++++++ api/valor_api/schemas/evaluation.py | 13 ++++++++++ client/valor/coretypes.py | 5 ++++ client/valor/schemas/evaluation.py | 3 +++ docs/metrics.md | 2 +- .../client/datatype/test_data_generation.py | 1 + .../client/metrics/test_detection.py | 6 +++++ .../test_text_generation_with_mock_client.py | 26 +++++++++++++++++++ 10 files changed, 87 insertions(+), 1 deletion(-) diff --git a/api/tests/functional-tests/backend/metrics/test_text_generation.py b/api/tests/functional-tests/backend/metrics/test_text_generation.py index 669606433..27a85336b 100644 --- a/api/tests/functional-tests/backend/metrics/test_text_generation.py +++ b/api/tests/functional-tests/backend/metrics/test_text_generation.py @@ -2126,6 +2126,7 @@ def test__calculate_bleu_scores(): output = _calculate_sentence_bleu( predictions=example["prediction"], references=example["references"], + smoothing_function=example.get("smoothing_function", None), weights=example["weights"], ) assert ( @@ -2137,5 +2138,6 @@ def test__calculate_bleu_scores(): _calculate_sentence_bleu( predictions=example["prediction"], references=example["references"], + smoothing_function=example.get("smoothing_function", None), weights=example["weights"], ) diff --git a/api/tests/unit-tests/schemas/test_evaluation.py b/api/tests/unit-tests/schemas/test_evaluation.py index a6cd16c6a..27f55b8f5 100644 --- a/api/tests/unit-tests/schemas/test_evaluation.py +++ b/api/tests/unit-tests/schemas/test_evaluation.py @@ -91,6 +91,7 @@ def test_EvaluationParameters(llm_api_params): MetricType.Toxicity, ], llm_api_params=llm_api_params, + bleu_smoothing_function="method1", bleu_weights=[0.5, 0.25, 0.25, 0], rouge_types=[ROUGEType.ROUGE1, ROUGEType.ROUGELSUM], rouge_use_stemmer=True, @@ -185,6 +186,16 @@ def test_EvaluationParameters(llm_api_params): ], ) + # BLEU smoothing function name must be a valid option. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options. + with pytest.raises(ValidationError): + schemas.EvaluationParameters( + task_type=enums.TaskType.TEXT_GENERATION, + metrics_to_return=[ + MetricType.BLEU, + ], + bleu_smoothing_function="invalid_smoothing_function_name", + ) + # BLEU weights must be 0 <= weight <= 1. with pytest.raises(ValidationError): schemas.EvaluationParameters( diff --git a/api/valor_api/backend/metrics/text_generation.py b/api/valor_api/backend/metrics/text_generation.py index 7e4b14af0..2bb5f6a0c 100644 --- a/api/valor_api/backend/metrics/text_generation.py +++ b/api/valor_api/backend/metrics/text_generation.py @@ -133,6 +133,7 @@ def _calculate_rouge_scores( def _calculate_sentence_bleu( predictions: str | list[str], references: list[str] | list[list[str]], + smoothing_function: str | None = None, weights: list[float] = [0.25, 0.25, 0.25, 0.25], ) -> list[dict[str, float]]: """ @@ -144,6 +145,8 @@ def _calculate_sentence_bleu( The predictions to score. Each prediction should be a string with tokens separated by spaces. references: list[str] | list[list[str] A list of reference for each prediction or a list of several references per prediction. Each reference should be a string with tokens separated by spaces. 
+ smoothing_function: str, optional + The method name of the smoothing function to use. Defaults to None. If None, then no smoothing will be used. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options. weights: list[float] The default BLEU calculates a score for up to 4-grams using uniform weights (this is called BLEU-4). To evaluate your translations with @@ -176,6 +179,11 @@ def _calculate_sentence_bleu( "prediction should be a str or list[str]. If prediction is a list[str], then references must be a list of lists." ) + if smoothing_function is not None: + smoothing_function = getattr( + bleu_score.SmoothingFunction(), smoothing_function + ) + output = defaultdict(float) tokenizer = RegexpTokenizer( r"\w+|\$[\d]+|[^\s\.]+" @@ -192,6 +200,7 @@ def _calculate_sentence_bleu( bleu_score.sentence_bleu( references=tokenized_references, hypothesis=tokenized_prediction, + smoothing_function=smoothing_function, weights=weights, ), # type: ignore ), @@ -436,10 +445,14 @@ def _compute_text_generation_metrics( bleu_params = metric_params.get("BLEU", {}) if not isinstance(bleu_params, dict): raise ValueError("BLEU parameters must be a dictionary.") + smoothing_function = bleu_params.get( + "smoothing_function", None + ) weights = bleu_params.get("weights", [0.25, 0.25, 0.25, 0.25]) bleu_metrics = _calculate_sentence_bleu( predictions=predictions, references=references, + smoothing_function=smoothing_function, weights=weights, ) @@ -731,6 +744,12 @@ def compute_text_generation_metrics( ) metric_params = {} + if parameters.bleu_smoothing_function is not None: + if "BLEU" not in metric_params: + metric_params["BLEU"] = {} + metric_params["BLEU"][ + "smoothing_function" + ] = parameters.bleu_smoothing_function if parameters.bleu_weights is not None: if "BLEU" not in metric_params: metric_params["BLEU"] = {} diff --git a/api/valor_api/schemas/evaluation.py b/api/valor_api/schemas/evaluation.py index 76a614904..99316a814 100644 --- a/api/valor_api/schemas/evaluation.py +++ b/api/valor_api/schemas/evaluation.py @@ -1,5 +1,6 @@ import datetime +from nltk.translate import bleu_score from pydantic import BaseModel, ConfigDict, field_validator, model_validator from valor_api.enums import ( @@ -43,6 +44,8 @@ class EvaluationParameters(BaseModel): The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5. pr_curve_max_examples: int The maximum number of datum examples to store when calculating PR curves. + bleu_smoothing_function: str, optional + The method name of the smoothing function to use when calculating BLEU scores. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options. bleu_weights: list[float], optional The weights to use when calculating BLEU scores. rouge_types: list[ROUGEType] @@ -62,6 +65,7 @@ class EvaluationParameters(BaseModel): recall_score_threshold: float | None = 0 pr_curve_iou_threshold: float = 0.5 pr_curve_max_examples: int = 1 + bleu_smoothing_function: str | None = None bleu_weights: list[float] | None = None rouge_types: list[ROUGEType] | None = None rouge_use_stemmer: bool | None = None @@ -184,6 +188,15 @@ def _validate_parameters(cls, values): "`llm_api_params` must be provided for LLM guided evaluations." ) + if values.bleu_smoothing_function is not None: + if not hasattr( + bleu_score.SmoothingFunction, + values.bleu_smoothing_function, + ): + raise ValueError( + f"BLEU smoothing function `{values.bleu_smoothing_function}` is not supported." 
+ ) + if values.bleu_weights is not None: if not all( isinstance(weight, (int, float)) and 0 <= weight diff --git a/client/valor/coretypes.py b/client/valor/coretypes.py index 676133213..5dfab92fa 100644 --- a/client/valor/coretypes.py +++ b/client/valor/coretypes.py @@ -1115,6 +1115,7 @@ def evaluate_text_generation( if api_key is not None: llm_api_params["api_key"] = api_key + bleu_smoothing_function = None bleu_weights = None rouge_types = None rouge_use_stemmer = None @@ -1127,6 +1128,9 @@ def evaluate_text_generation( ) if MetricType.BLEU in metric_params: + bleu_smoothing_function = metric_params[MetricType.BLEU].get( + "smoothing_function" + ) bleu_weights = metric_params[MetricType.BLEU].get("weights") if MetricType.ROUGE in metric_params: @@ -1148,6 +1152,7 @@ def evaluate_text_generation( task_type=TaskType.TEXT_GENERATION, metrics_to_return=metrics_to_return, llm_api_params=llm_api_params, + bleu_smoothing_function=bleu_smoothing_function, bleu_weights=bleu_weights, rouge_types=rouge_types, rouge_use_stemmer=rouge_use_stemmer, diff --git a/client/valor/schemas/evaluation.py b/client/valor/schemas/evaluation.py index 77c7a0522..f1bf52845 100644 --- a/client/valor/schemas/evaluation.py +++ b/client/valor/schemas/evaluation.py @@ -32,6 +32,8 @@ class EvaluationParameters: The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5. pr_curve_max_examples: int The maximum number of datum examples to store when calculating PR curves. + bleu_smoothing_function: str, optional + The method name of the smoothing function to use when calculating BLEU scores. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options. bleu_weights: list[float], optional The weights to use when calculating BLEU scores. rouge_types: list[ROUGEType] @@ -51,6 +53,7 @@ class EvaluationParameters: recall_score_threshold: float = 0 pr_curve_iou_threshold: float = 0.5 pr_curve_max_examples: int = 1 + bleu_smoothing_function: Optional[str] = None bleu_weights: Optional[List[float]] = None rouge_types: Optional[List[ROUGEType]] = None rouge_use_stemmer: Optional[bool] = None diff --git a/docs/metrics.md b/docs/metrics.md index 5d61e70ac..da1e2126c 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -576,4 +576,4 @@ Uses BLEU (BiLingual Evaluation Understudy) is an algorithm for evaluating automatic summarization and machine translation software in natural language processing. BLEU's output is always a number between 0 and 1, where a score near 1 indicates that the hypothesis text is very similar to one or more of the reference texts. -Behind the scenes, we use [nltk.translate.bleu_score](https://www.nltk.org/_modules/nltk/translate/bleu_score.html) to calculate these scores. The default BLEU metric calculates a score for up to 4-grams using uniform weights (i.e., `weights=[.25, .25, .25, .25]`; also called BLEU-4). Users can pass their own `bleu_weights` to EvaluationParameters in order to change this default behavior and calculate other BLEU scores. \ No newline at end of file +Behind the scenes, we use [nltk.translate.bleu_score](https://www.nltk.org/_modules/nltk/translate/bleu_score.html) to calculate these scores. The default BLEU metric calculates a score for up to 4-grams using uniform weights (i.e., `weights=[.25, .25, .25, .25]`; also called BLEU-4). Users can pass their own `bleu_weights` to EvaluationParameters in order to change this default behavior and calculate other BLEU scores. 
Users can also specify `bleu_smoothing_function` to use a different smoothing function. \ No newline at end of file diff --git a/integration_tests/client/datatype/test_data_generation.py b/integration_tests/client/datatype/test_data_generation.py index 026d1e78e..beed0eac5 100644 --- a/integration_tests/client/datatype/test_data_generation.py +++ b/integration_tests/client/datatype/test_data_generation.py @@ -419,6 +419,7 @@ def test_generate_prediction_data(client: Client): ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, diff --git a/integration_tests/client/metrics/test_detection.py b/integration_tests/client/metrics/test_detection.py index 780be8616..f6cc34fdd 100644 --- a/integration_tests/client/metrics/test_detection.py +++ b/integration_tests/client/metrics/test_detection.py @@ -168,6 +168,7 @@ def test_evaluate_detection( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -331,6 +332,7 @@ def test_evaluate_detection( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -413,6 +415,7 @@ def test_evaluate_detection( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -527,6 +530,7 @@ def test_evaluate_detection( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -755,6 +759,7 @@ def test_evaluate_detection_with_json_filters( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -2036,6 +2041,7 @@ def test_evaluate_detection_with_label_maps( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, diff --git a/integration_tests/client/metrics/test_text_generation_with_mock_client.py b/integration_tests/client/metrics/test_text_generation_with_mock_client.py index 4f5278ca9..d47761be5 100644 --- a/integration_tests/client/metrics/test_text_generation_with_mock_client.py +++ b/integration_tests/client/metrics/test_text_generation_with_mock_client.py @@ -249,6 +249,7 @@ def test_llm_evaluation_rag_with_mock_client( }, metric_params={ MetricType.BLEU: { + "smoothing_function": "method0", "weights": [0.25, 0.25, 0.25, 0.25], }, MetricType.ROUGE: { @@ -355,6 +356,31 @@ def test_llm_evaluation_rag_with_mock_client( expected_metrics[uid][metric_name] == m["value"] ), f"Failed for {uid} and {metric_name}" + # # Test different settings for metric params + # eval_job = model.evaluate_text_generation( + # datasets=dataset, + # metrics_to_return=metrics_to_return, + # llm_api_params={ + # "client": "mock", + # "data": { + # "model": "model", + # }, + # }, + # metric_params={ + # MetricType.BLEU: { + # "smoothing_function": "method3", + # }, + # MetricType.ROUGE: { + # "rouge_types": [ + # ROUGEType.ROUGE1, + # ROUGEType.ROUGE2, + # ROUGEType.ROUGEL, + # ], + # "use_stemmer": True, + # }, + # }, + # ) + # Must only specify text generation metrics. 
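    # A minimal, hypothetical sketch (kept commented out, like the block above) of
    # running the evaluation with only the new `smoothing_function` metric
    # parameter. It assumes the `dataset`, `model`, and `MetricType` names already
    # in scope in this test, and that "method1" is one of the method names defined
    # on nltk's SmoothingFunction.
    # eval_job = model.evaluate_text_generation(
    #     datasets=dataset,
    #     metrics_to_return=[MetricType.BLEU],
    #     llm_api_params={
    #         "client": "mock",
    #         "data": {
    #             "model": "model",
    #         },
    #     },
    #     metric_params={
    #         MetricType.BLEU: {
    #             "smoothing_function": "method1",
    #         },
    #     },
    # )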
    with pytest.raises(ValueError):
        eval_job = model.evaluate_text_generation(