From a65c3aaaca1d6f72b004372aff38289737d9b40f Mon Sep 17 00:00:00 2001 From: "b.nativi" Date: Wed, 4 Sep 2024 23:00:27 +0000 Subject: [PATCH] Add bleu_smoothing_function to evaluation parameters and text generation functions --- .../backend/metrics/test_text_generation.py | 2 ++ .../unit-tests/schemas/test_evaluation.py | 11 ++++++++ .../backend/metrics/text_generation.py | 19 ++++++++++++++ api/valor_api/schemas/evaluation.py | 13 ++++++++++ client/valor/coretypes.py | 5 ++++ client/valor/schemas/evaluation.py | 3 +++ docs/metrics.md | 2 +- .../client/datatype/test_data_generation.py | 1 + .../client/metrics/test_detection.py | 6 +++++ .../test_text_generation_with_mock_client.py | 26 +++++++++++++++++++ 10 files changed, 87 insertions(+), 1 deletion(-) diff --git a/api/tests/functional-tests/backend/metrics/test_text_generation.py b/api/tests/functional-tests/backend/metrics/test_text_generation.py index 669606433..27a85336b 100644 --- a/api/tests/functional-tests/backend/metrics/test_text_generation.py +++ b/api/tests/functional-tests/backend/metrics/test_text_generation.py @@ -2126,6 +2126,7 @@ def test__calculate_bleu_scores(): output = _calculate_sentence_bleu( predictions=example["prediction"], references=example["references"], + smoothing_function=example.get("smoothing_function", None), weights=example["weights"], ) assert ( @@ -2137,5 +2138,6 @@ def test__calculate_bleu_scores(): _calculate_sentence_bleu( predictions=example["prediction"], references=example["references"], + smoothing_function=example.get("smoothing_function", None), weights=example["weights"], ) diff --git a/api/tests/unit-tests/schemas/test_evaluation.py b/api/tests/unit-tests/schemas/test_evaluation.py index a6cd16c6a..27f55b8f5 100644 --- a/api/tests/unit-tests/schemas/test_evaluation.py +++ b/api/tests/unit-tests/schemas/test_evaluation.py @@ -91,6 +91,7 @@ def test_EvaluationParameters(llm_api_params): MetricType.Toxicity, ], llm_api_params=llm_api_params, + bleu_smoothing_function="method1", bleu_weights=[0.5, 0.25, 0.25, 0], rouge_types=[ROUGEType.ROUGE1, ROUGEType.ROUGELSUM], rouge_use_stemmer=True, @@ -185,6 +186,16 @@ def test_EvaluationParameters(llm_api_params): ], ) + # BLEU smoothing function name must be a valid option. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options. + with pytest.raises(ValidationError): + schemas.EvaluationParameters( + task_type=enums.TaskType.TEXT_GENERATION, + metrics_to_return=[ + MetricType.BLEU, + ], + bleu_smoothing_function="invalid_smoothing_function_name", + ) + # BLEU weights must be 0 <= weight <= 1. with pytest.raises(ValidationError): schemas.EvaluationParameters( diff --git a/api/valor_api/backend/metrics/text_generation.py b/api/valor_api/backend/metrics/text_generation.py index 7e4b14af0..2bb5f6a0c 100644 --- a/api/valor_api/backend/metrics/text_generation.py +++ b/api/valor_api/backend/metrics/text_generation.py @@ -133,6 +133,7 @@ def _calculate_rouge_scores( def _calculate_sentence_bleu( predictions: str | list[str], references: list[str] | list[list[str]], + smoothing_function: str | None = None, weights: list[float] = [0.25, 0.25, 0.25, 0.25], ) -> list[dict[str, float]]: """ @@ -144,6 +145,8 @@ def _calculate_sentence_bleu( The predictions to score. Each prediction should be a string with tokens separated by spaces. references: list[str] | list[list[str] A list of reference for each prediction or a list of several references per prediction. Each reference should be a string with tokens separated by spaces. 
+ smoothing_function: str, optional + The method name of the smoothing function to use. Defaults to None. If None, then no smoothing will be used. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options. weights: list[float] The default BLEU calculates a score for up to 4-grams using uniform weights (this is called BLEU-4). To evaluate your translations with @@ -176,6 +179,11 @@ def _calculate_sentence_bleu( "prediction should be a str or list[str]. If prediction is a list[str], then references must be a list of lists." ) + if smoothing_function is not None: + smoothing_function = getattr( + bleu_score.SmoothingFunction(), smoothing_function + ) + output = defaultdict(float) tokenizer = RegexpTokenizer( r"\w+|\$[\d]+|[^\s\.]+" @@ -192,6 +200,7 @@ def _calculate_sentence_bleu( bleu_score.sentence_bleu( references=tokenized_references, hypothesis=tokenized_prediction, + smoothing_function=smoothing_function, weights=weights, ), # type: ignore ), @@ -436,10 +445,14 @@ def _compute_text_generation_metrics( bleu_params = metric_params.get("BLEU", {}) if not isinstance(bleu_params, dict): raise ValueError("BLEU parameters must be a dictionary.") + smoothing_function = bleu_params.get( + "smoothing_function", None + ) weights = bleu_params.get("weights", [0.25, 0.25, 0.25, 0.25]) bleu_metrics = _calculate_sentence_bleu( predictions=predictions, references=references, + smoothing_function=smoothing_function, weights=weights, ) @@ -731,6 +744,12 @@ def compute_text_generation_metrics( ) metric_params = {} + if parameters.bleu_smoothing_function is not None: + if "BLEU" not in metric_params: + metric_params["BLEU"] = {} + metric_params["BLEU"][ + "smoothing_function" + ] = parameters.bleu_smoothing_function if parameters.bleu_weights is not None: if "BLEU" not in metric_params: metric_params["BLEU"] = {} diff --git a/api/valor_api/schemas/evaluation.py b/api/valor_api/schemas/evaluation.py index 76a614904..99316a814 100644 --- a/api/valor_api/schemas/evaluation.py +++ b/api/valor_api/schemas/evaluation.py @@ -1,5 +1,6 @@ import datetime +from nltk.translate import bleu_score from pydantic import BaseModel, ConfigDict, field_validator, model_validator from valor_api.enums import ( @@ -43,6 +44,8 @@ class EvaluationParameters(BaseModel): The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5. pr_curve_max_examples: int The maximum number of datum examples to store when calculating PR curves. + bleu_smoothing_function: str, optional + The method name of the smoothing function to use when calculating BLEU scores. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options. bleu_weights: list[float], optional The weights to use when calculating BLEU scores. rouge_types: list[ROUGEType] @@ -62,6 +65,7 @@ class EvaluationParameters(BaseModel): recall_score_threshold: float | None = 0 pr_curve_iou_threshold: float = 0.5 pr_curve_max_examples: int = 1 + bleu_smoothing_function: str | None = None bleu_weights: list[float] | None = None rouge_types: list[ROUGEType] | None = None rouge_use_stemmer: bool | None = None @@ -184,6 +188,15 @@ def _validate_parameters(cls, values): "`llm_api_params` must be provided for LLM guided evaluations." ) + if values.bleu_smoothing_function is not None: + if not hasattr( + bleu_score.SmoothingFunction, + values.bleu_smoothing_function, + ): + raise ValueError( + f"BLEU smoothing function `{values.bleu_smoothing_function}` is not supported." 
+ ) + if values.bleu_weights is not None: if not all( isinstance(weight, (int, float)) and 0 <= weight diff --git a/client/valor/coretypes.py b/client/valor/coretypes.py index 676133213..5dfab92fa 100644 --- a/client/valor/coretypes.py +++ b/client/valor/coretypes.py @@ -1115,6 +1115,7 @@ def evaluate_text_generation( if api_key is not None: llm_api_params["api_key"] = api_key + bleu_smoothing_function = None bleu_weights = None rouge_types = None rouge_use_stemmer = None @@ -1127,6 +1128,9 @@ def evaluate_text_generation( ) if MetricType.BLEU in metric_params: + bleu_smoothing_function = metric_params[MetricType.BLEU].get( + "smoothing_function" + ) bleu_weights = metric_params[MetricType.BLEU].get("weights") if MetricType.ROUGE in metric_params: @@ -1148,6 +1152,7 @@ def evaluate_text_generation( task_type=TaskType.TEXT_GENERATION, metrics_to_return=metrics_to_return, llm_api_params=llm_api_params, + bleu_smoothing_function=bleu_smoothing_function, bleu_weights=bleu_weights, rouge_types=rouge_types, rouge_use_stemmer=rouge_use_stemmer, diff --git a/client/valor/schemas/evaluation.py b/client/valor/schemas/evaluation.py index 77c7a0522..f1bf52845 100644 --- a/client/valor/schemas/evaluation.py +++ b/client/valor/schemas/evaluation.py @@ -32,6 +32,8 @@ class EvaluationParameters: The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5. pr_curve_max_examples: int The maximum number of datum examples to store when calculating PR curves. + bleu_smoothing_function: str, optional + The method name of the smoothing function to use when calculating BLEU scores. See https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py for options. bleu_weights: list[float], optional The weights to use when calculating BLEU scores. rouge_types: list[ROUGEType] @@ -51,6 +53,7 @@ class EvaluationParameters: recall_score_threshold: float = 0 pr_curve_iou_threshold: float = 0.5 pr_curve_max_examples: int = 1 + bleu_smoothing_function: Optional[str] = None bleu_weights: Optional[List[float]] = None rouge_types: Optional[List[ROUGEType]] = None rouge_use_stemmer: Optional[bool] = None diff --git a/docs/metrics.md b/docs/metrics.md index 5d61e70ac..da1e2126c 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -576,4 +576,4 @@ Uses BLEU (BiLingual Evaluation Understudy) is an algorithm for evaluating automatic summarization and machine translation software in natural language processing. BLEU's output is always a number between 0 and 1, where a score near 1 indicates that the hypothesis text is very similar to one or more of the reference texts. -Behind the scenes, we use [nltk.translate.bleu_score](https://www.nltk.org/_modules/nltk/translate/bleu_score.html) to calculate these scores. The default BLEU metric calculates a score for up to 4-grams using uniform weights (i.e., `weights=[.25, .25, .25, .25]`; also called BLEU-4). Users can pass their own `bleu_weights` to EvaluationParameters in order to change this default behavior and calculate other BLEU scores. \ No newline at end of file +Behind the scenes, we use [nltk.translate.bleu_score](https://www.nltk.org/_modules/nltk/translate/bleu_score.html) to calculate these scores. The default BLEU metric calculates a score for up to 4-grams using uniform weights (i.e., `weights=[.25, .25, .25, .25]`; also called BLEU-4). Users can pass their own `bleu_weights` to EvaluationParameters in order to change this default behavior and calculate other BLEU scores. 
Users can also specify `bleu_smoothing_function` to use a different smoothing function. \ No newline at end of file diff --git a/integration_tests/client/datatype/test_data_generation.py b/integration_tests/client/datatype/test_data_generation.py index 026d1e78e..beed0eac5 100644 --- a/integration_tests/client/datatype/test_data_generation.py +++ b/integration_tests/client/datatype/test_data_generation.py @@ -419,6 +419,7 @@ def test_generate_prediction_data(client: Client): ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, diff --git a/integration_tests/client/metrics/test_detection.py b/integration_tests/client/metrics/test_detection.py index 780be8616..f6cc34fdd 100644 --- a/integration_tests/client/metrics/test_detection.py +++ b/integration_tests/client/metrics/test_detection.py @@ -168,6 +168,7 @@ def test_evaluate_detection( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -331,6 +332,7 @@ def test_evaluate_detection( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -413,6 +415,7 @@ def test_evaluate_detection( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -527,6 +530,7 @@ def test_evaluate_detection( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -755,6 +759,7 @@ def test_evaluate_detection_with_json_filters( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, @@ -2036,6 +2041,7 @@ def test_evaluate_detection_with_label_maps( ], "pr_curve_iou_threshold": 0.5, "pr_curve_max_examples": 1, + "bleu_smoothing_function": None, "bleu_weights": None, "rouge_types": None, "rouge_use_stemmer": None, diff --git a/integration_tests/client/metrics/test_text_generation_with_mock_client.py b/integration_tests/client/metrics/test_text_generation_with_mock_client.py index 4f5278ca9..d47761be5 100644 --- a/integration_tests/client/metrics/test_text_generation_with_mock_client.py +++ b/integration_tests/client/metrics/test_text_generation_with_mock_client.py @@ -249,6 +249,7 @@ def test_llm_evaluation_rag_with_mock_client( }, metric_params={ MetricType.BLEU: { + "smoothing_function": "method0", "weights": [0.25, 0.25, 0.25, 0.25], }, MetricType.ROUGE: { @@ -355,6 +356,31 @@ def test_llm_evaluation_rag_with_mock_client( expected_metrics[uid][metric_name] == m["value"] ), f"Failed for {uid} and {metric_name}" + # # Test different settings for metric params + # eval_job = model.evaluate_text_generation( + # datasets=dataset, + # metrics_to_return=metrics_to_return, + # llm_api_params={ + # "client": "mock", + # "data": { + # "model": "model", + # }, + # }, + # metric_params={ + # MetricType.BLEU: { + # "smoothing_function": "method3", + # }, + # MetricType.ROUGE: { + # "rouge_types": [ + # ROUGEType.ROUGE1, + # ROUGEType.ROUGE2, + # ROUGEType.ROUGEL, + # ], + # "use_stemmer": True, + # }, + # }, + # ) + # Must only specify text generation metrics. 
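    # A minimal, hypothetical sketch (kept commented out, like the block above) of
    # running the evaluation with only the new `smoothing_function` metric
    # parameter. It assumes the `dataset`, `model`, and `MetricType` names already
    # in scope in this test, and that "method1" is one of the method names defined
    # on nltk's SmoothingFunction.
    # eval_job = model.evaluate_text_generation(
    #     datasets=dataset,
    #     metrics_to_return=[MetricType.BLEU],
    #     llm_api_params={
    #         "client": "mock",
    #         "data": {
    #             "model": "model",
    #         },
    #     },
    #     metric_params={
    #         MetricType.BLEU: {
    #             "smoothing_function": "method1",
    #         },
    #     },
    # )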
    with pytest.raises(ValueError):
        eval_job = model.evaluate_text_generation(