add neg class score for binary precision, recall, f1 and max f1 (#788)
* add perplexity with Mistral_7B_Instruct_v2

Signed-off-by: lilacheden <[email protected]>

* add perplexity test

Signed-off-by: lilacheden <[email protected]>

* avoid test memory issues

Signed-off-by: lilacheden <[email protected]>

* round traversed thresholds for max f1 to 3

Signed-off-by: lilacheden <[email protected]>

* processor for extracting model prediction enclosed in double brackets

Signed-off-by: Ariel Gera <[email protected]>

* fix syntax error

Signed-off-by: lilacheden <[email protected]>

* fix code formatting

Signed-off-by: lilacheden <[email protected]>

* add modified metrics to catalog

Signed-off-by: lilacheden <[email protected]>

* change template name

Signed-off-by: lilacheden <[email protected]>

* remove perplexity with mistral

Signed-off-by: lilacheden <[email protected]>

* add neg class score for binary precision, recall, f1 and max f1

Signed-off-by: lilacheden <[email protected]>

* allow spearman metric to receive str inputs

Signed-off-by: lilacheden <[email protected]>

* support running AbstractLM on local Apple GPU

Signed-off-by: lilacheden <[email protected]>

* Support Unions in metric prediction_type

Signed-off-by: lilacheden <[email protected]>

* add processor cast_to_float_return_nan_if_failed

Signed-off-by: lilacheden <[email protected]>

* Make prediction_type of metrics numeric (float or int)

Modified Metrics: "metrics.kendalltau_b", "metrics.roc_auc", "metrics.f1_binary", "metrics.accuracy_binary",
"metrics.precision_binary", "metrics.recall_binary", "metrics.max_f1_binary", "metrics.max_accuracy_binary"

Signed-off-by: lilacheden <[email protected]>

* Adjust coedit task and template to metrics change

Signed-off-by: lilacheden <[email protected]>

* Adjust test_task_metrics_type_checking to metrics change

Signed-off-by: lilacheden <[email protected]>

* Adjust test_artifact_loading_with_overwrite_args_list to metrics change

Signed-off-by: lilacheden <[email protected]>

* revert support running AbstractLM on local Apple GPU

Signed-off-by: lilacheden <[email protected]>

---------

Signed-off-by: lilacheden <[email protected]>
Signed-off-by: Ariel Gera <[email protected]>
Co-authored-by: Ariel Gera <[email protected]>
Co-authored-by: Elron Bandel <[email protected]>
3 people authored May 5, 2024
1 parent 6cade91 commit 4775531
Showing 13 changed files with 186 additions and 84 deletions.
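For orientation, here is a minimal standalone sketch (not part of the diff) of what the new negative-class scores correspond to. It uses scikit-learn's per-class f1_score directly, whereas the unitxt metrics below go through the Hugging Face evaluate wrapper; all sample values are illustrative.

# Illustrative only: the "*_neg" scores added for f1/precision/recall are the
# per-class scores for label 0 (the negative class).
from sklearn.metrics import f1_score

references = [1, 0, 1, 1, 0]             # binary gold labels (0 = negative class)
predictions = [0.9, 0.4, 0.2, 0.8, 0.6]  # float scores; 0.5 is the default threshold
hard_preds = [int(p > 0.5) for p in predictions]

# average=None returns one score per class: index 0 -> negative, index 1 -> positive
per_class = f1_score(references, hard_preds, labels=[0, 1], average=None)
print({"f1_binary": per_class[1], "f1_binary_neg": per_class[0]})

precision_score and recall_score expose the same per-class behaviour via average=None, which is presumably how the corresponding precision/recall "_neg" scores are obtained.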
4 changes: 2 additions & 2 deletions prepare/metrics/kendalltau.py
@@ -5,8 +5,8 @@

metric = KendallTauMetric()

predictions = ["1.0", "2.0", "1.0"]
references = [["-1.0"], ["1.0"], ["0.0"]]
predictions = [1.0, 2.0, 1.0]
references = [[-1.0], [1.0], [0.0]]

instance_targets = [
{
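Since the metric now takes floats directly, the same computation can be checked standalone with scipy (the catalog metric wraps this scipy call; the values mirror the sample above):

# Kendall's tau-b on the numeric sample data from kendalltau.py.
from scipy.stats import kendalltau

predictions = [1.0, 2.0, 1.0]
references = [-1.0, 1.0, 0.0]  # single references, flattened

result = kendalltau(references, predictions, variant="b")
print(result.correlation, result.pvalue)  # .correlation matches the attribute used in the diff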
4 changes: 2 additions & 2 deletions prepare/metrics/roc_auc.py
@@ -5,8 +5,8 @@

metric = RocAuc()

predictions = ["0.2", "0.8", "1.0"]
references = [["1.0"], ["0.0"], ["1.0"]]
predictions = [0.2, 0.8, 1.0]
references = [[1.0], [0.0], [1.0]]

instance_targets = [{"roc_auc": np.nan, "score": np.nan, "score_name": "roc_auc"}] * 3
global_targets = {
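The same pattern applies here; a standalone sketch with scikit-learn (not necessarily the exact unitxt code path, but the same underlying computation on the now-numeric inputs):

# ROC AUC on the numeric sample data from roc_auc.py.
from sklearn.metrics import auc, roc_curve

predictions = [0.2, 0.8, 1.0]
references = [1.0, 0.0, 1.0]  # single references, flattened

fpr, tpr, _ = roc_curve(y_true=references, y_score=predictions)
print(auc(fpr, tpr))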
17 changes: 17 additions & 0 deletions prepare/processors/processors.py
@@ -1,3 +1,4 @@
import numpy as np
from unitxt import add_to_catalog
from unitxt.logging_utils import get_logger
from unitxt.operator import SequentialOperator
@@ -296,6 +297,22 @@
overwrite=True,
)

add_to_catalog(
SequentialOperator(
steps=[
CastFields(
fields={"prediction": "float"},
failure_defaults={"prediction": np.nan},
),
CastFields(
fields={"references": "float"},
process_every_value=True,
),
]
),
"processors.cast_to_float_return_nan_if_failed",
overwrite=True,
)

add_to_catalog(
SequentialOperator(
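The new processors.cast_to_float_return_nan_if_failed operator is what lets string model outputs reach the now-numeric binary metrics. A pure-Python stand-in for its casting behaviour (the real operator is the CastFields-based SequentialOperator registered above, which also casts the references to float; the helper name here is made up for illustration):

import math

def cast_to_float_or_nan(value):
    """Cast a raw prediction to float, falling back to NaN on failure."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float("nan")

print(cast_to_float_or_nan("0.75"))               # 0.75
print(math.isnan(cast_to_float_or_nan("maybe")))  # True -> the metric receives NaN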
2 changes: 1 addition & 1 deletion prepare/tasks/classification.py
@@ -21,7 +21,7 @@
FormTask(
inputs={"text": "str", "text_type": "str", "class": "str"},
outputs={"class": "str", "label": "int"},
prediction_type="str",
prediction_type="float",
metrics=[
"metrics.accuracy",
"metrics.f1_binary",
@@ -11,6 +11,7 @@
"processors.take_first_word",
"processors.lower_case",
"processors.yes_no_to_int",
"processors.cast_to_float_return_nan_if_failed",
],
),
"templates.grammatical_error_detection.yes_no",
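To see why the extra postprocessor is needed at the end of this template's chain, here is an illustrative walk-through with plain-Python stand-ins for the catalog processors (the stand-ins approximate, rather than reproduce, the unitxt operators):

def take_first_word(text: str) -> str:
    words = text.split()
    return words[0] if words else ""

def lower_case(text: str) -> str:
    return text.lower()

def yes_no_to_int(text: str) -> str:
    # maps yes/no to the strings "1"/"0"; anything else passes through
    return {"yes": "1", "no": "0"}.get(text, text)

def cast_to_float_or_nan(text: str) -> float:
    try:
        return float(text)
    except ValueError:
        return float("nan")

raw = "Yes it does"
out = cast_to_float_or_nan(yes_no_to_int(lower_case(take_first_word(raw))))
print(out)  # 1.0 -- a float, matching the task's new prediction_type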
@@ -0,0 +1,21 @@
{
"type": "sequential_operator",
"steps": [
{
"type": "cast_fields",
"fields": {
"prediction": "float"
},
"failure_defaults": {
"prediction": NaN
}
},
{
"type": "cast_fields",
"fields": {
"references": "float"
},
"process_every_value": true
}
]
}
@@ -9,7 +9,7 @@
"class": "str",
"label": "int"
},
"prediction_type": "str",
"prediction_type": "float",
"metrics": [
"metrics.accuracy",
"metrics.f1_binary"
@@ -7,6 +7,7 @@
"postprocessors": [
"processors.take_first_word",
"processors.lower_case",
"processors.yes_no_to_int"
"processors.yes_no_to_int",
"processors.cast_to_float_return_nan_if_failed"
]
}
119 changes: 72 additions & 47 deletions src/unitxt/metrics.py
@@ -29,7 +29,7 @@
from .random_utils import get_seed
from .settings_utils import get_settings
from .stream import MultiStream, Stream
from .type_utils import isoftype, parse_type_string, to_float_or_default
from .type_utils import isoftype, parse_type_string

logger = get_logger()
settings = get_settings()
@@ -1261,30 +1261,50 @@ class F1Micro(F1):
average = "micro"


class F1Binary(F1):
class F1Binary(GlobalMetric):
"""Calculate f1 for a binary task, using 0.5 as the threshold in the case of float predictions."""

process_single_instances = False
main_score = "f1_binary"
average = "binary"
pos_classes = {"1", "1.0", "yes", "true"}
average = None
threshold = 0.5
prediction_type = "Union[float, int]"
_metric = None
metric = "f1"
single_reference_per_prediction = True

def get_str_id(self, str):
return int(str)
def prepare(self):
super().prepare()
self._metric = evaluate.load(self.metric)

def _validate_reference(self, reference):
super()._validate_reference(reference)
assert reference[0] in [
0,
1,
], f"all references of {self.main_score} must by 0 or 1"

def compute(
self,
references: List[List[str]],
predictions: List[str],
task_data: List[Dict],
) -> dict:
predictions_floats = [to_float_or_default(p) for p in predictions]
predictions = [str(int(p > self.threshold)) for p in predictions_floats]
references = [
["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
]
return super().compute(references, predictions, task_data)
flattened_int_references = [int(r[0]) for r in references]
int_predictions = [int(p > self.threshold) for p in predictions]

result = self._metric.compute(
references=flattened_int_references,
predictions=int_predictions,
labels=[0, 1],
average=self.average,
)
if isinstance(result[self.metric], numpy.ndarray):
return {
self.main_score: result[self.metric][1],
f"{self.main_score}_neg": result[self.metric][0],
}
return {self.main_score: result[self.metric]}


class RecallBinary(F1Binary):
@@ -1538,7 +1558,7 @@ class KendallTauMetric(GlobalMetric):
main_score = "kendalltau_b"
variant = "b"
process_single_instances = False
prediction_type = "str"
prediction_type = "float"

_requirements_list: List[str] = ["scipy"]

@@ -1555,8 +1575,6 @@ def compute(
) -> dict:
if isinstance(references[0], list):
references = [reference[0] for reference in references]
references = [to_float_or_default(r) for r in references]
predictions = [to_float_or_default(p) for p in predictions]

kendall_results = self.kendalltau(references, predictions, variant=self.variant)
corr = kendall_results.correlation
@@ -1602,7 +1620,7 @@ class RocAuc(GlobalMetric):
process_single_instances = False
_requirements_list: List[str] = ["sklearn"]
single_reference_per_prediction = True
prediction_type = "str"
prediction_type = "float"

def prepare(self):
from sklearn import metrics
@@ -1618,8 +1636,6 @@ def compute(
) -> dict:
if isinstance(references[0], list):
references = [reference[0] for reference in references]
references = [to_float_or_default(r) for r in references]
predictions = [to_float_or_default(p) for p in predictions]

false_positive_rates, true_positive_rates, _ = self.roc_curve(
y_true=references, y_score=predictions
@@ -3337,33 +3353,42 @@ class BinaryMaxF1(F1Binary):
"""Calculate the maximal F1 and the decision threshold that achieves it for a binary task with float predictions."""

main_score = "max_f1_binary"
prediction_type = str
single_reference_per_prediction = True

def compute(
self,
references: List[List[str]],
predictions: List[List[str]],
references: List[List[float]],
predictions: List[List[float]],
task_data: List[Dict],
) -> dict:
float_predictions = [to_float_or_default(p) for p in predictions]

best_thr = -1
best_f1 = -1
thrs = {round(fp, 3) for fp in float_predictions}
best_thr_neg = -1
best_f1_neg = -1
thrs = {round(fp, 3) for fp in predictions}
for thr in thrs:
new_predictions = [
"1" if float_prediction >= thr else "0"
for float_prediction in float_predictions
]
f1 = super().compute(references, new_predictions, task_data)[
self.main_score
1.0 if float_prediction >= thr else 0.0
for float_prediction in predictions
]
f1_results = super().compute(references, new_predictions, task_data)

f1 = f1_results[self.main_score]
if f1 > best_f1:
best_f1 = f1
best_thr = thr

return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
f1_neg = f1_results[f"{self.main_score}_neg"]
if f1_neg > best_f1_neg:
best_f1_neg = f1_neg
best_thr_neg = thr

return {
self.main_score: best_f1,
"best_thr_maxf1": best_thr,
f"{self.main_score}_neg": best_f1_neg,
"best_thr_maxf1_neg": best_thr_neg,
}


class BinaryAccuracy(InstanceMetric):
@@ -3372,20 +3397,25 @@ class BinaryAccuracy(InstanceMetric):
reduction_map = {"mean": ["accuracy_binary"]}
main_score = "accuracy_binary"
ci_scores = ["accuracy_binary"]
pos_classes = {"1", "1.0", "yes", "true"}
threshold = 0.5

prediction_type = "str"
prediction_type = "Union[float,int]"
single_reference_per_prediction = True

def _validate_reference(self, reference):
super()._validate_reference(reference)
assert reference[0] in [
0,
1,
], f"all references of {self.main_score} must by 0 or 1"

def compute(
self, references: List[Any], prediction: Any, task_data: List[Dict]
self, references: List[float], prediction: float, task_data: List[Dict]
) -> dict:
float_prediction = to_float_or_default(prediction)
prediction = str(int(float_prediction > self.threshold))
references = ["1"] if references[0].lower() in self.pos_classes else ["0"]
prediction = int(prediction > self.threshold)
reference = int(references[0])

result = {self.main_score: float([prediction] == references)}
result = {self.main_score: float(prediction == reference)}
result["score"] = result[self.main_score]
result["score_name"] = self.main_score
return result
@@ -3396,9 +3426,7 @@ class BinaryMaxAccuracy(GlobalMetric):

process_single_instances = False
main_score = "max_accuracy_binary"
pos_classes = {"1", "1.0", "yes", "true"}

prediction_type = "str"
prediction_type = "Union[float,int]"
single_reference_per_prediction = True

def compute(
@@ -3407,10 +3435,7 @@ def compute(
predictions: List[str],
task_data: List[Dict],
) -> dict:
float_predictions = [to_float_or_default(p) for p in predictions]
references = [
["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
]
references = [[int(r[0])] for r in references]

# Sticking to the test >= thr, accuracy induced by threshold thr is the number of float predictions
# that pass the test (are >= thr) and are paired with reference "1" plus the number of float predictions that
@@ -3421,8 +3446,8 @@ def compute(
# the largest float predictions, to induce the partition into all-failing , none-passing.

fp = [
(float_predictions[i], i, -1 if references[i][0] == "1" else +1)
for i in range(len(float_predictions))
(predictions[i], i, -1 if references[i][0] == 1 else +1)
for i in range(len(predictions))
]
fp.sort()
# each triplet above: float-prediction f; f's ordinal position in float_predictions, which is also
@@ -3436,7 +3461,7 @@ def compute(

current_thr = fp[0][0]
# partition float_predictions into all-passing, none-failing
current_acc = sum(r[0] == "1" for r in references)
current_acc = sum(r[0] == 1 for r in references)
# number of predictions that thr sends to the reference they are paired with

best_acc = current_acc
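The most involved change above is in BinaryMaxF1, which now tracks the best threshold for the negative class alongside the positive one. A compact standalone sketch of that sweep follows, using sklearn's f1_score instead of the evaluate wrapper and illustrative data; BinaryMaxAccuracy performs the analogous maximisation for accuracy with a single sorted pass instead of recomputing per threshold.

from sklearn.metrics import f1_score

references = [1, 0, 1, 1, 0, 0]
predictions = [0.91, 0.45, 0.52, 0.76, 0.38, 0.62]

best = {"max_f1_binary": -1.0, "best_thr_maxf1": -1.0,
        "max_f1_binary_neg": -1.0, "best_thr_maxf1_neg": -1.0}

# Traverse the (rounded) prediction values as candidate thresholds, as in the diff.
for thr in sorted({round(p, 3) for p in predictions}):
    hard = [1 if p >= thr else 0 for p in predictions]
    per_class = f1_score(references, hard, labels=[0, 1], average=None)
    if per_class[1] > best["max_f1_binary"]:
        best["max_f1_binary"], best["best_thr_maxf1"] = per_class[1], thr
    if per_class[0] > best["max_f1_binary_neg"]:
        best["max_f1_binary_neg"], best["best_thr_maxf1_neg"] = per_class[0], thr

print(best)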
12 changes: 11 additions & 1 deletion src/unitxt/task.py
@@ -3,7 +3,13 @@
from .artifact import fetch_artifact
from .logging_utils import get_logger
from .operator import StreamInstanceOperator
from .type_utils import isoftype, parse_type_string, verify_required_schema
from .type_utils import (
get_args,
get_origin,
isoftype,
parse_type_string,
verify_required_schema,
)


class Tasker:
@@ -79,6 +85,10 @@ def check_metrics_type(self) -> None:
prediction_type == metric_prediction_type
or prediction_type == Any
or metric_prediction_type == Any
or (
get_origin(metric_prediction_type) is Union
and prediction_type in get_args(metric_prediction_type)
)
):
continue

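The Union handling added to check_metrics_type can be exercised in isolation. A sketch using the standard typing helpers (the diff imports get_args/get_origin from unitxt's type_utils; the plain typing versions are used here to keep the sketch self-contained, and the function name is illustrative):

from typing import Any, Union, get_args, get_origin

def metric_type_compatible(prediction_type, metric_prediction_type) -> bool:
    """True if a task's prediction_type satisfies a metric's prediction_type."""
    if prediction_type == metric_prediction_type:
        return True
    if prediction_type == Any or metric_prediction_type == Any:
        return True
    # New case from this commit: the metric declares a Union and the task's
    # prediction type is one of its members.
    return (
        get_origin(metric_prediction_type) is Union
        and prediction_type in get_args(metric_prediction_type)
    )

print(metric_type_compatible(float, Union[float, int]))  # True
print(metric_type_compatible(str, Union[float, int]))    # False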
4 changes: 2 additions & 2 deletions tests/library/test_artifact.py
@@ -68,9 +68,9 @@ def test_artifact_loading_with_overwrite_args_with_list_of_operators(self):
self.assertEqual(artifact.steps[0].string, "no")

def test_artifact_loading_with_overwrite_args_list(self):
artifact_identifier = "tasks.classification.binary.zero_or_one[metrics=[metrics.rouge, metrics.accuracy]]"
artifact_identifier = "tasks.classification.binary.zero_or_one[metrics=[metrics.roc_auc, metrics.accuracy]]"
artifact, _ = fetch_artifact(artifact_identifier)
self.assertEqual(artifact.metrics, ["metrics.rouge", "metrics.accuracy"])
self.assertEqual(artifact.metrics, ["metrics.roc_auc", "metrics.accuracy"])

def test_artifact_loading_with_overwrite_args_dict(self):
with temp_catalog() as catalog_path: