diff --git a/prepare/metrics/kendalltau.py b/prepare/metrics/kendalltau.py
index 8f7d5e614..c0576f5d7 100644
--- a/prepare/metrics/kendalltau.py
+++ b/prepare/metrics/kendalltau.py
@@ -5,8 +5,8 @@
 metric = KendallTauMetric()
 
-predictions = ["1.0", "2.0", "1.0"]
-references = [["-1.0"], ["1.0"], ["0.0"]]
+predictions = [1.0, 2.0, 1.0]
+references = [[-1.0], [1.0], [0.0]]
 
 instance_targets = [
     {
diff --git a/prepare/metrics/roc_auc.py b/prepare/metrics/roc_auc.py
index 72a139259..7a615caeb 100644
--- a/prepare/metrics/roc_auc.py
+++ b/prepare/metrics/roc_auc.py
@@ -5,8 +5,8 @@
 metric = RocAuc()
 
-predictions = ["0.2", "0.8", "1.0"]
-references = [["1.0"], ["0.0"], ["1.0"]]
+predictions = [0.2, 0.8, 1.0]
+references = [[1.0], [0.0], [1.0]]
 
 instance_targets = [{"roc_auc": np.nan, "score": np.nan, "score_name": "roc_auc"}] * 3
 global_targets = {
diff --git a/prepare/processors/processors.py b/prepare/processors/processors.py
index 0be7ae2f0..be9ec9e07 100644
--- a/prepare/processors/processors.py
+++ b/prepare/processors/processors.py
@@ -1,3 +1,4 @@
+import numpy as np
 from unitxt import add_to_catalog
 from unitxt.logging_utils import get_logger
 from unitxt.operator import SequentialOperator
@@ -296,6 +297,22 @@
     overwrite=True,
 )
 
+add_to_catalog(
+    SequentialOperator(
+        steps=[
+            CastFields(
+                fields={"prediction": "float"},
+                failure_defaults={"prediction": np.nan},
+            ),
+            CastFields(
+                fields={"references": "float"},
+                process_every_value=True,
+            ),
+        ]
+    ),
+    "processors.cast_to_float_return_nan_if_failed",
+    overwrite=True,
+)
 
 add_to_catalog(
     SequentialOperator(
diff --git a/prepare/tasks/classification.py b/prepare/tasks/classification.py
index 6be65d89f..100694695 100644
--- a/prepare/tasks/classification.py
+++ b/prepare/tasks/classification.py
@@ -21,7 +21,7 @@
     FormTask(
         inputs={"text": "str", "text_type": "str", "class": "str"},
         outputs={"class": "str", "label": "int"},
-        prediction_type="str",
+        prediction_type="float",
         metrics=[
             "metrics.accuracy",
             "metrics.f1_binary",
diff --git a/prepare/templates/classification/grammatical_error_detection.py b/prepare/templates/classification/grammatical_error_detection.py
index 933e54ef9..fca5b935f 100644
--- a/prepare/templates/classification/grammatical_error_detection.py
+++ b/prepare/templates/classification/grammatical_error_detection.py
@@ -11,6 +11,7 @@
             "processors.take_first_word",
             "processors.lower_case",
             "processors.yes_no_to_int",
+            "processors.cast_to_float_return_nan_if_failed",
         ],
     ),
     "templates.grammatical_error_detection.yes_no",
diff --git a/src/unitxt/catalog/processors/cast_to_float_return_nan_if_failed.json b/src/unitxt/catalog/processors/cast_to_float_return_nan_if_failed.json
new file mode 100644
index 000000000..136b32102
--- /dev/null
+++ b/src/unitxt/catalog/processors/cast_to_float_return_nan_if_failed.json
@@ -0,0 +1,21 @@
+{
+    "type": "sequential_operator",
+    "steps": [
+        {
+            "type": "cast_fields",
+            "fields": {
+                "prediction": "float"
+            },
+            "failure_defaults": {
+                "prediction": NaN
+            }
+        },
+        {
+            "type": "cast_fields",
+            "fields": {
+                "references": "float"
+            },
+            "process_every_value": true
+        }
+    ]
+}
diff --git a/src/unitxt/catalog/tasks/classification/binary/zero_or_one.json b/src/unitxt/catalog/tasks/classification/binary/zero_or_one.json
index f706904c7..fbe2c8566 100644
--- a/src/unitxt/catalog/tasks/classification/binary/zero_or_one.json
+++ b/src/unitxt/catalog/tasks/classification/binary/zero_or_one.json
@@ -9,7 +9,7 @@
         "class": "str",
         "label": "int"
     },
-    "prediction_type": "str",
+    "prediction_type": "float",
     "metrics": [
         "metrics.accuracy",
         "metrics.f1_binary"
diff --git a/src/unitxt/catalog/templates/grammatical_error_detection/yes_no.json b/src/unitxt/catalog/templates/grammatical_error_detection/yes_no.json
index 8f98abbe7..518f1a1c9 100644
--- a/src/unitxt/catalog/templates/grammatical_error_detection/yes_no.json
+++ b/src/unitxt/catalog/templates/grammatical_error_detection/yes_no.json
@@ -7,6 +7,7 @@
     "postprocessors": [
         "processors.take_first_word",
         "processors.lower_case",
-        "processors.yes_no_to_int"
+        "processors.yes_no_to_int",
+        "processors.cast_to_float_return_nan_if_failed"
     ]
 }
diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
index b46907d27..f1295169c 100644
--- a/src/unitxt/metrics.py
+++ b/src/unitxt/metrics.py
@@ -29,7 +29,7 @@
 from .random_utils import get_seed
 from .settings_utils import get_settings
 from .stream import MultiStream, Stream
-from .type_utils import isoftype, parse_type_string, to_float_or_default
+from .type_utils import isoftype, parse_type_string
 
 logger = get_logger()
 settings = get_settings()
@@ -1261,17 +1261,28 @@ class F1Micro(F1):
     average = "micro"
 
 
-class F1Binary(F1):
+class F1Binary(GlobalMetric):
     """Calculate f1 for a binary task, using 0.5 as the threshold in the case of float predictions."""
 
     process_single_instances = False
     main_score = "f1_binary"
-    average = "binary"
-    pos_classes = {"1", "1.0", "yes", "true"}
+    average = None
     threshold = 0.5
+    prediction_type = "Union[float, int]"
+    _metric = None
+    metric = "f1"
+    single_reference_per_prediction = True
 
-    def get_str_id(self, str):
-        return int(str)
+    def prepare(self):
+        super().prepare()
+        self._metric = evaluate.load(self.metric)
+
+    def _validate_reference(self, reference):
+        super()._validate_reference(reference)
+        assert reference[0] in [
+            0,
+            1,
+        ], f"all references of {self.main_score} must by 0 or 1"
 
     def compute(
         self,
@@ -1279,12 +1290,21 @@ def compute(
         predictions: List[str],
         task_data: List[Dict],
     ) -> dict:
-        predictions_floats = [to_float_or_default(p) for p in predictions]
-        predictions = [str(int(p > self.threshold)) for p in predictions_floats]
-        references = [
-            ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
-        ]
-        return super().compute(references, predictions, task_data)
+        flattened_int_references = [int(r[0]) for r in references]
+        int_predictions = [int(p > self.threshold) for p in predictions]
+
+        result = self._metric.compute(
+            references=flattened_int_references,
+            predictions=int_predictions,
+            labels=[0, 1],
+            average=self.average,
+        )
+        if isinstance(result[self.metric], numpy.ndarray):
+            return {
+                self.main_score: result[self.metric][1],
+                f"{self.main_score}_neg": result[self.metric][0],
+            }
+        return {self.main_score: result[self.metric]}
 
 
 class RecallBinary(F1Binary):
@@ -1538,7 +1558,7 @@ class KendallTauMetric(GlobalMetric):
     main_score = "kendalltau_b"
     variant = "b"
     process_single_instances = False
-    prediction_type = "str"
+    prediction_type = "float"
 
     _requirements_list: List[str] = ["scipy"]
 
@@ -1555,8 +1575,6 @@ def compute(
     ) -> dict:
         if isinstance(references[0], list):
             references = [reference[0] for reference in references]
-        references = [to_float_or_default(r) for r in references]
-        predictions = [to_float_or_default(p) for p in predictions]
 
         kendall_results = self.kendalltau(references, predictions, variant=self.variant)
         corr = kendall_results.correlation
@@ -1602,7 +1620,7 @@ class RocAuc(GlobalMetric):
     process_single_instances = False
     _requirements_list: List[str] = ["sklearn"]
     single_reference_per_prediction = True
-    prediction_type = "str"
+    prediction_type = "float"
 
     def prepare(self):
         from sklearn import metrics
@@ -1618,8 +1636,6 @@ def compute(
     ) -> dict:
         if isinstance(references[0], list):
             references = [reference[0] for reference in references]
-        references = [to_float_or_default(r) for r in references]
-        predictions = [to_float_or_default(p) for p in predictions]
 
         false_positive_rates, true_positive_rates, _ = self.roc_curve(
             y_true=references, y_score=predictions
@@ -3337,33 +3353,42 @@ class BinaryMaxF1(F1Binary):
     """Calculate the maximal F1 and the decision threshold that achieves it for a binary task with float predictions."""
 
     main_score = "max_f1_binary"
-    prediction_type = str
     single_reference_per_prediction = True
 
     def compute(
         self,
-        references: List[List[str]],
-        predictions: List[List[str]],
+        references: List[List[float]],
+        predictions: List[List[float]],
         task_data: List[Dict],
     ) -> dict:
-        float_predictions = [to_float_or_default(p) for p in predictions]
-
         best_thr = -1
         best_f1 = -1
-        thrs = {round(fp, 3) for fp in float_predictions}
+        best_thr_neg = -1
+        best_f1_neg = -1
+        thrs = {round(fp, 3) for fp in predictions}
         for thr in thrs:
             new_predictions = [
-                "1" if float_prediction >= thr else "0"
-                for float_prediction in float_predictions
-            ]
-            f1 = super().compute(references, new_predictions, task_data)[
-                self.main_score
+                1.0 if float_prediction >= thr else 0.0
+                for float_prediction in predictions
             ]
+            f1_results = super().compute(references, new_predictions, task_data)
+
+            f1 = f1_results[self.main_score]
             if f1 > best_f1:
                 best_f1 = f1
                 best_thr = thr
 
-        return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
+            f1_neg = f1_results[f"{self.main_score}_neg"]
+            if f1_neg > best_f1_neg:
+                best_f1_neg = f1_neg
+                best_thr_neg = thr
+
+        return {
+            self.main_score: best_f1,
+            "best_thr_maxf1": best_thr,
+            f"{self.main_score}_neg": best_f1_neg,
+            "best_thr_maxf1_neg": best_thr_neg,
+        }
 
 
 class BinaryAccuracy(InstanceMetric):
@@ -3372,20 +3397,25 @@
     reduction_map = {"mean": ["accuracy_binary"]}
     main_score = "accuracy_binary"
     ci_scores = ["accuracy_binary"]
-    pos_classes = {"1", "1.0", "yes", "true"}
     threshold = 0.5
 
-    prediction_type = "str"
+    prediction_type = "Union[float,int]"
    single_reference_per_prediction = True
 
+    def _validate_reference(self, reference):
+        super()._validate_reference(reference)
+        assert reference[0] in [
+            0,
+            1,
+        ], f"all references of {self.main_score} must by 0 or 1"
+
     def compute(
-        self, references: List[Any], prediction: Any, task_data: List[Dict]
+        self, references: List[float], prediction: float, task_data: List[Dict]
     ) -> dict:
-        float_prediction = to_float_or_default(prediction)
-        prediction = str(int(float_prediction > self.threshold))
-        references = ["1"] if references[0].lower() in self.pos_classes else ["0"]
+        prediction = int(prediction > self.threshold)
+        reference = int(references[0])
 
-        result = {self.main_score: float([prediction] == references)}
+        result = {self.main_score: float(prediction == reference)}
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
         return result
@@ -3396,9 +3426,7 @@ class BinaryMaxAccuracy(GlobalMetric):
     process_single_instances = False
     main_score = "max_accuracy_binary"
-    pos_classes = {"1", "1.0", "yes", "true"}
-
-    prediction_type = "str"
+    prediction_type = "Union[float,int]"
     single_reference_per_prediction = True
 
     def compute(
@@ -3407,10 +3435,7 @@ def compute(
         predictions: List[str],
         task_data: List[Dict],
     ) -> dict:
-        float_predictions = [to_float_or_default(p) for p in predictions]
-        references = [
-            ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
-        ]
+        references = [[int(r[0])] for r in references]
 
         # Sticking to the test >= thr, accuracy induced by threshold thr is the number of float predictions
         # that pass the test (are >= thr) and are paired with reference "1" plus the number of float predictions that
@@ -3421,8 +3446,8 @@
         # the largest float predictions, to induce the partition into all-failing , none-passing.
 
         fp = [
-            (float_predictions[i], i, -1 if references[i][0] == "1" else +1)
-            for i in range(len(float_predictions))
+            (predictions[i], i, -1 if references[i][0] == 1 else +1)
+            for i in range(len(predictions))
         ]
         fp.sort()
         # each triplet above: float-prediction f; f's ordinal position in float_predictions, which is also
@@ -3436,7 +3461,7 @@ def compute(
         current_thr = fp[0][0]
         # partition float_predictions into all-passing, none-failing
-        current_acc = sum(r[0] == "1" for r in references)
+        current_acc = sum(r[0] == 1 for r in references)
         # number of predictions that thr sends to the reference they are paired with
         best_acc = current_acc
diff --git a/src/unitxt/task.py b/src/unitxt/task.py
index 7fc2d8e6f..79c8dcf6e 100644
--- a/src/unitxt/task.py
+++ b/src/unitxt/task.py
@@ -3,7 +3,13 @@
 from .artifact import fetch_artifact
 from .logging_utils import get_logger
 from .operator import StreamInstanceOperator
-from .type_utils import isoftype, parse_type_string, verify_required_schema
+from .type_utils import (
+    get_args,
+    get_origin,
+    isoftype,
+    parse_type_string,
+    verify_required_schema,
+)
 
 
 class Tasker:
@@ -79,6 +85,10 @@ def check_metrics_type(self) -> None:
                 prediction_type == metric_prediction_type
                 or prediction_type == Any
                 or metric_prediction_type == Any
+                or (
+                    get_origin(metric_prediction_type) is Union
+                    and prediction_type in get_args(metric_prediction_type)
+                )
             ):
                 continue
diff --git a/tests/library/test_artifact.py b/tests/library/test_artifact.py
index 6aed5f73f..eb8d2c2a6 100644
--- a/tests/library/test_artifact.py
+++ b/tests/library/test_artifact.py
@@ -68,9 +68,9 @@ def test_artifact_loading_with_overwrite_args_with_list_of_operators(self):
         self.assertEqual(artifact.steps[0].string, "no")
 
     def test_artifact_loading_with_overwrite_args_list(self):
-        artifact_identifier = "tasks.classification.binary.zero_or_one[metrics=[metrics.rouge, metrics.accuracy]]"
+        artifact_identifier = "tasks.classification.binary.zero_or_one[metrics=[metrics.roc_auc, metrics.accuracy]]"
         artifact, _ = fetch_artifact(artifact_identifier)
-        self.assertEqual(artifact.metrics, ["metrics.rouge", "metrics.accuracy"])
+        self.assertEqual(artifact.metrics, ["metrics.roc_auc", "metrics.accuracy"])
 
     def test_artifact_loading_with_overwrite_args_dict(self):
         with temp_catalog() as catalog_path:
diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py
index 0af194346..966d2418a 100644
--- a/tests/library/test_metrics.py
+++ b/tests/library/test_metrics.py
@@ -291,29 +291,37 @@ def test_f1_errors(self):
     def test_f1_binary(self):
         metric = F1Binary()
-        references = [["1"], ["0"], ["0"], ["0"], ["Yes"], ["1"]]
-        predictions = ["0.8", "1", "0.2", "0", "0.6", "1"]
+        references = [[1], [0], [0], [0], [1], [1]]
+        predictions = [0.8, 1, 0.2, 0, 0.6, 1]
 
         global_target = 0.8571428571428
+        global_target_neg = 0.8
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+        self.assertAlmostEqual(
+            global_target_neg, outputs[0]["score"]["global"]["f1_binary_neg"]
+        )
         self.assertEqual("f1_binary", outputs[0]["score"]["global"]["score_name"])
         self.assertEqual("f1_binary", outputs[0]["score"]["instance"]["score_name"])
 
     def test_precision_binary(self):
         metric = PrecisionBinary()
-        references = [["1"], ["0"], ["0"], ["0"], ["1"], ["1"]]
-        predictions = ["1", "1", "0", "0", "1", "1"]
+        references = [[1], [0], [0], [0.0], [1.0], [1]]
+        predictions = [0.9, 0.6, 0, 0.2, 1, 0.8]
 
         global_target = 0.75
+        global_target_neg = 1
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+        self.assertAlmostEqual(
+            global_target_neg, outputs[0]["score"]["global"]["precision_binary_neg"]
+        )
         self.assertEqual(
             "precision_binary", outputs[0]["score"]["global"]["score_name"]
         )
@@ -323,36 +331,55 @@ def test_precision_binary(self):
     def test_recall_binary(self):
         metric = RecallBinary()
-        references = [["1"], ["0"], ["0"], ["0"], ["1"], ["1"]]
-        predictions = ["1", "1", "0", "0", "1", "1"]
+        references = [[1], [0], [0], [0], [1], [1]]
+        predictions = [0.9, 0.6, 0, 0.2, 1, 0.8]
 
         global_target = 1
+        global_target_neg = 0.666666666
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+        self.assertAlmostEqual(
+            global_target_neg, outputs[0]["score"]["global"]["recall_binary_neg"]
+        )
         self.assertEqual("recall_binary", outputs[0]["score"]["global"]["score_name"])
         self.assertEqual("recall_binary", outputs[0]["score"]["instance"]["score_name"])
 
     def test_max_f1(self):
         metric = BinaryMaxF1()
-        references = [["1"], ["0"], ["0"]]
-        predictions = ["0.3", "0", "0.7"]
+        references = [[1], [0], [0], [0]]
+        predictions = [0.3, 0, 0.7, 0]
 
         global_target = 0.666666666666
+        global_target_neg = 0.8
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+        self.assertAlmostEqual(
+            global_target_neg, outputs[0]["score"]["global"]["max_f1_binary_neg"]
+        )
         self.assertEqual("max_f1_binary", outputs[0]["score"]["global"]["score_name"])
         self.assertEqual("max_f1_binary", outputs[0]["score"]["instance"]["score_name"])
 
+    def test_max_f1_single_class(self):
+        metric = BinaryMaxF1()
+        references = [[0], [0], [0], [0]]
+        predictions = [0.3, 0, 0.7, 0]
+
+        global_target = 0.0
+        outputs = apply_metric(
+            metric=metric, predictions=predictions, references=references
+        )
+        self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+
     def test_accuracy_binary(self):
         metric = BinaryAccuracy()
-        references = [["1"], ["0"], ["0"], ["1"], ["0"]]
-        predictions = ["0.3", "0", "0.7", "1.0", "0.2"]
+        references = [[1], [0], [0], [1], [0]]
+        predictions = [0.3, 0, 0.7, 1.0, 0.2]
 
         expected_global_result = {
             "accuracy_binary": 3 / 5,
@@ -372,8 +399,8 @@ def test_binary_max_accuracy(self):
         metric = BinaryMaxAccuracy()
-        references = [["1"], ["0"], ["0"], ["1"], ["0"]]
-        predictions = ["0.3", "0", "0.7", "1.0", "0.2"]
+        references = [[1], [0], [0], [1], [0]]
+        predictions = [0.3, 0, 0.7, 1.0, 0.2]
 
         global_target = 0.8
         outputs = apply_metric(
@@ -388,36 +415,36 @@ def test_binary_max_accuracy(self):
             "max_accuracy_binary", outputs[0]["score"]["instance"]["score_name"]
         )
 
-        references = [["0"], ["0"], ["0"]]
-        predictions = ["0.3", "0.9", "0.7"]
+        references = [[0], [0], [0]]
+        predictions = [0.3, 0.9, 0.7]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(1.0, outputs[0]["score"]["global"]["score"])
 
-        references = [["1"], ["0"], ["0"], ["1"], ["0"], ["0"]]
-        predictions = ["0.7", "0.3", "0.7", "0.8", "0.9", "0.3"]
+        references = [[1], [0], [0], [1], [0], [0]]
+        predictions = [0.7, 0.3, 0.7, 0.8, 0.9, 0.3]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(2 / 3, outputs[0]["score"]["global"]["score"])
 
-        references = [["1"]]
-        predictions = ["0.7"]
+        references = [[1]]
+        predictions = [0.7]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(1.0, outputs[0]["score"]["global"]["score"])
 
-        references = [["0"]]
-        predictions = ["0.7"]
+        references = [[0]]
+        predictions = [0.7]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(1.0, outputs[0]["score"]["global"]["score"])
 
-        references = [["0"]]
-        predictions = ["1.7"]
+        references = [[0]]
+        predictions = [1.7]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
@@ -723,8 +750,8 @@ def test_token_overlap(self):
     def test_roc_auc(self):
         metric = RocAuc()
-        predictions = ["0.2", "0.8", "1.0"]
-        references = [["1.0"], ["0.0"], ["1.0"]]
+        predictions = [0.2, 0.8, 1.0]
+        references = [[1.0], [0.0], [1.0]]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
@@ -733,8 +760,8 @@ def test_kendalltau(self):
         metric = KendallTauMetric()
-        predictions = ["1.0", "2.0", "1.0"]
-        references = [["-1.0"], ["1.0"], ["0.0"]]
+        predictions = [1.0, 2.0, 1.0]
+        references = [[-1.0], [1.0], [0.0]]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
diff --git a/tests/library/test_tasks.py b/tests/library/test_tasks.py
index 879afe645..4c0ff9953 100644
--- a/tests/library/test_tasks.py
+++ b/tests/library/test_tasks.py
@@ -58,7 +58,7 @@ def test_task_metrics_type_checking(self):
             inputs={"input": "str"},
             outputs={"label": "str"},
             prediction_type="str",
-            metrics=["metrics.wer", "metrics.rouge", "metrics.roc_auc"],
+            metrics=["metrics.wer", "metrics.rouge"],
         )
 
         operator.check_metrics_type()
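
The comment block inside BinaryMaxAccuracy.compute above describes the threshold sweep in prose. The following standalone sketch is not part of the patch and does not reuse unitxt's internals; it only illustrates the same idea under that description: sort the float predictions once, start with a threshold at or below every prediction (so everything is classified as 1), then move the threshold past one distinct prediction value at a time while updating a running accuracy. The helper name max_accuracy_threshold is made up for illustration.

from typing import List, Tuple


def max_accuracy_threshold(
    predictions: List[float], references: List[int]
) -> Tuple[float, float]:
    """Return (best_accuracy, threshold) for the rule: predict 1 iff p >= thr.

    Assumes at least one (prediction, reference) pair and 0/1 references.
    """
    # One entry per instance: (float prediction, accuracy delta once this
    # prediction stops passing the ">= thr" test). A prediction paired with
    # reference 1 loses a correct answer when it stops passing (-1); one
    # paired with reference 0 gains a correct answer (+1).
    deltas = sorted(
        (p, -1 if r == 1 else +1) for p, r in zip(predictions, references)
    )

    # Threshold at the smallest prediction: everything passes, so every
    # instance is predicted 1 and the correct ones are exactly the 1-references.
    best_thr = deltas[0][0]
    current_acc = sum(r == 1 for r in references)
    best_acc = current_acc

    i = 0
    while i < len(deltas):
        thr = deltas[i][0]
        # Move the threshold just above `thr`; all predictions equal to `thr`
        # flip from "pass" to "fail" together, so apply their deltas at once.
        while i < len(deltas) and deltas[i][0] == thr:
            current_acc += deltas[i][1]
            i += 1
        if current_acc > best_acc:
            best_acc = current_acc
            # Any threshold strictly above `thr` and at most the next distinct
            # prediction realizes this accuracy; past the largest prediction,
            # any larger value (here thr + 1.0) empties the passing set.
            best_thr = deltas[i][0] if i < len(deltas) else thr + 1.0

    return best_acc / len(references), best_thr


if __name__ == "__main__":
    # Mirrors one of the BinaryMaxAccuracy test cases above.
    acc, thr = max_accuracy_threshold([0.3, 0, 0.7, 1.0, 0.2], [1, 0, 0, 1, 0])
    print(acc, thr)  # 0.8 0.3

After the O(n log n) sort, the sweep visits each distinct prediction value once and updates the count incrementally, rather than recomputing accuracy from scratch for every candidate threshold, which is the point the in-code comments in the patch are making.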