diff --git a/prepare/metrics/kendalltau.py b/prepare/metrics/kendalltau.py
index 8f7d5e614..c0576f5d7 100644
--- a/prepare/metrics/kendalltau.py
+++ b/prepare/metrics/kendalltau.py
@@ -5,8 +5,8 @@
 metric = KendallTauMetric()
 
-predictions = ["1.0", "2.0", "1.0"]
-references = [["-1.0"], ["1.0"], ["0.0"]]
+predictions = [1.0, 2.0, 1.0]
+references = [[-1.0], [1.0], [0.0]]
 
 instance_targets = [
     {
diff --git a/prepare/metrics/roc_auc.py b/prepare/metrics/roc_auc.py
index 72a139259..7a615caeb 100644
--- a/prepare/metrics/roc_auc.py
+++ b/prepare/metrics/roc_auc.py
@@ -5,8 +5,8 @@
 metric = RocAuc()
 
-predictions = ["0.2", "0.8", "1.0"]
-references = [["1.0"], ["0.0"], ["1.0"]]
+predictions = [0.2, 0.8, 1.0]
+references = [[1.0], [0.0], [1.0]]
 
 instance_targets = [{"roc_auc": np.nan, "score": np.nan, "score_name": "roc_auc"}] * 3
 global_targets = {
diff --git a/prepare/processors/processors.py b/prepare/processors/processors.py
index 0be7ae2f0..be9ec9e07 100644
--- a/prepare/processors/processors.py
+++ b/prepare/processors/processors.py
@@ -1,3 +1,4 @@
+import numpy as np
 from unitxt import add_to_catalog
 from unitxt.logging_utils import get_logger
 from unitxt.operator import SequentialOperator
@@ -296,6 +297,22 @@
     overwrite=True,
 )
 
+add_to_catalog(
+    SequentialOperator(
+        steps=[
+            CastFields(
+                fields={"prediction": "float"},
+                failure_defaults={"prediction": np.nan},
+            ),
+            CastFields(
+                fields={"references": "float"},
+                process_every_value=True,
+            ),
+        ]
+    ),
+    "processors.cast_to_float_return_nan_if_failed",
+    overwrite=True,
+)
 
 add_to_catalog(
     SequentialOperator(
diff --git a/prepare/tasks/classification.py b/prepare/tasks/classification.py
index 6be65d89f..100694695 100644
--- a/prepare/tasks/classification.py
+++ b/prepare/tasks/classification.py
@@ -21,7 +21,7 @@
     FormTask(
         inputs={"text": "str", "text_type": "str", "class": "str"},
         outputs={"class": "str", "label": "int"},
-        prediction_type="str",
+        prediction_type="float",
         metrics=[
             "metrics.accuracy",
             "metrics.f1_binary",
diff --git a/prepare/templates/classification/grammatical_error_detection.py b/prepare/templates/classification/grammatical_error_detection.py
index 933e54ef9..fca5b935f 100644
--- a/prepare/templates/classification/grammatical_error_detection.py
+++ b/prepare/templates/classification/grammatical_error_detection.py
@@ -11,6 +11,7 @@
             "processors.take_first_word",
             "processors.lower_case",
             "processors.yes_no_to_int",
+            "processors.cast_to_float_return_nan_if_failed",
         ],
     ),
     "templates.grammatical_error_detection.yes_no",
diff --git a/src/unitxt/catalog/processors/cast_to_float_return_nan_if_failed.json b/src/unitxt/catalog/processors/cast_to_float_return_nan_if_failed.json
new file mode 100644
index 000000000..136b32102
--- /dev/null
+++ b/src/unitxt/catalog/processors/cast_to_float_return_nan_if_failed.json
@@ -0,0 +1,21 @@
+{
+    "type": "sequential_operator",
+    "steps": [
+        {
+            "type": "cast_fields",
+            "fields": {
+                "prediction": "float"
+            },
+            "failure_defaults": {
+                "prediction": NaN
+            }
+        },
+        {
+            "type": "cast_fields",
+            "fields": {
+                "references": "float"
+            },
+            "process_every_value": true
+        }
+    ]
+}
diff --git a/src/unitxt/catalog/tasks/classification/binary/zero_or_one.json b/src/unitxt/catalog/tasks/classification/binary/zero_or_one.json
index f706904c7..fbe2c8566 100644
--- a/src/unitxt/catalog/tasks/classification/binary/zero_or_one.json
+++ b/src/unitxt/catalog/tasks/classification/binary/zero_or_one.json
@@ -9,7 +9,7 @@
         "class": "str",
         "label": "int"
     },
-    "prediction_type": "str",
+    "prediction_type": "float",
     "metrics": [
         "metrics.accuracy",
         "metrics.f1_binary"
diff --git a/src/unitxt/catalog/templates/grammatical_error_detection/yes_no.json b/src/unitxt/catalog/templates/grammatical_error_detection/yes_no.json
index 8f98abbe7..518f1a1c9 100644
--- a/src/unitxt/catalog/templates/grammatical_error_detection/yes_no.json
+++ b/src/unitxt/catalog/templates/grammatical_error_detection/yes_no.json
@@ -7,6 +7,7 @@
     "postprocessors": [
         "processors.take_first_word",
         "processors.lower_case",
-        "processors.yes_no_to_int"
+        "processors.yes_no_to_int",
+        "processors.cast_to_float_return_nan_if_failed"
     ]
 }
diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
index b46907d27..f1295169c 100644
--- a/src/unitxt/metrics.py
+++ b/src/unitxt/metrics.py
@@ -29,7 +29,7 @@
 from .random_utils import get_seed
 from .settings_utils import get_settings
 from .stream import MultiStream, Stream
-from .type_utils import isoftype, parse_type_string, to_float_or_default
+from .type_utils import isoftype, parse_type_string
 
 logger = get_logger()
 settings = get_settings()
@@ -1261,17 +1261,28 @@ class F1Micro(F1):
     average = "micro"
 
 
-class F1Binary(F1):
+class F1Binary(GlobalMetric):
     """Calculate f1 for a binary task, using 0.5 as the threshold in the case of float predictions."""
 
     process_single_instances = False
     main_score = "f1_binary"
-    average = "binary"
-    pos_classes = {"1", "1.0", "yes", "true"}
+    average = None
     threshold = 0.5
+    prediction_type = "Union[float, int]"
+    _metric = None
+    metric = "f1"
+    single_reference_per_prediction = True
 
-    def get_str_id(self, str):
-        return int(str)
+    def prepare(self):
+        super().prepare()
+        self._metric = evaluate.load(self.metric)
+
+    def _validate_reference(self, reference):
+        super()._validate_reference(reference)
+        assert reference[0] in [
+            0,
+            1,
+        ], f"all references of {self.main_score} must by 0 or 1"
 
     def compute(
         self,
@@ -1279,12 +1290,21 @@ def compute(
         predictions: List[str],
         task_data: List[Dict],
     ) -> dict:
-        predictions_floats = [to_float_or_default(p) for p in predictions]
-        predictions = [str(int(p > self.threshold)) for p in predictions_floats]
-        references = [
-            ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
-        ]
-        return super().compute(references, predictions, task_data)
+        flattened_int_references = [int(r[0]) for r in references]
+        int_predictions = [int(p > self.threshold) for p in predictions]
+
+        result = self._metric.compute(
+            references=flattened_int_references,
+            predictions=int_predictions,
+            labels=[0, 1],
+            average=self.average,
+        )
+        if isinstance(result[self.metric], numpy.ndarray):
+            return {
+                self.main_score: result[self.metric][1],
+                f"{self.main_score}_neg": result[self.metric][0],
+            }
+        return {self.main_score: result[self.metric]}
 
 
 class RecallBinary(F1Binary):
@@ -1538,7 +1558,7 @@ class KendallTauMetric(GlobalMetric):
     main_score = "kendalltau_b"
     variant = "b"
     process_single_instances = False
-    prediction_type = "str"
+    prediction_type = "float"
 
     _requirements_list: List[str] = ["scipy"]
 
@@ -1555,8 +1575,6 @@ def compute(
     ) -> dict:
         if isinstance(references[0], list):
             references = [reference[0] for reference in references]
-        references = [to_float_or_default(r) for r in references]
-        predictions = [to_float_or_default(p) for p in predictions]
 
         kendall_results = self.kendalltau(references, predictions, variant=self.variant)
         corr = kendall_results.correlation
@@ -1602,7 +1620,7 @@ class RocAuc(GlobalMetric):
     process_single_instances = False
     _requirements_list: List[str] = ["sklearn"]
     single_reference_per_prediction = True
-    prediction_type = "str"
+    prediction_type = "float"
 
     def prepare(self):
         from sklearn import metrics
@@ -1618,8 +1636,6 @@ def compute(
     ) -> dict:
         if isinstance(references[0], list):
             references = [reference[0] for reference in references]
-        references = [to_float_or_default(r) for r in references]
-        predictions = [to_float_or_default(p) for p in predictions]
 
         false_positive_rates, true_positive_rates, _ = self.roc_curve(
             y_true=references, y_score=predictions
@@ -3337,33 +3353,42 @@ class BinaryMaxF1(F1Binary):
     """Calculate the maximal F1 and the decision threshold that achieves it for a binary task with float predictions."""
 
     main_score = "max_f1_binary"
-    prediction_type = str
     single_reference_per_prediction = True
 
     def compute(
         self,
-        references: List[List[str]],
-        predictions: List[List[str]],
+        references: List[List[float]],
+        predictions: List[List[float]],
         task_data: List[Dict],
     ) -> dict:
-        float_predictions = [to_float_or_default(p) for p in predictions]
-
         best_thr = -1
         best_f1 = -1
-        thrs = {round(fp, 3) for fp in float_predictions}
+        best_thr_neg = -1
+        best_f1_neg = -1
+        thrs = {round(fp, 3) for fp in predictions}
         for thr in thrs:
             new_predictions = [
-                "1" if float_prediction >= thr else "0"
-                for float_prediction in float_predictions
-            ]
-            f1 = super().compute(references, new_predictions, task_data)[
-                self.main_score
+                1.0 if float_prediction >= thr else 0.0
+                for float_prediction in predictions
             ]
+            f1_results = super().compute(references, new_predictions, task_data)
+
+            f1 = f1_results[self.main_score]
             if f1 > best_f1:
                 best_f1 = f1
                 best_thr = thr
 
-        return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
+            f1_neg = f1_results[f"{self.main_score}_neg"]
+            if f1_neg > best_f1_neg:
+                best_f1_neg = f1_neg
+                best_thr_neg = thr
+
+        return {
+            self.main_score: best_f1,
+            "best_thr_maxf1": best_thr,
+            f"{self.main_score}_neg": best_f1_neg,
+            "best_thr_maxf1_neg": best_thr_neg,
+        }
 
 
 class BinaryAccuracy(InstanceMetric):
@@ -3372,20 +3397,25 @@
     reduction_map = {"mean": ["accuracy_binary"]}
     main_score = "accuracy_binary"
     ci_scores = ["accuracy_binary"]
-    pos_classes = {"1", "1.0", "yes", "true"}
     threshold = 0.5
 
-    prediction_type = "str"
+    prediction_type = "Union[float,int]"
    single_reference_per_prediction = True
 
+    def _validate_reference(self, reference):
+        super()._validate_reference(reference)
+        assert reference[0] in [
+            0,
+            1,
+        ], f"all references of {self.main_score} must by 0 or 1"
+
     def compute(
-        self, references: List[Any], prediction: Any, task_data: List[Dict]
+        self, references: List[float], prediction: float, task_data: List[Dict]
     ) -> dict:
-        float_prediction = to_float_or_default(prediction)
-        prediction = str(int(float_prediction > self.threshold))
-        references = ["1"] if references[0].lower() in self.pos_classes else ["0"]
+        prediction = int(prediction > self.threshold)
+        reference = int(references[0])
 
-        result = {self.main_score: float([prediction] == references)}
+        result = {self.main_score: float(prediction == reference)}
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
         return result
@@ -3396,9 +3426,7 @@ class BinaryMaxAccuracy(GlobalMetric):
     process_single_instances = False
     main_score = "max_accuracy_binary"
-    pos_classes = {"1", "1.0", "yes", "true"}
-
-    prediction_type = "str"
+    prediction_type = "Union[float,int]"
     single_reference_per_prediction = True
 
     def compute(
@@ -3407,10 +3435,7 @@ def compute(
         predictions: List[str],
         task_data: List[Dict],
     ) -> dict:
-        float_predictions = [to_float_or_default(p) for p in predictions]
-        references = [
-            ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
-        ]
+        references = [[int(r[0])] for r in references]
 
         # Sticking to the test >= thr, accuracy induced by threshold thr is the number of float predictions
         # that pass the test (are >= thr) and are paired with reference "1" plus the number of float predictions that
@@ -3421,8 +3446,8 @@
         # the largest float predictions, to induce the partition into all-failing , none-passing.
 
         fp = [
-            (float_predictions[i], i, -1 if references[i][0] == "1" else +1)
-            for i in range(len(float_predictions))
+            (predictions[i], i, -1 if references[i][0] == 1 else +1)
+            for i in range(len(predictions))
         ]
         fp.sort()
         # each triplet above: float-prediction f; f's ordinal position in float_predictions, which is also
@@ -3436,7 +3461,7 @@ def compute(
         current_thr = fp[0][0]
         # partition float_predictions into all-passing, none-failing
-        current_acc = sum(r[0] == "1" for r in references)
+        current_acc = sum(r[0] == 1 for r in references)
         # number of predictions that thr sends to the reference they are paired with
         best_acc = current_acc
diff --git a/src/unitxt/task.py b/src/unitxt/task.py
index 7fc2d8e6f..79c8dcf6e 100644
--- a/src/unitxt/task.py
+++ b/src/unitxt/task.py
@@ -3,7 +3,13 @@
 from .artifact import fetch_artifact
 from .logging_utils import get_logger
 from .operator import StreamInstanceOperator
-from .type_utils import isoftype, parse_type_string, verify_required_schema
+from .type_utils import (
+    get_args,
+    get_origin,
+    isoftype,
+    parse_type_string,
+    verify_required_schema,
+)
 
 
 class Tasker:
@@ -79,6 +85,10 @@ def check_metrics_type(self) -> None:
                 prediction_type == metric_prediction_type
                 or prediction_type == Any
                 or metric_prediction_type == Any
+                or (
+                    get_origin(metric_prediction_type) is Union
+                    and prediction_type in get_args(metric_prediction_type)
+                )
             ):
                 continue
diff --git a/tests/library/test_artifact.py b/tests/library/test_artifact.py
index 6aed5f73f..eb8d2c2a6 100644
--- a/tests/library/test_artifact.py
+++ b/tests/library/test_artifact.py
@@ -68,9 +68,9 @@ def test_artifact_loading_with_overwrite_args_with_list_of_operators(self):
         self.assertEqual(artifact.steps[0].string, "no")
 
     def test_artifact_loading_with_overwrite_args_list(self):
-        artifact_identifier = "tasks.classification.binary.zero_or_one[metrics=[metrics.rouge, metrics.accuracy]]"
+        artifact_identifier = "tasks.classification.binary.zero_or_one[metrics=[metrics.roc_auc, metrics.accuracy]]"
         artifact, _ = fetch_artifact(artifact_identifier)
-        self.assertEqual(artifact.metrics, ["metrics.rouge", "metrics.accuracy"])
+        self.assertEqual(artifact.metrics, ["metrics.roc_auc", "metrics.accuracy"])
 
     def test_artifact_loading_with_overwrite_args_dict(self):
         with temp_catalog() as catalog_path:
diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py
index 0af194346..966d2418a 100644
--- a/tests/library/test_metrics.py
+++ b/tests/library/test_metrics.py
@@ -291,29 +291,37 @@ def test_f1_errors(self):
     def test_f1_binary(self):
         metric = F1Binary()
-        references = [["1"], ["0"], ["0"], ["0"], ["Yes"], ["1"]]
-        predictions = ["0.8", "1", "0.2", "0", "0.6", "1"]
+        references = [[1], [0], [0], [0], [1], [1]]
+        predictions = [0.8, 1, 0.2, 0, 0.6, 1]
 
         global_target = 0.8571428571428
+        global_target_neg = 0.8
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+        self.assertAlmostEqual(
+            global_target_neg, outputs[0]["score"]["global"]["f1_binary_neg"]
+        )
         self.assertEqual("f1_binary", outputs[0]["score"]["global"]["score_name"])
         self.assertEqual("f1_binary", outputs[0]["score"]["instance"]["score_name"])
 
     def test_precision_binary(self):
         metric = PrecisionBinary()
-        references = [["1"], ["0"], ["0"], ["0"], ["1"], ["1"]]
-        predictions = ["1", "1", "0", "0", "1", "1"]
+        references = [[1], [0], [0], [0.0], [1.0], [1]]
+        predictions = [0.9, 0.6, 0, 0.2, 1, 0.8]
 
         global_target = 0.75
+        global_target_neg = 1
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+        self.assertAlmostEqual(
+            global_target_neg, outputs[0]["score"]["global"]["precision_binary_neg"]
+        )
         self.assertEqual(
             "precision_binary", outputs[0]["score"]["global"]["score_name"]
         )
@@ -323,36 +331,55 @@ def test_precision_binary(self):
     def test_recall_binary(self):
         metric = RecallBinary()
-        references = [["1"], ["0"], ["0"], ["0"], ["1"], ["1"]]
-        predictions = ["1", "1", "0", "0", "1", "1"]
+        references = [[1], [0], [0], [0], [1], [1]]
+        predictions = [0.9, 0.6, 0, 0.2, 1, 0.8]
 
         global_target = 1
+        global_target_neg = 0.666666666
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+        self.assertAlmostEqual(
+            global_target_neg, outputs[0]["score"]["global"]["recall_binary_neg"]
+        )
         self.assertEqual("recall_binary", outputs[0]["score"]["global"]["score_name"])
         self.assertEqual("recall_binary", outputs[0]["score"]["instance"]["score_name"])
 
     def test_max_f1(self):
         metric = BinaryMaxF1()
-        references = [["1"], ["0"], ["0"]]
-        predictions = ["0.3", "0", "0.7"]
+        references = [[1], [0], [0], [0]]
+        predictions = [0.3, 0, 0.7, 0]
 
         global_target = 0.666666666666
+        global_target_neg = 0.8
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+        self.assertAlmostEqual(
+            global_target_neg, outputs[0]["score"]["global"]["max_f1_binary_neg"]
+        )
         self.assertEqual("max_f1_binary", outputs[0]["score"]["global"]["score_name"])
         self.assertEqual("max_f1_binary", outputs[0]["score"]["instance"]["score_name"])
 
+    def test_max_f1_single_class(self):
+        metric = BinaryMaxF1()
+        references = [[0], [0], [0], [0]]
+        predictions = [0.3, 0, 0.7, 0]
+
+        global_target = 0.0
+        outputs = apply_metric(
+            metric=metric, predictions=predictions, references=references
+        )
+        self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
+
     def test_accuracy_binary(self):
         metric = BinaryAccuracy()
-        references = [["1"], ["0"], ["0"], ["1"], ["0"]]
-        predictions = ["0.3", "0", "0.7", "1.0", "0.2"]
+        references = [[1], [0], [0], [1], [0]]
+        predictions = [0.3, 0, 0.7, 1.0, 0.2]
 
         expected_global_result = {
             "accuracy_binary": 3 / 5,
@@ -372,8 +399,8 @@ def test_binary_max_accuracy(self):
         metric = BinaryMaxAccuracy()
-        references = [["1"], ["0"], ["0"], ["1"], ["0"]]
-        predictions = ["0.3", "0", "0.7", "1.0", "0.2"]
+        references = [[1], [0], [0], [1], [0]]
+        predictions = [0.3, 0, 0.7, 1.0, 0.2]
 
         global_target = 0.8
         outputs = apply_metric(
@@ -388,36 +415,36 @@ def test_binary_max_accuracy(self):
             "max_accuracy_binary", outputs[0]["score"]["instance"]["score_name"]
         )
 
-        references = [["0"], ["0"], ["0"]]
-        predictions = ["0.3", "0.9", "0.7"]
+        references = [[0], [0], [0]]
+        predictions = [0.3, 0.9, 0.7]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(1.0, outputs[0]["score"]["global"]["score"])
 
-        references = [["1"], ["0"], ["0"], ["1"], ["0"], ["0"]]
-        predictions = ["0.7", "0.3", "0.7", "0.8", "0.9", "0.3"]
+        references = [[1], [0], [0], [1], [0], [0]]
+        predictions = [0.7, 0.3, 0.7, 0.8, 0.9, 0.3]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(2 / 3, outputs[0]["score"]["global"]["score"])
 
-        references = [["1"]]
-        predictions = ["0.7"]
+        references = [[1]]
+        predictions = [0.7]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(1.0, outputs[0]["score"]["global"]["score"])
 
-        references = [["0"]]
-        predictions = ["0.7"]
+        references = [[0]]
+        predictions = [0.7]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
         self.assertAlmostEqual(1.0, outputs[0]["score"]["global"]["score"])
 
-        references = [["0"]]
-        predictions = ["1.7"]
+        references = [[0]]
+        predictions = [1.7]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
@@ -723,8 +750,8 @@ def test_token_overlap(self):
     def test_roc_auc(self):
         metric = RocAuc()
-        predictions = ["0.2", "0.8", "1.0"]
-        references = [["1.0"], ["0.0"], ["1.0"]]
+        predictions = [0.2, 0.8, 1.0]
+        references = [[1.0], [0.0], [1.0]]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
@@ -733,8 +760,8 @@ def test_kendalltau(self):
         metric = KendallTauMetric()
-        predictions = ["1.0", "2.0", "1.0"]
-        references = [["-1.0"], ["1.0"], ["0.0"]]
+        predictions = [1.0, 2.0, 1.0]
+        references = [[-1.0], [1.0], [0.0]]
         outputs = apply_metric(
             metric=metric, predictions=predictions, references=references
         )
diff --git a/tests/library/test_tasks.py b/tests/library/test_tasks.py
index 879afe645..4c0ff9953 100644
--- a/tests/library/test_tasks.py
+++ b/tests/library/test_tasks.py
@@ -58,7 +58,7 @@ def test_task_metrics_type_checking(self):
             inputs={"input": "str"},
             outputs={"label": "str"},
             prediction_type="str",
-            metrics=["metrics.wer", "metrics.rouge", "metrics.roc_auc"],
+            metrics=["metrics.wer", "metrics.rouge"],
         )
 
         operator.check_metrics_type()
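
The comment block inside BinaryMaxAccuracy.compute above describes the threshold sweep in prose. The following standalone sketch is not part of the patch and does not reuse unitxt's internals; it only illustrates the same idea under that description: sort the float predictions once, start with a threshold at or below every prediction (so everything is classified as 1), then move the threshold past one distinct prediction value at a time while updating a running accuracy. The helper name max_accuracy_threshold is made up for illustration.

from typing import List, Tuple


def max_accuracy_threshold(
    predictions: List[float], references: List[int]
) -> Tuple[float, float]:
    """Return (best_accuracy, threshold) for the rule: predict 1 iff p >= thr.

    Assumes at least one (prediction, reference) pair and 0/1 references.
    """
    # One entry per instance: (float prediction, accuracy delta once this
    # prediction stops passing the ">= thr" test). A prediction paired with
    # reference 1 loses a correct answer when it stops passing (-1); one
    # paired with reference 0 gains a correct answer (+1).
    deltas = sorted(
        (p, -1 if r == 1 else +1) for p, r in zip(predictions, references)
    )

    # Threshold at the smallest prediction: everything passes, so every
    # instance is predicted 1 and the correct ones are exactly the 1-references.
    best_thr = deltas[0][0]
    current_acc = sum(r == 1 for r in references)
    best_acc = current_acc

    i = 0
    while i < len(deltas):
        thr = deltas[i][0]
        # Move the threshold just above `thr`; all predictions equal to `thr`
        # flip from "pass" to "fail" together, so apply their deltas at once.
        while i < len(deltas) and deltas[i][0] == thr:
            current_acc += deltas[i][1]
            i += 1
        if current_acc > best_acc:
            best_acc = current_acc
            # Any threshold strictly above `thr` and at most the next distinct
            # prediction realizes this accuracy; past the largest prediction,
            # any larger value (here thr + 1.0) empties the passing set.
            best_thr = deltas[i][0] if i < len(deltas) else thr + 1.0

    return best_acc / len(references), best_thr


if __name__ == "__main__":
    # Mirrors one of the BinaryMaxAccuracy test cases above.
    acc, thr = max_accuracy_threshold([0.3, 0, 0.7, 1.0, 0.2], [1, 0, 0, 1, 0])
    print(acc, thr)  # 0.8 0.3

After the O(n log n) sort, the sweep visits each distinct prediction value once and updates the count incrementally, rather than recomputing accuracy from scratch for every candidate threshold, which is the point the in-code comments in the patch are making.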