add neg class score for binary precision, recall, f1 and max f1 (#788)
* add perplexity with Mistral_7B_Instruct_v2

Signed-off-by: lilacheden <[email protected]>

* add perplexity test

Signed-off-by: lilacheden <[email protected]>

* avoid test memory issues

Signed-off-by: lilacheden <[email protected]>

* round traversed thresholds for max f1 to 3

Signed-off-by: lilacheden <[email protected]>

* processor for extracting model prediction enclosed in double brackets

Signed-off-by: Ariel Gera <[email protected]>

* fix syntax error

Signed-off-by: lilacheden <[email protected]>

* fix code formatting

Signed-off-by: lilacheden <[email protected]>

* add modified metrics to catalog

Signed-off-by: lilacheden <[email protected]>

* change template name

Signed-off-by: lilacheden <[email protected]>

* remove perplexity with mistral

Signed-off-by: lilacheden <[email protected]>

* add neg class score for binary precision, recall, f1 and max f1

Signed-off-by: lilacheden <[email protected]>

* allow spearman metric to receive str inputs

Signed-off-by: lilacheden <[email protected]>

* support running AbstractLM on local Apple GPU

Signed-off-by: lilacheden <[email protected]>

* Support Unions in metric prediction_type

Signed-off-by: lilacheden <[email protected]>

* add processor cast_to_float_return_nan_if_failed

Signed-off-by: lilacheden <[email protected]>

* Make prediction_type of metrics numeric (float or int)

Modified Metrics: "metrics.kendalltau_b", "metrics.roc_auc", "metrics.f1_binary", "metrics.accuracy_binary",
"metrics.precision_binary", "metrics.recall_binary", "metrics.max_f1_binary", "metrics.max_accuracy_binary"

Signed-off-by: lilacheden <[email protected]>

* Adjust coedit task and template to metrics change

Signed-off-by: lilacheden <[email protected]>

* Adjust test_task_metrics_type_checking to metrics change

Signed-off-by: lilacheden <[email protected]>

* Adjust test_artifact_loading_with_overwrite_args_list to metrics change

Signed-off-by: lilacheden <[email protected]>

* revert support running AbstractLM on local Apple GPU

Signed-off-by: lilacheden <[email protected]>

---------

Signed-off-by: lilacheden <[email protected]>
Signed-off-by: Ariel Gera <[email protected]>
Co-authored-by: Ariel Gera <[email protected]>
Co-authored-by: Elron Bandel <[email protected]>
3 people authored May 5, 2024
1 parent 6cade91 commit 4775531
Showing 13 changed files with 186 additions and 84 deletions.
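For orientation, here is a minimal standalone sketch (not part of the diff) of what the new negative-class scores correspond to. It uses scikit-learn's per-class f1_score directly, whereas the unitxt metrics below go through the Hugging Face evaluate wrapper; all sample values are illustrative.

# Illustrative only: the "*_neg" scores added for f1/precision/recall are the
# per-class scores for label 0 (the negative class).
from sklearn.metrics import f1_score

references = [1, 0, 1, 1, 0]             # binary gold labels (0 = negative class)
predictions = [0.9, 0.4, 0.2, 0.8, 0.6]  # float scores; 0.5 is the default threshold
hard_preds = [int(p > 0.5) for p in predictions]

# average=None returns one score per class: index 0 -> negative, index 1 -> positive
per_class = f1_score(references, hard_preds, labels=[0, 1], average=None)
print({"f1_binary": per_class[1], "f1_binary_neg": per_class[0]})

precision_score and recall_score expose the same per-class behaviour via average=None, which is presumably how the corresponding precision/recall "_neg" scores are obtained.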
4 changes: 2 additions & 2 deletions prepare/metrics/kendalltau.py
@@ -5,8 +5,8 @@

metric = KendallTauMetric()

predictions = ["1.0", "2.0", "1.0"]
references = [["-1.0"], ["1.0"], ["0.0"]]
predictions = [1.0, 2.0, 1.0]
references = [[-1.0], [1.0], [0.0]]

instance_targets = [
{
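Since the metric now takes floats directly, the same computation can be checked standalone with scipy (the catalog metric wraps this scipy call; the values mirror the sample above):

# Kendall's tau-b on the numeric sample data from kendalltau.py.
from scipy.stats import kendalltau

predictions = [1.0, 2.0, 1.0]
references = [-1.0, 1.0, 0.0]  # single references, flattened

result = kendalltau(references, predictions, variant="b")
print(result.correlation, result.pvalue)  # .correlation matches the attribute used in the diff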
4 changes: 2 additions & 2 deletions prepare/metrics/roc_auc.py
@@ -5,8 +5,8 @@

metric = RocAuc()

predictions = ["0.2", "0.8", "1.0"]
references = [["1.0"], ["0.0"], ["1.0"]]
predictions = [0.2, 0.8, 1.0]
references = [[1.0], [0.0], [1.0]]

instance_targets = [{"roc_auc": np.nan, "score": np.nan, "score_name": "roc_auc"}] * 3
global_targets = {
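The same pattern applies here; a standalone sketch with scikit-learn (not necessarily the exact unitxt code path, but the same underlying computation on the now-numeric inputs):

# ROC AUC on the numeric sample data from roc_auc.py.
from sklearn.metrics import auc, roc_curve

predictions = [0.2, 0.8, 1.0]
references = [1.0, 0.0, 1.0]  # single references, flattened

fpr, tpr, _ = roc_curve(y_true=references, y_score=predictions)
print(auc(fpr, tpr))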
17 changes: 17 additions & 0 deletions prepare/processors/processors.py
@@ -1,3 +1,4 @@
import numpy as np
from unitxt import add_to_catalog
from unitxt.logging_utils import get_logger
from unitxt.operator import SequentialOperator
@@ -296,6 +297,22 @@
overwrite=True,
)

add_to_catalog(
SequentialOperator(
steps=[
CastFields(
fields={"prediction": "float"},
failure_defaults={"prediction": np.nan},
),
CastFields(
fields={"references": "float"},
process_every_value=True,
),
]
),
"processors.cast_to_float_return_nan_if_failed",
overwrite=True,
)

add_to_catalog(
SequentialOperator(
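The new processors.cast_to_float_return_nan_if_failed operator is what lets string model outputs reach the now-numeric binary metrics. A pure-Python stand-in for its casting behaviour (the real operator is the CastFields-based SequentialOperator registered above, which also casts the references to float; the helper name here is made up for illustration):

import math

def cast_to_float_or_nan(value):
    """Cast a raw prediction to float, falling back to NaN on failure."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float("nan")

print(cast_to_float_or_nan("0.75"))               # 0.75
print(math.isnan(cast_to_float_or_nan("maybe")))  # True -> the metric receives NaN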
2 changes: 1 addition & 1 deletion prepare/tasks/classification.py
@@ -21,7 +21,7 @@
FormTask(
inputs={"text": "str", "text_type": "str", "class": "str"},
outputs={"class": "str", "label": "int"},
prediction_type="str",
prediction_type="float",
metrics=[
"metrics.accuracy",
"metrics.f1_binary",
@@ -11,6 +11,7 @@
"processors.take_first_word",
"processors.lower_case",
"processors.yes_no_to_int",
"processors.cast_to_float_return_nan_if_failed",
],
),
"templates.grammatical_error_detection.yes_no",
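To see why the extra postprocessor is needed at the end of this template's chain, here is an illustrative walk-through with plain-Python stand-ins for the catalog processors (the stand-ins approximate, rather than reproduce, the unitxt operators):

def take_first_word(text: str) -> str:
    words = text.split()
    return words[0] if words else ""

def lower_case(text: str) -> str:
    return text.lower()

def yes_no_to_int(text: str) -> str:
    # maps yes/no to the strings "1"/"0"; anything else passes through
    return {"yes": "1", "no": "0"}.get(text, text)

def cast_to_float_or_nan(text: str) -> float:
    try:
        return float(text)
    except ValueError:
        return float("nan")

raw = "Yes it does"
out = cast_to_float_or_nan(yes_no_to_int(lower_case(take_first_word(raw))))
print(out)  # 1.0 -- a float, matching the task's new prediction_type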
@@ -0,0 +1,21 @@
{
"type": "sequential_operator",
"steps": [
{
"type": "cast_fields",
"fields": {
"prediction": "float"
},
"failure_defaults": {
"prediction": NaN
}
},
{
"type": "cast_fields",
"fields": {
"references": "float"
},
"process_every_value": true
}
]
}
@@ -9,7 +9,7 @@
"class": "str",
"label": "int"
},
"prediction_type": "str",
"prediction_type": "float",
"metrics": [
"metrics.accuracy",
"metrics.f1_binary"
@@ -7,6 +7,7 @@
"postprocessors": [
"processors.take_first_word",
"processors.lower_case",
"processors.yes_no_to_int"
"processors.yes_no_to_int",
"processors.cast_to_float_return_nan_if_failed"
]
}
119 changes: 72 additions & 47 deletions src/unitxt/metrics.py
@@ -29,7 +29,7 @@
from .random_utils import get_seed
from .settings_utils import get_settings
from .stream import MultiStream, Stream
from .type_utils import isoftype, parse_type_string, to_float_or_default
from .type_utils import isoftype, parse_type_string

logger = get_logger()
settings = get_settings()
@@ -1261,30 +1261,50 @@ class F1Micro(F1):
average = "micro"


class F1Binary(F1):
class F1Binary(GlobalMetric):
"""Calculate f1 for a binary task, using 0.5 as the threshold in the case of float predictions."""

process_single_instances = False
main_score = "f1_binary"
average = "binary"
pos_classes = {"1", "1.0", "yes", "true"}
average = None
threshold = 0.5
prediction_type = "Union[float, int]"
_metric = None
metric = "f1"
single_reference_per_prediction = True

def get_str_id(self, str):
return int(str)
def prepare(self):
super().prepare()
self._metric = evaluate.load(self.metric)

def _validate_reference(self, reference):
super()._validate_reference(reference)
assert reference[0] in [
0,
1,
], f"all references of {self.main_score} must by 0 or 1"

def compute(
self,
references: List[List[str]],
predictions: List[str],
task_data: List[Dict],
) -> dict:
predictions_floats = [to_float_or_default(p) for p in predictions]
predictions = [str(int(p > self.threshold)) for p in predictions_floats]
references = [
["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
]
return super().compute(references, predictions, task_data)
flattened_int_references = [int(r[0]) for r in references]
int_predictions = [int(p > self.threshold) for p in predictions]

result = self._metric.compute(
references=flattened_int_references,
predictions=int_predictions,
labels=[0, 1],
average=self.average,
)
if isinstance(result[self.metric], numpy.ndarray):
return {
self.main_score: result[self.metric][1],
f"{self.main_score}_neg": result[self.metric][0],
}
return {self.main_score: result[self.metric]}


class RecallBinary(F1Binary):
@@ -1538,7 +1558,7 @@ class KendallTauMetric(GlobalMetric):
main_score = "kendalltau_b"
variant = "b"
process_single_instances = False
prediction_type = "str"
prediction_type = "float"

_requirements_list: List[str] = ["scipy"]

@@ -1555,8 +1575,6 @@ def compute(
) -> dict:
if isinstance(references[0], list):
references = [reference[0] for reference in references]
references = [to_float_or_default(r) for r in references]
predictions = [to_float_or_default(p) for p in predictions]

kendall_results = self.kendalltau(references, predictions, variant=self.variant)
corr = kendall_results.correlation
@@ -1602,7 +1620,7 @@ class RocAuc(GlobalMetric):
process_single_instances = False
_requirements_list: List[str] = ["sklearn"]
single_reference_per_prediction = True
prediction_type = "str"
prediction_type = "float"

def prepare(self):
from sklearn import metrics
@@ -1618,8 +1636,6 @@ def compute(
) -> dict:
if isinstance(references[0], list):
references = [reference[0] for reference in references]
references = [to_float_or_default(r) for r in references]
predictions = [to_float_or_default(p) for p in predictions]

false_positive_rates, true_positive_rates, _ = self.roc_curve(
y_true=references, y_score=predictions
@@ -3337,33 +3353,42 @@ class BinaryMaxF1(F1Binary):
"""Calculate the maximal F1 and the decision threshold that achieves it for a binary task with float predictions."""

main_score = "max_f1_binary"
prediction_type = str
single_reference_per_prediction = True

def compute(
self,
references: List[List[str]],
predictions: List[List[str]],
references: List[List[float]],
predictions: List[List[float]],
task_data: List[Dict],
) -> dict:
float_predictions = [to_float_or_default(p) for p in predictions]

best_thr = -1
best_f1 = -1
thrs = {round(fp, 3) for fp in float_predictions}
best_thr_neg = -1
best_f1_neg = -1
thrs = {round(fp, 3) for fp in predictions}
for thr in thrs:
new_predictions = [
"1" if float_prediction >= thr else "0"
for float_prediction in float_predictions
]
f1 = super().compute(references, new_predictions, task_data)[
self.main_score
1.0 if float_prediction >= thr else 0.0
for float_prediction in predictions
]
f1_results = super().compute(references, new_predictions, task_data)

f1 = f1_results[self.main_score]
if f1 > best_f1:
best_f1 = f1
best_thr = thr

return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
f1_neg = f1_results[f"{self.main_score}_neg"]
if f1_neg > best_f1_neg:
best_f1_neg = f1_neg
best_thr_neg = thr

return {
self.main_score: best_f1,
"best_thr_maxf1": best_thr,
f"{self.main_score}_neg": best_f1_neg,
"best_thr_maxf1_neg": best_thr_neg,
}


class BinaryAccuracy(InstanceMetric):
@@ -3372,20 +3397,25 @@ class BinaryAccuracy(InstanceMetric):
reduction_map = {"mean": ["accuracy_binary"]}
main_score = "accuracy_binary"
ci_scores = ["accuracy_binary"]
pos_classes = {"1", "1.0", "yes", "true"}
threshold = 0.5

prediction_type = "str"
prediction_type = "Union[float,int]"
single_reference_per_prediction = True

def _validate_reference(self, reference):
super()._validate_reference(reference)
assert reference[0] in [
0,
1,
], f"all references of {self.main_score} must by 0 or 1"

def compute(
self, references: List[Any], prediction: Any, task_data: List[Dict]
self, references: List[float], prediction: float, task_data: List[Dict]
) -> dict:
float_prediction = to_float_or_default(prediction)
prediction = str(int(float_prediction > self.threshold))
references = ["1"] if references[0].lower() in self.pos_classes else ["0"]
prediction = int(prediction > self.threshold)
reference = int(references[0])

result = {self.main_score: float([prediction] == references)}
result = {self.main_score: float(prediction == reference)}
result["score"] = result[self.main_score]
result["score_name"] = self.main_score
return result
@@ -3396,9 +3426,7 @@ class BinaryMaxAccuracy(GlobalMetric):

process_single_instances = False
main_score = "max_accuracy_binary"
pos_classes = {"1", "1.0", "yes", "true"}

prediction_type = "str"
prediction_type = "Union[float,int]"
single_reference_per_prediction = True

def compute(
@@ -3407,10 +3435,7 @@ def compute(
predictions: List[str],
task_data: List[Dict],
) -> dict:
float_predictions = [to_float_or_default(p) for p in predictions]
references = [
["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
]
references = [[int(r[0])] for r in references]

# Sticking to the test >= thr, accuracy induced by threshold thr is the number of float predictions
# that pass the test (are >= thr) and are paired with reference "1" plus the number of float predictions that
@@ -3421,8 +3446,8 @@ def compute(
# the largest float predictions, to induce the partition into all-failing , none-passing.

fp = [
(float_predictions[i], i, -1 if references[i][0] == "1" else +1)
for i in range(len(float_predictions))
(predictions[i], i, -1 if references[i][0] == 1 else +1)
for i in range(len(predictions))
]
fp.sort()
# each triplet above: float-prediction f; f's ordinal position in float_predictions, which is also
@@ -3436,7 +3461,7 @@ def compute(

current_thr = fp[0][0]
# partition float_predictions into all-passing, none-failing
current_acc = sum(r[0] == "1" for r in references)
current_acc = sum(r[0] == 1 for r in references)
# number of predictions that thr sends to the reference they are paired with

best_acc = current_acc
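The most involved change above is in BinaryMaxF1, which now tracks the best threshold for the negative class alongside the positive one. A compact standalone sketch of that sweep follows, using sklearn's f1_score instead of the evaluate wrapper and illustrative data; BinaryMaxAccuracy performs the analogous maximisation for accuracy with a single sorted pass instead of recomputing per threshold.

from sklearn.metrics import f1_score

references = [1, 0, 1, 1, 0, 0]
predictions = [0.91, 0.45, 0.52, 0.76, 0.38, 0.62]

best = {"max_f1_binary": -1.0, "best_thr_maxf1": -1.0,
        "max_f1_binary_neg": -1.0, "best_thr_maxf1_neg": -1.0}

# Traverse the (rounded) prediction values as candidate thresholds, as in the diff.
for thr in sorted({round(p, 3) for p in predictions}):
    hard = [1 if p >= thr else 0 for p in predictions]
    per_class = f1_score(references, hard, labels=[0, 1], average=None)
    if per_class[1] > best["max_f1_binary"]:
        best["max_f1_binary"], best["best_thr_maxf1"] = per_class[1], thr
    if per_class[0] > best["max_f1_binary_neg"]:
        best["max_f1_binary_neg"], best["best_thr_maxf1_neg"] = per_class[0], thr

print(best)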
12 changes: 11 additions & 1 deletion src/unitxt/task.py
@@ -3,7 +3,13 @@
from .artifact import fetch_artifact
from .logging_utils import get_logger
from .operator import StreamInstanceOperator
from .type_utils import isoftype, parse_type_string, verify_required_schema
from .type_utils import (
get_args,
get_origin,
isoftype,
parse_type_string,
verify_required_schema,
)


class Tasker:
@@ -79,6 +85,10 @@ def check_metrics_type(self) -> None:
prediction_type == metric_prediction_type
or prediction_type == Any
or metric_prediction_type == Any
or (
get_origin(metric_prediction_type) is Union
and prediction_type in get_args(metric_prediction_type)
)
):
continue

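The Union handling added to check_metrics_type can be exercised in isolation. A sketch using the standard typing helpers (the diff imports get_args/get_origin from unitxt's type_utils; the plain typing versions are used here to keep the sketch self-contained, and the function name is illustrative):

from typing import Any, Union, get_args, get_origin

def metric_type_compatible(prediction_type, metric_prediction_type) -> bool:
    """True if a task's prediction_type satisfies a metric's prediction_type."""
    if prediction_type == metric_prediction_type:
        return True
    if prediction_type == Any or metric_prediction_type == Any:
        return True
    # New case from this commit: the metric declares a Union and the task's
    # prediction type is one of its members.
    return (
        get_origin(metric_prediction_type) is Union
        and prediction_type in get_args(metric_prediction_type)
    )

print(metric_type_compatible(float, Union[float, int]))  # True
print(metric_type_compatible(str, Union[float, int]))    # False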
4 changes: 2 additions & 2 deletions tests/library/test_artifact.py
@@ -68,9 +68,9 @@ def test_artifact_loading_with_overwrite_args_with_list_of_operators(self):
self.assertEqual(artifact.steps[0].string, "no")

def test_artifact_loading_with_overwrite_args_list(self):
artifact_identifier = "tasks.classification.binary.zero_or_one[metrics=[metrics.rouge, metrics.accuracy]]"
artifact_identifier = "tasks.classification.binary.zero_or_one[metrics=[metrics.roc_auc, metrics.accuracy]]"
artifact, _ = fetch_artifact(artifact_identifier)
self.assertEqual(artifact.metrics, ["metrics.rouge", "metrics.accuracy"])
self.assertEqual(artifact.metrics, ["metrics.roc_auc", "metrics.accuracy"])

def test_artifact_loading_with_overwrite_args_dict(self):
with temp_catalog() as catalog_path: