fix specificity calculation for CBPE (#334)
* fix specificity calculation for CBPE

* Feature/multiclass confusion matrix (#287)

* create tester

* Updated tester

* multiclass cm performance estimation

* Multiclass confusion matrix calc. and estimation + docs and tests for both

* Removed scratch testing files

* updating MCM docs

* Re-align docs with main version

* [skip ci] Update CHANGELOG.md

---------

Co-authored-by: Nikolaos Perrakis <[email protected]>
Co-authored-by: Niels Nuyttens <[email protected]>

* Small refactor to checks in realized performance calculations to make them consistent with the dedicated realized performance calculator.

* Fix broken tests

* Fix linting errors due to merges

---------

Co-authored-by: Carter Blair <[email protected]>
Co-authored-by: Niels Nuyttens <[email protected]>
3 people authored Nov 9, 2023
1 parent 80507da commit 58d44bc
Showing 4 changed files with 586 additions and 65 deletions.
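The headline change is the binary specificity formula in CBPE's realized-performance check: the old line divided entries from the second row of the confusion matrix, cm[1, 1] / (cm[1, 0] + cm[1, 1]) = TP / (TP + FN), which is recall, not specificity. The fix unpacks the matrix and computes TN / (TN + FP). A minimal sketch of the corrected calculation, assuming binary 0/1 labels and scikit-learn's default label ordering; the helper name is illustrative and not part of NannyML:

import numpy as np
from sklearn.metrics import confusion_matrix


def realized_specificity(y_true, y_pred) -> float:
    # With labels [0, 1], ravel() yields the cell counts in the order tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    if tn + fp == 0:
        return np.nan  # no negative examples in this chunk
    return tn / (tn + fp)


# The old formula returns recall instead: on y_true=[0, 0, 1, 1], y_pred=[0, 1, 1, 1]
# it gives 1.0, while the actual specificity is 0.5.
print(realized_specificity([0, 0, 1, 1], [0, 1, 1, 1]))  # 0.5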
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Author: Niels Nuyttens <[email protected]>
#
# License: Apache Software License 2.0
import warnings
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
import warnings

from nannyml._typing import ProblemType
from nannyml.base import _list_missing
Original file line number Diff line number Diff line change
@@ -7,24 +7,25 @@
# License: Apache Software License 2.0

"""Module containing metric utilities and implementations."""
import warnings
from typing import Dict, List, Optional, Tuple, Union # noqa: TYP001

import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import (
accuracy_score,
confusion_matrix,
f1_score,
multilabel_confusion_matrix,
precision_score,
recall_score,
roc_auc_score,
confusion_matrix,
)
from sklearn.preprocessing import LabelBinarizer, label_binarize

from nannyml._typing import ProblemType, class_labels, model_output_column_names
from nannyml.base import _list_missing
from nannyml.chunk import Chunker
from nannyml.exceptions import InvalidArgumentsException
from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning
from nannyml.sampling_error.multiclass_classification import (
@@ -44,7 +45,6 @@
multiclass_confusion_matrix_sampling_error_components,
)
from nannyml.thresholds import Threshold, calculate_threshold_values
from nannyml.chunk import Chunker


@MetricFactory.register(metric='roc_auc', use_case=ProblemType.CLASSIFICATION_MULTICLASS)
@@ -674,7 +674,10 @@ def _get_components(self, classes: List[str]) -> List[Tuple[str, str]]:
for true_class in classes:
for pred_class in classes:
components.append(
(f"true class: '{true_class}', predicted class: '{pred_class}'", f'true_{true_class}_pred_{pred_class}')
(
f"true class: '{true_class}', predicted class: '{pred_class}'",
f'true_{true_class}_pred_{pred_class}',
)
)

return components
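The multiclass confusion-matrix feature referenced in the commit message (#287) flattens the matrix into one component per (true class, predicted class) pair, named true_<true>_pred_<pred> exactly as built above. A minimal sketch of that flattening, assuming scikit-learn's confusion_matrix with an explicit class ordering; the helper name and example class labels are illustrative, not NannyML API:

from itertools import product
from typing import Dict, List

import pandas as pd
from sklearn.metrics import confusion_matrix


def flatten_confusion_matrix(y_true: pd.Series, y_pred: pd.Series, classes: List[str]) -> Dict[str, int]:
    # Rows are true classes, columns are predicted classes, both in the order of `classes`.
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    return {
        f'true_{t}_pred_{p}': cm[i, j]
        for (i, t), (j, p) in product(enumerate(classes), enumerate(classes))
    }


# e.g. flatten_confusion_matrix(df['y_true'], df['y_pred'], classes=['a', 'b', 'c'])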
164 changes: 160 additions & 4 deletions nannyml/performance_estimation/confidence_based/metrics.py
Original file line number Diff line number Diff line change
@@ -11,6 +11,7 @@

import abc
import logging
import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union

import numpy as np
@@ -402,6 +403,11 @@ def _realized_performance(self, data: pd.DataFrame) -> float:
y_pred_proba, _, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized ROC-AUC.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized ROC-AUC.")
return np.NaN

return roc_auc_score(y_true, y_pred_proba)
@@ -494,6 +500,15 @@ def _realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized F1 score.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized F1 score.")
return np.NaN

if y_pred.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized F1 score.")
return np.NaN

return f1_score(y_true=y_true, y_pred=y_pred)
@@ -570,6 +585,15 @@ def _realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized precision.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized precision.")
return np.NaN

if y_pred.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized precision.")
return np.NaN

return precision_score(y_true=y_true, y_pred=y_pred)
@@ -644,6 +668,15 @@ def _realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized recall.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as recall precision.")
return np.NaN

if y_pred.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as recall precision.")
return np.NaN

return recall_score(y_true=y_true, y_pred=y_pred)
@@ -718,10 +751,19 @@ def _realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized specificity.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized specificity.")
return np.NaN

if y_pred.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized specificity.")
return np.NaN

conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
return conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
return tn / (tn + fp)
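The nunique checks above are the same guard that this commit threads through every realized-performance method: warn and return NaN when targets are absent or when y_true / y_pred contain a single unique value, instead of calling the sklearn metric on degenerate chunk data. A standalone sketch of that pattern, assuming pandas Series inputs; the helper below is illustrative, not NannyML API:

import warnings
from typing import Callable, Optional

import numpy as np
import pandas as pd


def guarded_realized_metric(
    y_true: Optional[pd.Series], y_pred: Optional[pd.Series], metric_fn: Callable, metric_name: str
) -> float:
    # Guard clauses mirroring the checks added in this commit.
    if y_true is None:
        warnings.warn(f"No 'y_true' values given for chunk, returning NaN as realized {metric_name}.")
        return np.nan
    if y_true.nunique() <= 1:
        warnings.warn(f"Too few unique values present in 'y_true', returning NaN as realized {metric_name}.")
        return np.nan
    if y_pred is not None and y_pred.nunique() <= 1:
        warnings.warn(f"Too few unique values present in 'y_pred', returning NaN as realized {metric_name}.")
        return np.nan
    return metric_fn(y_true=y_true, y_pred=y_pred)


# e.g. guarded_realized_metric(chunk['y_true'], chunk['y_pred'], f1_score, "F1 score")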


def estimate_specificity(y_pred: pd.DataFrame, y_pred_proba: pd.DataFrame) -> float:
@@ -797,6 +839,15 @@ def _realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized accuracy.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized accuracy.")
return np.NaN

if y_pred.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized accuracy.")
return np.NaN

return accuracy_score(y_true=y_true, y_pred=y_pred)
@@ -961,6 +1012,15 @@ def _true_positive_realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized confusion matrix.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized confusion matrix.")
return np.NaN

if y_pred.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized confusion matrix.")
return np.NaN

num_tp = np.sum(np.logical_and(y_pred, y_true))
@@ -980,6 +1040,7 @@ def _true_negative_realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized confusion matrix.")
return np.NaN

num_tn = np.sum(np.logical_and(np.logical_not(y_pred), np.logical_not(y_true)))
@@ -999,6 +1060,15 @@ def _false_positive_realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized confusion matrix.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized confusion matrix.")
return np.NaN

if y_pred.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized confusion matrix.")
return np.NaN

num_tp = np.sum(np.logical_and(y_pred, y_true))
@@ -1018,6 +1088,15 @@ def _false_negative_realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized confusion matrix.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized confusion matrix.")
return np.NaN

if y_pred.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized confusion matrix.")
return np.NaN

num_tp = np.sum(np.logical_and(y_pred, y_true))
@@ -1500,6 +1579,15 @@ def _realized_performance(self, data: pd.DataFrame) -> float:
_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized business value.")
return np.NaN

if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized business value.")
return np.NaN

if y_pred.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized business value.")
return np.NaN

tp_value = self.business_value_matrix[1, 1]
@@ -1677,7 +1765,13 @@ def _sampling_error(self, data: pd.DataFrame) -> float:

def _realized_performance(self, data: pd.DataFrame) -> float:
data = self._ensure_targets(data)

if data is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized ROC-AUC.")
return np.NaN

if data[self.y_true].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized ROC-AUC.")
return np.NaN

_, y_pred_probas, labels = _get_multiclass_uncalibrated_predictions(data, self.y_pred, self.y_pred_proba)
@@ -1734,7 +1828,17 @@ def _sampling_error(self, data: pd.DataFrame) -> float:

def _realized_performance(self, data: pd.DataFrame) -> float:
data = self._ensure_targets(data)

if data is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized F1 score.")
return np.NaN

if data[self.y_true].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized F1 score.")
return np.NaN

if data[self.y_pred].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized F1 score.")
return np.NaN

y_pred, _, labels = _get_multiclass_uncalibrated_predictions(data, self.y_pred, self.y_pred_proba)
@@ -1791,7 +1895,17 @@ def _sampling_error(self, data: pd.DataFrame) -> float:

def _realized_performance(self, data: pd.DataFrame) -> float:
data = self._ensure_targets(data)

if data is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized precision.")
return np.NaN

if data[self.y_true].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized precision.")
return np.NaN

if data[self.y_pred].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized precision.")
return np.NaN

y_pred, _, labels = _get_multiclass_uncalibrated_predictions(data, self.y_pred, self.y_pred_proba)
@@ -1848,7 +1962,17 @@ def _sampling_error(self, data: pd.DataFrame) -> float:

def _realized_performance(self, data: pd.DataFrame) -> float:
data = self._ensure_targets(data)

if data is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized recall.")
return np.NaN

if data[self.y_true].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized recall.")
return np.NaN

if data[self.y_pred].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized recall.")
return np.NaN

y_pred, _, labels = _get_multiclass_uncalibrated_predictions(data, self.y_pred, self.y_pred_proba)
@@ -1905,7 +2029,17 @@ def _sampling_error(self, data: pd.DataFrame) -> float:

def _realized_performance(self, data: pd.DataFrame) -> float:
data = self._ensure_targets(data)

if data is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized specificity.")
return np.NaN

if data[self.y_true].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized specificity.")
return np.NaN

if data[self.y_pred].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized specificity.")
return np.NaN

y_pred, _, labels = _get_multiclass_uncalibrated_predictions(data, self.y_pred, self.y_pred_proba)
@@ -1964,8 +2098,19 @@ def _sampling_error(self, data: pd.DataFrame) -> float:

def _realized_performance(self, data: pd.DataFrame) -> float:
data = self._ensure_targets(data)

if data is None:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized accuracy.")
return np.NaN

if data[self.y_true].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized accuracy.")
return np.NaN

if data[self.y_pred].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized accuracy.")
return np.NaN

y_pred, _, _ = _get_multiclass_uncalibrated_predictions(data, self.y_pred, self.y_pred_proba)
return accuracy_score(data[self.y_true], y_pred)

@@ -2011,7 +2156,10 @@ def _get_components(self, classes: List[str]) -> List[Tuple[str, str]]:
for true_class in classes:
for pred_class in classes:
components.append(
(f"true class: '{true_class}', predicted class: '{pred_class}'", f'true_{true_class}_pred_{pred_class}')
(
f"true class: '{true_class}', predicted class: '{pred_class}'",
f'true_{true_class}_pred_{pred_class}',
)
)

return components
@@ -2074,8 +2222,16 @@ def _multiclass_confusion_matrix_alert_thresholds(
return alert_thresholds

def _multi_class_confusion_matrix_realized_performance(self, data: pd.DataFrame) -> Union[np.ndarray, float]:
if data is None or self.y_true not in data.columns:
warnings.warn("No 'y_true' values given for chunk, returning NaN as realized precision.")
return np.NaN

if data[self.y_true].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized precision.")
return np.NaN

if self.y_true not in data.columns or data[self.y_true].isna().all():
if data[self.y_pred].nunique() <= 1:
warnings.warn("Too few unique values present in 'y_pred', returning NaN as realized precision.")
return np.NaN

cm = confusion_matrix(