closes #3: added simple inspection method

TRR318 · Aug 10, 2023 · e86a2bf · e86a2bf
1 parent 0915684
commit e86a2bf
Show file tree

Hide file tree

Showing 3 changed files with 249 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -10,15 +10,32 @@ pip install scikit-psl
 ```
 
 # Usage
+
 ```python
-from skpsl import ProbabilisticScoringList
 from sklearn.datasets import make_classification
-from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import ShuffleSplit
+
+from skpsl import ProbabilisticScoringList
 
 # Generating synthetic data with continuous features and a binary target variable
 X, y = make_classification(random_state=42)
 X = (X > .5).astype(int)
 
 clf = ProbabilisticScoringList([-1, 1, 2])
-print(cross_val_score(clf, X, y, cv=5))
+
+for train, test in ShuffleSplit(1, test_size=.2, random_state=42).split(X):
+    psl = ProbabilisticScoringList([-1, 1, 2])
+    psl.fit(X[train], y[train])
+    print(f"Brier score: {psl.score(X[test], y[test]):.4f}")
+    #>  Brier score: 0.1924  (lower is better)
+
+    df = psl.inspect(5)
+    print(df.to_string(index=False, na_rep="-", justify="center", float_format=lambda x: f"{x:.2f}"))
+    #>  Stage  Score  T = -3  T = -2  T = -1  T = 0  T = 1  T = 2  T = 3
+    #>   0        -       -       -       -   0.54      -      -      - 
+    #>   1     2.00       -       -       -   0.18      -   0.97      - 
+    #>   2    -1.00       -       -    0.00   0.28   0.91   1.00      - 
+    #>   3    -1.00       -    0.00    0.07   0.86   0.91   1.00      - 
+    #>   4     1.00       -    0.00    0.00   0.29   0.92   1.00   1.00 
+    #>   5    -1.00    0.00    0.00    0.00   0.40   1.00   1.00   1.00
 ```
diff --git a/scratch/psl_describe.ipynb b/scratch/psl_describe.ipynb
@@ -0,0 +1,203 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "adc6c663-b073-4451-9821-216c578bbd69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from skpsl import ProbabilisticScoringList\n",
+    "from sklearn.datasets import make_classification\n",
+    "from sklearn.model_selection import cross_val_score, ShuffleSplit\n",
+    "from functools import reduce\n",
+    "from operator import or_\n",
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "764414e5-261c-4a77-b14b-2235472b9baf",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Brier score: 0.1924\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Generating synthetic data with continuous features and a binary target variable\n",
+    "X, y = make_classification(random_state=42)\n",
+    "X = (X > .5).astype(int)\n",
+    "\n",
+    "for train, test in ShuffleSplit(1, test_size=.2, random_state=42).split(X):\n",
+    "    psl = ProbabilisticScoringList([-1, 1, 2])\n",
+    "    psl.fit(X[train], y[train])\n",
+    "    print(f\"Brier score: {psl.score(X[test], y[test]):.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "26a044d2-d3a1-43eb-bbef-b8e7050ce568",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Stage</th>\n",
+       "      <th>Score</th>\n",
+       "      <th>T = -3</th>\n",
+       "      <th>T = -2</th>\n",
+       "      <th>T = -1</th>\n",
+       "      <th>T = 0</th>\n",
+       "      <th>T = 1</th>\n",
+       "      <th>T = 2</th>\n",
+       "      <th>T = 3</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>0.5375</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>0.1818</td>\n",
+       "      <td>-</td>\n",
+       "      <td>0.9722</td>\n",
+       "      <td>-</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.2759</td>\n",
+       "      <td>0.9091</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>-</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.069</td>\n",
+       "      <td>0.8571</td>\n",
+       "      <td>0.9091</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>-</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>-</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.2857</td>\n",
+       "      <td>0.9167</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>5</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.4000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Stage Score T = -3 T = -2 T = -1   T = 0   T = 1   T = 2 T = 3\n",
+       "0      0     -      -      -      -  0.5375       -       -     -\n",
+       "1      1   2.0      -      -      -  0.1818       -  0.9722     -\n",
+       "2      2  -1.0      -      -    0.0  0.2759  0.9091     1.0     -\n",
+       "3      3  -1.0      -    0.0  0.069  0.8571  0.9091     1.0     -\n",
+       "4      4   1.0      -    0.0    0.0  0.2857  0.9167     1.0   1.0\n",
+       "5      5  -1.0    0.0    0.0    0.0  0.4000     1.0     1.0   1.0"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = psl.inspect(5)\n",
+    "\n",
+    "pd.set_option(\"display.precision\", 4)\n",
+    "df.fillna(\"-\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/skpsl/probabilistic_scoring_list.py b/skpsl/probabilistic_scoring_list.py
@@ -3,6 +3,7 @@
 from typing import List
 
 import numpy as np
+import pandas as pd
 from joblib import Parallel, delayed
 from scipy.stats import entropy as stats_entropy
 from sklearn.base import BaseEstimator, ClassifierMixin
@@ -105,7 +106,7 @@ def fit(self, X, y, l=1, n_jobs=1, predef_features=None, predef_scores=None) ->
         :param n_jobs: passed to joblib for parallelization
         :param predef_features:
         :param predef_scores:
-        :return:
+        :return: The fitted classifier
         """
 
         number_features = X.shape[1]
@@ -162,15 +163,36 @@ def predict_proba(self, X, k=-1):
 
         return self.stage_clfs[k].predict_proba(X)
 
-    def score(self, X, y, sample_weight=None):
+    def score(self, X, y, k=-1, sample_weight=None):
         """
         Calculates the Brier score of the model
         :param X:
         :param y:
-        :param sample_weight:
+        :param k: Classifier stage to use for prediction (default -1: the last stage)
+        :param sample_weight: ignored; it is not forwarded to brier_score_loss
+        :return: Brier score loss of column 1 of ``predict_proba(X, k)`` against y (lower is better)
+        """
+        return brier_score_loss(y, self.predict_proba(X, k=k)[:, 1])
+
+    def inspect(self, k=None, feature_names=None) -> pd.DataFrame:
+        """
+        Returns a dataframe that visualizes the internal model
+
+        :param k: maximum stage to include in the visualization (default: all stages)
+        :param feature_names: names of the features.
         :return:
         """
-        return brier_score_loss(y, self.predict_proba(X)[:, 1])
+        k = k or len(self.stage_clfs) - 1
+
+        pmfs = [clf.probabilities for clf in self.stage_clfs[:k + 1]]
+        all_total_scores = sorted(set.union(*[set(pmf.keys()) for pmf in pmfs]))
+        data = [[pmfs[i].get(t_, np.nan) for t_ in all_total_scores] for i in range(k + 1)]
+
+        df = pd.DataFrame(columns=[f"T = {t_}" for t_ in all_total_scores], data=data)
+        df.insert(0, "Score", [np.nan] + self.stage_clfs[k].scores)
+        if feature_names is not None:
+            df.insert(0, "Feature", [np.nan] + feature_names[:k] + [np.nan] * (k - len(feature_names)))
+        return df.reset_index(names=["Stage"])
 
     @property
     def features(self):