diff --git a/README.md b/README.md index a8816b6..e9466fc 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,32 @@ pip install scikit-psl ``` # Usage + ```python -from skpsl import ProbabilisticScoringList from sklearn.datasets import make_classification -from sklearn.model_selection import cross_val_score +from sklearn.model_selection import ShuffleSplit + +from skpsl import ProbabilisticScoringList # Generating synthetic data with continuous features and a binary target variable X, y = make_classification(random_state=42) X = (X > .5).astype(int) clf = ProbabilisticScoringList([-1, 1, 2]) -print(cross_val_score(clf, X, y, cv=5)) + +for train, test in ShuffleSplit(1, test_size=.2, random_state=42).split(X): + psl = ProbabilisticScoringList([-1, 1, 2]) + psl.fit(X[train], y[train]) + print(f"Brier score: {psl.score(X[test], y[test]):.4f}") + #> Brier score: 0.1924 (lower is better) + + df = psl.inspect(5) + print(df.to_string(index=False, na_rep="-", justify="center", float_format=lambda x: f"{x:.2f}")) + #> Stage Score T = -3 T = -2 T = -1 T = 0 T = 1 T = 2 T = 3 + #> 0 - - - - 0.54 - - - + #> 1 2.00 - - - 0.18 - 0.97 - + #> 2 -1.00 - - 0.00 0.28 0.91 1.00 - + #> 3 -1.00 - 0.00 0.07 0.86 0.91 1.00 - + #> 4 1.00 - 0.00 0.00 0.29 0.92 1.00 1.00 + #> 5 -1.00 0.00 0.00 0.00 0.40 1.00 1.00 1.00 ``` diff --git a/scratch/psl_describe.ipynb b/scratch/psl_describe.ipynb new file mode 100644 index 0000000..2e98e3a --- /dev/null +++ b/scratch/psl_describe.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "adc6c663-b073-4451-9821-216c578bbd69", + "metadata": {}, + "outputs": [], + "source": [ + "from skpsl import ProbabilisticScoringList\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.model_selection import cross_val_score, ShuffleSplit\n", + "from functools import reduce\n", + "from operator import or_\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "764414e5-261c-4a77-b14b-2235472b9baf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Brier score: 0.1924\n" + ] + } + ], + "source": [ + "# Generating synthetic data with continuous features and a binary target variable\n", + "X, y = make_classification(random_state=42)\n", + "X = (X > .5).astype(int)\n", + "\n", + "for train, test in ShuffleSplit(1, test_size=.2, random_state=42).split(X):\n", + " psl = ProbabilisticScoringList([-1, 1, 2])\n", + " psl.fit(X[train], y[train])\n", + " print(f\"Brier score: {psl.score(X[test], y[test]):.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "26a044d2-d3a1-43eb-bbef-b8e7050ce568", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML rendering of the psl.inspect(5) dataframe, columns Stage, Score, and T = -3 through T = 3; the same table appears in the text/plain output below]
" + ], + "text/plain": [ + " Stage Score T = -3 T = -2 T = -1 T = 0 T = 1 T = 2 T = 3\n", + "0 0 - - - - 0.5375 - - -\n", + "1 1 2.0 - - - 0.1818 - 0.9722 -\n", + "2 2 -1.0 - - 0.0 0.2759 0.9091 1.0 -\n", + "3 3 -1.0 - 0.0 0.069 0.8571 0.9091 1.0 -\n", + "4 4 1.0 - 0.0 0.0 0.2857 0.9167 1.0 1.0\n", + "5 5 -1.0 0.0 0.0 0.0 0.4000 1.0 1.0 1.0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = psl.inspect(5)\n", + "\n", + "pd.set_option(\"display.precision\", 4)\n", + "df.fillna(\"-\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/skpsl/probabilistic_scoring_list.py b/skpsl/probabilistic_scoring_list.py index f06e4f2..eb2e2d9 100644 --- a/skpsl/probabilistic_scoring_list.py +++ b/skpsl/probabilistic_scoring_list.py @@ -3,6 +3,7 @@ from typing import List import numpy as np +import pandas as pd from joblib import Parallel, delayed from scipy.stats import entropy as stats_entropy from sklearn.base import BaseEstimator, ClassifierMixin @@ -105,7 +106,7 @@ def fit(self, X, y, l=1, n_jobs=1, predef_features=None, predef_scores=None) -> :param n_jobs: passed to joblib for parallelization :param predef_features: :param predef_scores: - :return: + :return: The fitted classifier """ number_features = X.shape[1] @@ -162,15 +163,36 @@ def predict_proba(self, X, k=-1): return self.stage_clfs[k].predict_proba(X) - def score(self, X, y, sample_weight=None): + def score(self, X, y, k=-1, sample_weight=None): """ Calculates the Brier score of the model :param X: :param y: - :param sample_weight: + :param k: Classifier stage to use for prediction + :param sample_weight: ignored + :return: + """ + return brier_score_loss(y, self.predict_proba(X, k=k)[:, 1]) + + def inspect(self, k=None, feature_names=None) -> pd.DataFrame: + """ + Returns a dataframe that visualizes the internal model + + :param k: maximum stage to include in the visualization (default: all stages) + :param feature_names: names of the features. :return: """ - return brier_score_loss(y, self.predict_proba(X)[:, 1]) + k = k or len(self.stage_clfs) - 1 + + pmfs = [clf.probabilities for clf in self.stage_clfs[:k + 1]] + all_total_scores = sorted(set.union(*[set(pmf.keys()) for pmf in pmfs])) + data = [[pmfs[i].get(t_, np.nan) for t_ in all_total_scores] for i in range(k + 1)] + + df = pd.DataFrame(columns=[f"T = {t_}" for t_ in all_total_scores], data=data) + df.insert(0, "Score", [np.nan] + self.stage_clfs[k].scores) + if feature_names is not None: + df.insert(0, "Feature", [np.nan] + feature_names[:k] + [np.nan] * (k - len(feature_names))) + return df.reset_index(names=["Stage"]) @property def features(self):