l-step lookahead evaluated using synthetic datasets

Changed PSL scoring to the Brier score
TRR318 · Aug 9, 2023 · a29390b · a29390b
1 parent 9991a4d
commit a29390b
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 6 deletions.
diff --git a/experiments/performance.ipynb b/experiments/performance.ipynb
@@ -37,23 +37,29 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "0.9604\n",
-      "CPU times: user 157 ms, sys: 108 ms, total: 266 ms\n",
-      "Wall time: 18 s\n"
+      "0.0274\n",
+      "CPU times: user 33.9 ms, sys: 87 ms, total: 121 ms\n",
+      "Wall time: 3.03 s\n"
      ]
     }
    ],
    "source": [
     "%%time\n",
     "clf = ProbabilisticScoringList([-1, 1, 2])\n",
-    "print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=1), cv=ShuffleSplit(100, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")"
+    "print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=1), cv=ShuffleSplit(5, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "eb1d191d-38b6-4f1c-8740-7101c9bd192c",
    "metadata": {},
    "source": [
+    "non-neg 0.2349\n",
+    "l1 0.0265\n",
+    "l2 0.0276 20\n",
+    "l2 0.0315 5\n",
+    "l3 0.0320 5\n",
+    "\n",
     "### 0.2.0\n",
     "- l=2 14min 11s, 0.9599\n",
     "- l=1 30s, 0.9604\n",

diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,8 @@ python = ">=3.9,<3.13"
 scikit-learn = "^1.3.0"
 numpy = "^1.25.2"
 scipy = "^1.11.1"
-joblib = "^1.3.1"
+joblib = "^1.3.2"
+pandas = "^2.0.3"
 
 sphinx = { version = "^7.1", optional = true }
 sphinx_rtd_theme = { version = "^1.2", optional = true  }

diff --git a/skpsl/data/__init__.py b/skpsl/data/__init__.py
diff --git a/skpsl/data/dataset.py b/skpsl/data/dataset.py
@@ -0,0 +1,52 @@
+import numpy as np
+from itertools import permutations
+
+from sklearn.model_selection import ShuffleSplit
+
+from skpsl.probabilistic_scoring_list import _ClassifierAtK
+from skpsl import ProbabilisticScoringList
+
+
+def lookahead_example():
+    """
+    Print the current dataset if fitting with ``l=2`` yields a lower ``score`` than the default fit.
+
+    The function takes no arguments and works on module-level globals instead:
+    ``s`` (score set), ``f`` (feature indices), ``w`` (row weights, only printed)
+    and ``X_``/``y_`` (the dataset). They must be assigned before the call.
+
+    Example of a reported case:
+
+               scores→ [2,1]   [1,2]
+    (2, 5, 1, 4, 3)   0.7839  0.7783 l2_psl.scores=[1, 2] l2_psl.stage_clfs[-1].score(X_)=0.7783  score_l1=0.1944 score_l2=0.1926
+    """
+    psl = ProbabilisticScoringList(s)
+    psl.fit(X_, y_)
+    if psl.features == [0, 1] and psl.scores == [2, 1]:
+        # The default fit selected feature 0 first and gave it the larger score.
+        # Now test whether swapping the two scores changes the stage classifier's score.
+        # NOTE(review): the two-way unpacking below only works when `s` has exactly
+        # two entries, because permutations(s) must yield exactly two orderings.
+        score, invscore = [_ClassifierAtK(features=f, scores=s_).fit(X_, y_, ).score(X_) for s_ in permutations(s)]
+        if score > invscore:
+            score_l1 = psl.score(X_, y_)
+            l2_psl = ProbabilisticScoringList(s).fit(X_, y_, l=2)
+            score_l2 = l2_psl.score(X_, y_)
+            if score_l2 < score_l1:
+                print(
+                    f"{w}   {score:.4f}  {invscore:.4f} {l2_psl.scores=} {l2_psl.stage_clfs[-1].score(X_)=}  {score_l1=} {score_l2=}")
+                # Stack the features and the label column side by side, then transpose so
+                # that each row is one variable, which is the layout np.corrcoef expects.
+                # The transpose has to be applied to the stacked array, not to the list.
+                print(np.corrcoef(np.hstack([X_, y_.reshape(-1, 1)]).T))
+
+
+if __name__ == '__main__':
+    # Base dataset: six binary samples over two features, three per class.
+    X = np.array([[1, 0], [0, 1], [1, 1], [1, 0], [0, 1], [1, 1]])
+    y = np.array([0, 0, 0, 1, 1, 1])
+    f = [0, 1]
+    # s = [2, 1]
+    s = [2, 1, -1]
+
+    # The splitter settings and seed are identical for every weighting, so build it once.
+    rs = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
+
+    # Each permutation assigns the repetition counts 1..6 to the six base rows,
+    # producing one synthetic dataset per permutation.
+    for w in permutations(range(1, X.shape[0] + 1)):
+        X_ = np.repeat(X, np.array(w), axis=0)
+        y_ = np.repeat(y, np.array(w), axis=0)
+
+        # Split the repeated dataset itself. Indices generated from the 6-row base
+        # matrix would only ever address the first six rows of X_/y_.
+        for (train_index, test_index) in rs.split(X_):
+            l1_psl = ProbabilisticScoringList(s).fit(X_[train_index], y_[train_index])
+            l1_out = l1_psl.score(X_[test_index], y_[test_index])
+            l1_in = l1_psl.score(X_[train_index], y_[train_index])
+
+            l2_psl = ProbabilisticScoringList(s).fit(X_[train_index], y_[train_index], l=2)
+            l2_out = l2_psl.score(X_[test_index], y_[test_index])
+            l2_in = l2_psl.score(X_[train_index], y_[train_index])
+            # Report splits where the in-sample score of the default fit exceeds that of the l=2 fit.
+            if l1_in > l2_in:
+                print(f"{l1_in=} {l2_in=} {l1_out=} {l2_out=}")
diff --git a/skpsl/probabilistic_scoring_list.py b/skpsl/probabilistic_scoring_list.py
@@ -7,6 +7,7 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.isotonic import IsotonicRegression
+from sklearn.metrics import brier_score_loss
 
 
 class _ClassifierAtK(BaseEstimator, ClassifierMixin):
@@ -97,7 +98,6 @@ def __init__(self, score_set, entropy_threshold=-1):
         self.total_scores_at_k = []
         self.probabilities_at_k = []
         self.stage_clfs = []
-        self.entropy_at_k = []
         self._stage_clf = _ClassifierAtK
 
     def fit(self, X, y, l=1, n_jobs=1, predef_features=None, predef_scores=None) -> "ProbabilisticScoringList":
@@ -179,6 +179,16 @@ def predict_proba(self, X, k=-1):
 
         return self.stage_clfs[k].predict_proba(X)
 
+    def score(self, X, y, sample_weight=None):
+        """
+        Calculates the Brier score of the model
+
+        The result is a loss, so lower values are better.
+        NOTE(review): scikit-learn utilities that call ``score`` generally assume
+        that higher is better; confirm callers account for this.
+
+        :param X: samples, passed on to ``predict_proba``
+        :param y: true labels for ``X``
+        :param sample_weight: optional per-sample weights, forwarded to ``brier_score_loss``
+        :return: Brier score loss computed from column 1 of ``predict_proba(X)``
+        """
+        return brier_score_loss(y, self.predict_proba(X)[:, 1], sample_weight=sample_weight)
+
     @staticmethod
     def _optimize(features, feature_extension, scores, score_extension, clfcls, X, y):
         clf = clfcls(features=features + feature_extension, scores=scores + score_extension).fit(X, y)