PSL new features

- parallelization - l-step lookahead
TRR318 · Aug 8, 2023 · d5807b9 · d5807b9
1 parent af3671b
commit d5807b9
Show file tree

Hide file tree

Showing 4 changed files with 140 additions and 31 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,5 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+- PSL classifier: introduced parallelization
+- PSL classifier: implemented l-step lookahead
 
-- Initial implementation of the PSl algorithm
+## 0.1.0 - 2023-08-08
+
+### Added
+
+- Initial implementation of the PSL algorithm
diff --git a/experiments/performance.ipynb b/experiments/performance.ipynb
@@ -0,0 +1,87 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f101aee3-ac9c-45c2-8d67-b729f22ea8b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.datasets import load_breast_cancer\n",
+    "from sklearn.model_selection import cross_val_score, ShuffleSplit\n",
+    "\n",
+    "from skpsl import ProbabilisticScoringList, MinEntropyBinarizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "0b07fa87-3c3c-40d4-bca7-605cfe5052c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the breast cancer dataset and binarize its features with MinEntropyBinarizer\n",
+    "\n",
+    "data = load_breast_cancer()\n",
+    "X = MinEntropyBinarizer().fit_transform(data.data,data.target)\n",
+    "y = data.target"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "3e77f88f-8aa7-4794-9c19-aa9fc73bbb99",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.9599\n",
+      "CPU times: user 163 ms, sys: 96.5 ms, total: 259 ms\n",
+      "Wall time: 14min 11s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "clf = ProbabilisticScoringList([-1, 1, 2])\n",
+    "print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=2), cv=ShuffleSplit(100, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb1d191d-38b6-4f1c-8740-7101c9bd192c",
+   "metadata": {},
+   "source": [
+    "### 0.2.0\n",
+    "- l=2 14min 11s, 0.9599\n",
+    "- l=1 30s, 0.9604\n",
+    "\n",
+    "### 0.1.0\n",
+    "- 30s, 0.9604"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,6 +14,7 @@ python = ">=3.9,<3.13"
 scikit-learn = "^1.3.0"
 numpy = "^1.25.2"
 scipy = "^1.11.1"
+joblib = "^1.3.1"
 
 sphinx = { version = "^7.1", optional = true }
 sphinx_rtd_theme = { version = "^1.2", optional = true  }

diff --git a/skpsl/probabilistic_scoring_list.py b/skpsl/probabilistic_scoring_list.py
@@ -1,6 +1,8 @@
 import logging
+from itertools import combinations, product, repeat
 
 import numpy as np
+from joblib import Parallel, delayed
 from scipy.stats import entropy as stats_entropy
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.exceptions import NotFittedError
@@ -12,9 +14,9 @@ class _ClassifierAtK(BaseEstimator, ClassifierMixin):
     Internal class for the classifier at stage k of the probabilistic scoring list
     """
 
-    def __init__(self, scores, features):
-        self.scores = scores
+    def __init__(self, features, scores):
         self.features = features
+        self.scores = scores
 
         self.logger = logging.getLogger(__name__)
         self.scores_vec = np.array(scores)
@@ -89,8 +91,17 @@ def __init__(self, score_set, entropy_threshold=-1):
         self.entropy_at_k = []
         self._stage_clf = _ClassifierAtK
 
-    def fit(self, X, y, predef_features=None, predef_scores=None) -> "ProbabilisticScoringList":
-        """Fits a probabilistic scoring list to the given data
+    def fit(self, X, y, l=1, n_jobs=1, predef_features=None, predef_scores=None) -> "ProbabilisticScoringList":
+        """
+        Fits a probabilistic scoring list to the given data
+
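+        :param X: feature matrix; X.shape[1] is taken as the number of features
+        :param y: target labels, passed together with X to each stage classifier
+        :param l: steps of look ahead; candidate features are evaluated in groups of up to l, but only the first feature and score of the best group are added per stage
+        :param n_jobs: passed to joblib for parallelization
+        :param predef_features: optional sequence; if given, entry k-1 is the only feature considered at stage k
+        :param predef_scores: optional sequence; if given, entry k-1 is the only score considered at stage k
+        :return: the fitted estimator (self)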
         """
 
         number_features = X.shape[1]
@@ -116,34 +127,25 @@ def fit(self, X, y, predef_features=None, predef_scores=None) -> "ProbabilisticS
         while remaining_features and expected_entropy > self.entropy_threshold:
             stage += 1
 
-            # try all features and possible scores
-            curr_stage_clf = None
-            fk, sk = remaining_features[0], self.sorted_score_set[-1]
-            current_expected_entropy = np.inf
-
             features_to_consider = remaining_features if predef_features is None else [predef_features[stage - 1]]
             scores_to_consider = self.sorted_score_set if predef_scores is None else [predef_scores[stage - 1]]
 
-            for f in features_to_consider:
-                cand_features = self.features + [f]
-                for s in scores_to_consider:
-                    tmp_stage_clf = self._stage_clf(features=cand_features, scores=np.array(self.scores + [s]))
-                    tmp_stage_clf.fit(X, y)
-                    temp_expected_entropy = tmp_stage_clf.entropy
-
-                    self.logger.info(f"feature {f} scores {s} entropy {temp_expected_entropy}")
-                    if temp_expected_entropy < current_expected_entropy:
-                        current_expected_entropy = temp_expected_entropy
-                        fk, sk = f, s
-                        curr_stage_clf = tmp_stage_clf
-
-            expected_entropy = current_expected_entropy
-            self.stage_clfs.append(curr_stage_clf)
-
-            remaining_features.remove(fk)
-
-            self.features.append(fk)
-            self.scores.append(sk)
+            clfs, entropies, f, s = zip(*Parallel(n_jobs=n_jobs)(
+                delayed(self._optimize)(self.features, f_seq, self.scores, list(s_seq), self._stage_clf, X, y)
+                for (f_seq, s_seq) in product(
+                    self._gen_lookahead(features_to_consider, l),
+                    # cartesian power of scores
+                    product(*repeat(scores_to_consider, min(l, len(features_to_consider))))
+                )
+            ))
+
+            i = np.argmin(entropies)
+
+            expected_entropy = entropies[i]
+            self.stage_clfs.append(clfs[i])
+            remaining_features.remove(f[i])
+            self.features.append(f[i])
+            self.scores.append(s[i])
         return self
 
     def predict(self, X, k=-1):
@@ -168,13 +170,26 @@ def predict_proba(self, X, k=-1):
 
         return self.stage_clfs[k].predict_proba(X)
 
+    @staticmethod
+    def _optimize(features, feature_extension, scores, score_extension, clfcls, X, y):
+        clf = clfcls(features=features + feature_extension, scores=scores + score_extension).fit(X, y)
+        return clf, clf.entropy, feature_extension[0], score_extension[0]
+
+    @staticmethod
+    def _gen_lookahead(list_, lookahead):
+        # try combination lengths from `lookahead` down to 1 (because combinations() yields nothing if len(list_) < the requested length)
+        combination_seqs = ([list(tup) for tup in combinations(list_, _l)] for _l in range(lookahead, 0, -1))
+        # get first non-empty sequence
+        seqs = next((seq for seq in combination_seqs if seq))
+        return seqs
+
 
 if __name__ == '__main__':
     from sklearn.datasets import make_classification
     from sklearn.model_selection import cross_val_score
 
     # Generating synthetic data with continuous features and a binary target variable
-    X, y = make_classification(n_samples=100, n_features=10, n_informative=10, n_redundant=0, random_state=42)
+    X, y = make_classification(random_state=42)
     X = (X > .5).astype(int)
 
     clf = ProbabilisticScoringList([-1, 1, 2])