diff --git a/CHANGELOG.md b/CHANGELOG.md index 11a064e..c64cc9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,5 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- PSL classifier: introduced parallelization +- PSL classifier: implemented l-step lookahead -- Initial implementation of the PSl algorithm \ No newline at end of file +## 0.1.0 - 2023-08-08 + +### Added + +- Initial implementation of the PSL algorithm \ No newline at end of file diff --git a/experiments/performance.ipynb b/experiments/performance.ipynb new file mode 100644 index 0000000..05252ad --- /dev/null +++ b/experiments/performance.ipynb @@ -0,0 +1,87 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f101aee3-ac9c-45c2-8d67-b729f22ea8b0", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_breast_cancer\n", + "from sklearn.model_selection import cross_val_score, ShuffleSplit\n", + "\n", + "from skpsl import ProbabilisticScoringList, MinEntropyBinarizer" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0b07fa87-3c3c-40d4-bca7-605cfe5052c9", + "metadata": {}, + "outputs": [], + "source": [ + "# Generating synthetic data with continuous features and a binary target variable\n", + "\n", + "data = load_breast_cancer()\n", + "X = MinEntropyBinarizer().fit_transform(data.data,data.target)\n", + "y = data.target" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3e77f88f-8aa7-4794-9c19-aa9fc73bbb99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9599\n", + "CPU times: user 163 ms, sys: 96.5 ms, total: 259 ms\n", + "Wall time: 14min 11s\n" + ] + } + ], + "source": [ + "%%time\n", + "clf = ProbabilisticScoringList([-1, 1, 2])\n", + "print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=2), cv=ShuffleSplit(100, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "eb1d191d-38b6-4f1c-8740-7101c9bd192c", + "metadata": {}, + "source": [ + "### 0.2.0\n", + "- l=2 14min 11s, 0.9599\n", + "- l=1 30s, 0.9604\n", + "\n", + "### 0.1.0\n", + "- 30s, 0.9604" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 67888c9..cab3ed9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ python = ">=3.9,<3.13" scikit-learn = "^1.3.0" numpy = "^1.25.2" scipy = "^1.11.1" +joblib = "^1.3.1" sphinx = { version = "^7.1", optional = true } sphinx_rtd_theme = { version = "^1.2", optional = true } diff --git a/skpsl/probabilistic_scoring_list.py b/skpsl/probabilistic_scoring_list.py index 402f11a..1ebe200 100644 --- a/skpsl/probabilistic_scoring_list.py +++ b/skpsl/probabilistic_scoring_list.py @@ -1,6 +1,8 @@ import logging +from itertools import combinations, product, repeat import numpy as np +from joblib import Parallel, delayed from scipy.stats import entropy as stats_entropy from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import NotFittedError @@ -12,9 +14,9 @@ class _ClassifierAtK(BaseEstimator, ClassifierMixin): Internal class for the classifier at stage k of the probabilistic scoring list """ - def __init__(self, scores, features): - self.scores = scores + def __init__(self, features, scores): self.features = features + self.scores = scores self.logger = logging.getLogger(__name__) self.scores_vec = np.array(scores) @@ -89,8 +91,17 @@ def __init__(self, score_set, entropy_threshold=-1): self.entropy_at_k = [] self._stage_clf = _ClassifierAtK - def fit(self, X, y, predef_features=None, predef_scores=None) -> "ProbabilisticScoringList": - """Fits a probabilistic scoring list to the given data + def fit(self, X, y, l=1, n_jobs=1, predef_features=None, predef_scores=None) -> "ProbabilisticScoringList": + """ + Fits a probabilistic scoring list to the given data + + :param X: + :param y: + :param l: steps of look ahead + :param n_jobs: passed to joblib for parallelization + :param predef_features: + :param predef_scores: + :return: """ number_features = X.shape[1] @@ -116,34 +127,25 @@ def fit(self, X, y, predef_features=None, predef_scores=None) -> "ProbabilisticS while remaining_features and expected_entropy > self.entropy_threshold: stage += 1 - # try all features and possible scores - curr_stage_clf = None - fk, sk = remaining_features[0], self.sorted_score_set[-1] - current_expected_entropy = np.inf - features_to_consider = remaining_features if predef_features is None else [predef_features[stage - 1]] scores_to_consider = self.sorted_score_set if predef_scores is None else [predef_scores[stage - 1]] - for f in features_to_consider: - cand_features = self.features + [f] - for s in scores_to_consider: - tmp_stage_clf = self._stage_clf(features=cand_features, scores=np.array(self.scores + [s])) - tmp_stage_clf.fit(X, y) - temp_expected_entropy = tmp_stage_clf.entropy - - self.logger.info(f"feature {f} scores {s} entropy {temp_expected_entropy}") - if temp_expected_entropy < current_expected_entropy: - current_expected_entropy = temp_expected_entropy - fk, sk = f, s - curr_stage_clf = tmp_stage_clf - - expected_entropy = current_expected_entropy - self.stage_clfs.append(curr_stage_clf) - - remaining_features.remove(fk) - - self.features.append(fk) - self.scores.append(sk) + clfs, entropies, f, s = zip(*Parallel(n_jobs=n_jobs)( + delayed(self._optimize)(self.features, f_seq, self.scores, list(s_seq), self._stage_clf, X, y) + for (f_seq, s_seq) in product( + self._gen_lookahead(features_to_consider, l), + # cartesian power of scores + product(*repeat(scores_to_consider, min(l, len(features_to_consider)))) + ) + )) + + i = np.argmin(entropies) + + expected_entropy = entropies[i] + self.stage_clfs.append(clfs[i]) + remaining_features.remove(f[i]) + self.features.append(f[i]) + self.scores.append(s[i]) return self def predict(self, X, k=-1): @@ -168,13 +170,26 @@ def predict_proba(self, X, k=-1): return self.stage_clfs[k].predict_proba(X) + @staticmethod + def _optimize(features, feature_extension, scores, score_extension, clfcls, X, y): + clf = clfcls(features=features + feature_extension, scores=scores + score_extension).fit(X, y) + return clf, clf.entropy, feature_extension[0], score_extension[0] + + @staticmethod + def _gen_lookahead(list_, lookahead): + # generate sequences of shortening lookaheads (because combinations returns empty list if len(list) < l) + combination_seqs = ([list(tup) for tup in combinations(list_, _l)] for _l in range(lookahead, 0, -1)) + # get first non-empty sequence + seqs = next((seq for seq in combination_seqs if seq)) + return seqs + if __name__ == '__main__': from sklearn.datasets import make_classification from sklearn.model_selection import cross_val_score # Generating synthetic data with continuous features and a binary target variable - X, y = make_classification(n_samples=100, n_features=10, n_informative=10, n_redundant=0, random_state=42) + X, y = make_classification(random_state=42) X = (X > .5).astype(int) clf = ProbabilisticScoringList([-1, 1, 2])