Skip to content

Commit

Permalink
PSL new features
Browse files Browse the repository at this point in the history
 - parallelization
 - l-step lookahead
  • Loading branch information
Stefan Heid committed Aug 8, 2023
1 parent af3671b commit d5807b9
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 31 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
- PSL classifier: introduced parallelization
- PSL classifier: implemented l-step lookahead

- Initial implementation of the PSl algorithm
## 0.1.0 - 2023-08-08

### Added

- Initial implementation of the PSL algorithm
87 changes: 87 additions & 0 deletions experiments/performance.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f101aee3-ac9c-45c2-8d67-b729f22ea8b0",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_breast_cancer\n",
"from sklearn.model_selection import cross_val_score, ShuffleSplit\n",
"\n",
"from skpsl import ProbabilisticScoringList, MinEntropyBinarizer"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b07fa87-3c3c-40d4-bca7-605cfe5052c9",
"metadata": {},
"outputs": [],
"source": [
"# Generating synthetic data with continuous features and a binary target variable\n",
"\n",
"data = load_breast_cancer()\n",
"X = MinEntropyBinarizer().fit_transform(data.data,data.target)\n",
"y = data.target"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3e77f88f-8aa7-4794-9c19-aa9fc73bbb99",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9599\n",
"CPU times: user 163 ms, sys: 96.5 ms, total: 259 ms\n",
"Wall time: 14min 11s\n"
]
}
],
"source": [
"%%time\n",
"clf = ProbabilisticScoringList([-1, 1, 2])\n",
"print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=2), cv=ShuffleSplit(100, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "eb1d191d-38b6-4f1c-8740-7101c9bd192c",
"metadata": {},
"source": [
"### 0.2.0\n",
"- l=2 14min 11s, 0.9599\n",
"- l=1 30s, 0.9604\n",
"\n",
"### 0.1.0\n",
"- 30s, 0.9604"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ python = ">=3.9,<3.13"
scikit-learn = "^1.3.0"
numpy = "^1.25.2"
scipy = "^1.11.1"
joblib = "^1.3.1"

sphinx = { version = "^7.1", optional = true }
sphinx_rtd_theme = { version = "^1.2", optional = true }
Expand Down
75 changes: 45 additions & 30 deletions skpsl/probabilistic_scoring_list.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
from itertools import combinations, product, repeat

import numpy as np
from joblib import Parallel, delayed
from scipy.stats import entropy as stats_entropy
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
Expand All @@ -12,9 +14,9 @@ class _ClassifierAtK(BaseEstimator, ClassifierMixin):
Internal class for the classifier at stage k of the probabilistic scoring list
"""

def __init__(self, scores, features):
self.scores = scores
def __init__(self, features, scores):
self.features = features
self.scores = scores

self.logger = logging.getLogger(__name__)
self.scores_vec = np.array(scores)
Expand Down Expand Up @@ -89,8 +91,17 @@ def __init__(self, score_set, entropy_threshold=-1):
self.entropy_at_k = []
self._stage_clf = _ClassifierAtK

def fit(self, X, y, predef_features=None, predef_scores=None) -> "ProbabilisticScoringList":
"""Fits a probabilistic scoring list to the given data
def fit(self, X, y, l=1, n_jobs=1, predef_features=None, predef_scores=None) -> "ProbabilisticScoringList":
"""
Fits a probabilistic scoring list to the given data
:param X:
:param y:
:param l: steps of look ahead
:param n_jobs: passed to joblib for parallelization
:param predef_features:
:param predef_scores:
:return:
"""

number_features = X.shape[1]
Expand All @@ -116,34 +127,25 @@ def fit(self, X, y, predef_features=None, predef_scores=None) -> "ProbabilisticS
while remaining_features and expected_entropy > self.entropy_threshold:
stage += 1

# try all features and possible scores
curr_stage_clf = None
fk, sk = remaining_features[0], self.sorted_score_set[-1]
current_expected_entropy = np.inf

features_to_consider = remaining_features if predef_features is None else [predef_features[stage - 1]]
scores_to_consider = self.sorted_score_set if predef_scores is None else [predef_scores[stage - 1]]

for f in features_to_consider:
cand_features = self.features + [f]
for s in scores_to_consider:
tmp_stage_clf = self._stage_clf(features=cand_features, scores=np.array(self.scores + [s]))
tmp_stage_clf.fit(X, y)
temp_expected_entropy = tmp_stage_clf.entropy

self.logger.info(f"feature {f} scores {s} entropy {temp_expected_entropy}")
if temp_expected_entropy < current_expected_entropy:
current_expected_entropy = temp_expected_entropy
fk, sk = f, s
curr_stage_clf = tmp_stage_clf

expected_entropy = current_expected_entropy
self.stage_clfs.append(curr_stage_clf)

remaining_features.remove(fk)

self.features.append(fk)
self.scores.append(sk)
clfs, entropies, f, s = zip(*Parallel(n_jobs=n_jobs)(
delayed(self._optimize)(self.features, f_seq, self.scores, list(s_seq), self._stage_clf, X, y)
for (f_seq, s_seq) in product(
self._gen_lookahead(features_to_consider, l),
# cartesian power of scores
product(*repeat(scores_to_consider, min(l, len(features_to_consider))))
)
))

i = np.argmin(entropies)

expected_entropy = entropies[i]
self.stage_clfs.append(clfs[i])
remaining_features.remove(f[i])
self.features.append(f[i])
self.scores.append(s[i])
return self

def predict(self, X, k=-1):
Expand All @@ -168,13 +170,26 @@ def predict_proba(self, X, k=-1):

return self.stage_clfs[k].predict_proba(X)

@staticmethod
def _optimize(features, feature_extension, scores, score_extension, clfcls, X, y):
    """
    Fit a single candidate stage classifier and report how well it does

    The candidate is built from the already selected features/scores extended by the
    given extension sequences.

    :param features: list of already selected features
    :param feature_extension: list of features appended to ``features`` for this candidate
    :param scores: list of already selected scores
    :param score_extension: list of scores appended to ``scores`` for this candidate
    :param clfcls: callable accepting ``features`` and ``scores`` keyword arguments; the
        resulting object must offer ``fit(X, y)`` returning the fitted object, which
        exposes an ``entropy`` attribute
    :param X: training data, handed to ``fit`` unchanged
    :param y: training targets, handed to ``fit`` unchanged
    :return: tuple of (fitted classifier, its entropy, first element of
        ``feature_extension``, first element of ``score_extension``)
    """
    candidate_features = features + feature_extension
    candidate_scores = scores + score_extension
    fitted = clfcls(features=candidate_features, scores=candidate_scores).fit(X, y)
    # only the first element of each extension is reported back to the caller
    return fitted, fitted.entropy, feature_extension[0], score_extension[0]

@staticmethod
def _gen_lookahead(list_, lookahead):
    """
    Generate the candidate sequences for a lookahead of the given depth

    Each sequence is a combination of ``lookahead`` elements of ``list_`` (order of
    ``list_`` is preserved inside a sequence). If fewer than ``lookahead`` elements are
    available, the depth is shortened to the number of available elements.

    :param list_: elements to combine
    :param lookahead: requested number of look ahead steps, must be at least 1
    :return: list of lists, one per combination; empty if ``list_`` is empty
    :raises ValueError: if ``lookahead`` is smaller than 1
    """
    if lookahead < 1:
        # fail with a clear message instead of leaking a bare StopIteration
        raise ValueError(f"lookahead must be at least 1, got {lookahead}")

    pool = list(list_)
    if not pool:
        return []

    # combinations() yields nothing when asked for more elements than exist,
    # hence the depth is capped at the number of available elements
    depth = min(lookahead, len(pool))
    return [list(tup) for tup in combinations(pool, depth)]


if __name__ == '__main__':
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# Generating synthetic data with continuous features and a binary target variable
X, y = make_classification(n_samples=100, n_features=10, n_informative=10, n_redundant=0, random_state=42)
X, y = make_classification(random_state=42)
X = (X > .5).astype(int)

clf = ProbabilisticScoringList([-1, 1, 2])
Expand Down

0 comments on commit d5807b9

Please sign in to comment.