1.6x performance improvement
moved expected entropy to score function for better sklearn compatibility
Stefan Heid committed Aug 9, 2023
1 parent f3053ca commit 9991a4d
Showing 2 changed files with 25 additions and 13 deletions.
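The change relocates the expected-entropy computation out of fit and into a sklearn-style score method. The quantity itself is unchanged: each distinct total score T is mapped to a calibrated probability P(y=1 | T), and the binary entropies of those probabilities are averaged, weighted by how often each total score occurs in the data. A minimal standalone sketch of that computation (the free function below is illustrative only, not part of the package; the names mirror the diff that follows):

import numpy as np
from scipy.stats import entropy as stats_entropy


def expected_entropy(scores, probabilities):
    """Expected binary entropy (in bits) of a calibrated score distribution.

    scores: per-record total scores, shape (n,)
    probabilities: dict mapping each total score T to P(y=1 | T)
    """
    total_scores, score_freqs = np.unique(scores, return_counts=True)
    score_probas = np.array([probabilities[t] for t in total_scores])
    # binary entropy of the calibrated probability at each distinct total score
    entropy_values = stats_entropy([score_probas, 1 - score_probas], base=2)
    # frequency-weighted average over the empirical score distribution
    return np.sum((score_freqs / scores.size) * entropy_values)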
experiments/performance.ipynb (4 changes: 2 additions & 2 deletions)
@@ -38,8 +38,8 @@
"output_type": "stream",
"text": [
"0.9604\n",
"CPU times: user 136 ms, sys: 119 ms, total: 255 ms\n",
"Wall time: 17 s\n"
"CPU times: user 157 ms, sys: 108 ms, total: 266 ms\n",
"Wall time: 18 s\n"
]
}
],
skpsl/probabilistic_scoring_list.py (34 changes: 23 additions & 11 deletions)
@@ -20,12 +20,10 @@ def __init__(self, features, scores):
 
         self.logger = logging.getLogger(__name__)
         self.scores_vec = np.array(scores)
-        self.probabilities = {}
-        self.entropy = None
+        self.probabilities = None
 
     def fit(self, X, y) -> "_ClassifierAtK":
         scores = self._scores_per_record(X)
-        n = scores.size
 
         # compute all possible total scores using subset-summation
         total_scores = {0}
@@ -38,27 +36,41 @@
         calibrator.fit(scores, y)
         self.probabilities = {T: p for T, p in zip(total_scores, calibrator.transform(total_scores))}
 
-        # TODO. this should actually be inside of a score function. the actual fitting is finished at this point
-        total_scores, score_freqs = np.unique(scores, return_counts=True)
-        score_probas = np.array([self.probabilities[ti] for ti in total_scores])
-        entropy_values = stats_entropy([score_probas, 1 - score_probas], base=2)
-        self.entropy = np.sum((score_freqs / n) * entropy_values)
-
         return self
 
     def predict(self, X):
+        if self.probabilities is None:
+            raise NotFittedError()
         return self.predict_proba(X).argmax(axis=1)
 
     def predict_proba(self, X):
         """Predicts the probability for
         """
+        if self.probabilities is None:
+            raise NotFittedError()
         scores = self._scores_per_record(X)
         proba_true = np.empty_like(scores, dtype=float)
         for total_score in np.unique(scores):
             proba_true[scores == total_score] = self.probabilities[total_score]
         proba = np.vstack([1 - proba_true, proba_true]).T
         return proba
 
+    def score(self, X, y=None, sample_weight=None):
+        """
+        Calculates the expected entropy of the fitted model
+        :param X:
+        :param y:
+        :param sample_weight:
+        :return:
+        """
+        if self.probabilities is None:
+            raise NotFittedError()
+        scores = self._scores_per_record(X)
+        total_scores, score_freqs = np.unique(scores, return_counts=True)
+        score_probas = np.array([self.probabilities[ti] for ti in total_scores])
+        entropy_values = stats_entropy([score_probas, 1 - score_probas], base=2)
+        return np.sum((score_freqs / scores.size) * entropy_values)
+
     # Helper functions
     def _scores_per_record(self, X):
         return X[:, self.features] @ self.scores_vec
@@ -119,7 +131,7 @@ def fit(self, X, y, l=1, n_jobs=1, predef_features=None, predef_scores=None) ->
         curr_stage_clf = self._stage_clf(features=[], scores=[])
         curr_stage_clf.fit(X, y)
         self.stage_clfs.append(curr_stage_clf)
-        expected_entropy = curr_stage_clf.entropy
+        expected_entropy = curr_stage_clf.score(X)
 
         while remaining_features and expected_entropy > self.entropy_threshold:
             stage += 1
@@ -170,7 +182,7 @@ def predict_proba(self, X, k=-1):
     @staticmethod
     def _optimize(features, feature_extension, scores, score_extension, clfcls, X, y):
         clf = clfcls(features=features + feature_extension, scores=scores + score_extension).fit(X, y)
-        return clf, clf.entropy, feature_extension[0], score_extension[0]
+        return clf, clf.score(X), feature_extension[0], score_extension[0]
 
     @staticmethod
     def _gen_lookahead(list_, lookahead):
Expand Down
