Skip to content

Commit

Permalink
1.7x performance improvement
Browse files Browse the repository at this point in the history
  • Loading branch information
Stefan Heid committed Aug 8, 2023
1 parent 7f895fd commit 4d7b53c
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 39 deletions.
22 changes: 6 additions & 16 deletions experiments/performance.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"id": "0b07fa87-3c3c-40d4-bca7-605cfe5052c9",
"metadata": {},
"outputs": [],
Expand All @@ -29,24 +29,14 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "3e77f88f-8aa7-4794-9c19-aa9fc73bbb99",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9599\n",
"CPU times: user 163 ms, sys: 96.5 ms, total: 259 ms\n",
"Wall time: 14min 11s\n"
]
}
],
"outputs": [],
"source": [
"%%time\n",
"clf = ProbabilisticScoringList([-1, 1, 2])\n",
"print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=2), cv=ShuffleSplit(100, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")"
"print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=1), cv=ShuffleSplit(100, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")"
]
},
{
Expand All @@ -65,9 +55,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "skpsl",
"language": "python",
"name": "python3"
"name": "skpsl"
},
"language_info": {
"codemirror_mode": {
Expand Down
43 changes: 20 additions & 23 deletions skpsl/probabilistic_scoring_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,35 +20,29 @@ def __init__(self, features, scores):

self.logger = logging.getLogger(__name__)
self.scores_vec = np.array(scores)
self.score_sums = {}
self.probabilities = {}
self.entropy = None
self.calibrator = None

def fit(self, X, y) -> "_ClassifierAtK":
n = X.shape[0]
relevant_scores = self._relevant_scores(X)
scores = self._scores_per_record(X)
n = scores.size

# compute all possible total scores using subset-summation
self.score_sums = {0}
total_scores = {0}
for score in self.scores_vec:
self.score_sums |= {prev_sum + score for prev_sum in self.score_sums}
total_scores |= {prev_sum + score for prev_sum in total_scores}
total_scores = np.array(sorted(total_scores))

# calibrate probabilities
self.calibrator = IsotonicRegression(y_min=0.0, y_max=1.0, increasing=True, out_of_bounds="clip")
self.calibrator.fit(relevant_scores, y)
# compute probabilities
calibrator = IsotonicRegression(y_min=0.0, y_max=1.0, increasing=True, out_of_bounds="clip")
calibrator.fit(scores, y)
self.probabilities = {T: p for T, p in zip(total_scores, calibrator.transform(total_scores))}

# compute calibrated probabilities
sigmaK = np.array(sorted(self.score_sums))
cal_ps = self.calibrator.predict(sigmaK)

# set calibrated probabilities
self.probabilities = {T: p for T, p in zip(sigmaK, cal_ps)}
self.entropy = 0
for ti, pi in self.probabilities.items():
Ni = np.count_nonzero(relevant_scores == ti)
Hi = stats_entropy([pi, 1 - pi], base=2)
self.entropy += (Ni / n) * Hi
# TODO: this should actually live inside a score function; the actual fitting is finished at this point.
total_scores, score_freqs = np.unique(scores, return_counts=True)
score_probas = np.array([self.probabilities[ti] for ti in total_scores])
entropy_values = stats_entropy([score_probas, 1 - score_probas], base=2)
self.entropy = np.sum((score_freqs / n) * entropy_values)

return self

Expand All @@ -58,13 +52,16 @@ def predict(self, X):
def predict_proba(self, X):
"""Predicts the class probabilities (negative, positive) for each record in X.
"""
proba_true = np.vectorize(self.probabilities.get)(self._relevant_scores(X))
scores = self._scores_per_record(X)
proba_true = np.empty_like(scores, dtype=float)
for total_score, proba in self.probabilities.items():
proba_true[scores == total_score] = proba
proba = np.vstack([1 - proba_true, proba_true]).T
return proba

# Helper functions
def _relevant_scores(self, X):
return np.sum(X[:, self.features] * self.scores_vec, axis=1)
def _scores_per_record(self, X):
return X[:, self.features] @ self.scores_vec


class ProbabilisticScoringList(BaseEstimator, ClassifierMixin):
Expand Down

0 comments on commit 4d7b53c

Please sign in to comment.