diff --git a/experiments/performance.ipynb b/experiments/performance.ipynb index 05252ad..4877867 100644 --- a/experiments/performance.ipynb +++ b/experiments/performance.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "0b07fa87-3c3c-40d4-bca7-605cfe5052c9", "metadata": {}, "outputs": [], @@ -29,24 +29,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "3e77f88f-8aa7-4794-9c19-aa9fc73bbb99", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.9599\n", - "CPU times: user 163 ms, sys: 96.5 ms, total: 259 ms\n", - "Wall time: 14min 11s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "clf = ProbabilisticScoringList([-1, 1, 2])\n", - "print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=2), cv=ShuffleSplit(100, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")" + "print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=1), cv=ShuffleSplit(100, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")" ] }, { @@ -65,9 +55,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "skpsl", "language": "python", - "name": "python3" + "name": "skpsl" }, "language_info": { "codemirror_mode": { diff --git a/skpsl/probabilistic_scoring_list.py b/skpsl/probabilistic_scoring_list.py index 1ebe200..f96c721 100644 --- a/skpsl/probabilistic_scoring_list.py +++ b/skpsl/probabilistic_scoring_list.py @@ -20,35 +20,29 @@ def __init__(self, features, scores): self.logger = logging.getLogger(__name__) self.scores_vec = np.array(scores) - self.score_sums = {} self.probabilities = {} self.entropy = None - self.calibrator = None def fit(self, X, y) -> "_ClassifierAtK": - n = X.shape[0] - relevant_scores = self._relevant_scores(X) + scores = self._scores_per_record(X) + n = scores.size # compute all possible total scores using subset-summation - self.score_sums = {0} + total_scores = {0} for score in self.scores_vec: - self.score_sums |= {prev_sum + score for prev_sum in self.score_sums} + total_scores |= {prev_sum + score for prev_sum in total_scores} + total_scores = np.array(sorted(total_scores)) - # calibrate probabilities - self.calibrator = IsotonicRegression(y_min=0.0, y_max=1.0, increasing=True, out_of_bounds="clip") - self.calibrator.fit(relevant_scores, y) + # compute probabilities + calibrator = IsotonicRegression(y_min=0.0, y_max=1.0, increasing=True, out_of_bounds="clip") + calibrator.fit(scores, y) + self.probabilities = {T: p for T, p in zip(total_scores, calibrator.transform(total_scores))} - # compute calibrated probabilities - sigmaK = np.array(sorted(self.score_sums)) - cal_ps = self.calibrator.predict(sigmaK) - - # set calibrated probabilities - self.probabilities = {T: p for T, p in zip(sigmaK, cal_ps)} - self.entropy = 0 - for ti, pi in self.probabilities.items(): - Ni = np.count_nonzero(relevant_scores == ti) - Hi = stats_entropy([pi, 1 - pi], base=2) - self.entropy += (Ni / n) * Hi + # TODO. this should actually be inside of a score function. the actual fitting is finished at this point + total_scores, score_freqs = np.unique(scores, return_counts=True) + score_probas = np.array([self.probabilities[ti] for ti in total_scores]) + entropy_values = stats_entropy([score_probas, 1 - score_probas], base=2) + self.entropy = np.sum((score_freqs / n) * entropy_values) return self @@ -58,13 +52,16 @@ def predict(self, X): def predict_proba(self, X): """Predicts the probability for """ - proba_true = np.vectorize(self.probabilities.get)(self._relevant_scores(X)) + scores = self._scores_per_record(X) + proba_true = np.empty_like(scores, dtype=float) + for total_score, proba in self.probabilities.items(): + proba_true[scores == total_score] = proba proba = np.vstack([1 - proba_true, proba_true]).T return proba # Helper functions - def _relevant_scores(self, X): - return np.sum(X[:, self.features] * self.scores_vec, axis=1) + def _scores_per_record(self, X): + return X[:, self.features] @ self.scores_vec class ProbabilisticScoringList(BaseEstimator, ClassifierMixin):