diff --git a/stability_selection/stability_selection.py b/stability_selection/stability_selection.py
index 1f95af8..6e31bee 100644
--- a/stability_selection/stability_selection.py
+++ b/stability_selection/stability_selection.py
@@ -63,10 +63,10 @@ def _bootstrap_generator(n_bootstrap_iterations, bootstrap_func, y,
 
 
 def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
-                          threshold=None):
+                          threshold=None, max_features=None):
     """
     Fits base_estimator on a bootstrap sample of the original data,
-    and returns a mas of the variables that are selected by the fitted model.
+    and returns a mask of the variables that are selected by the fitted model.
 
     Parameters
     ----------
@@ -95,6 +95,11 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
         or implicitly (e.g, Lasso), the threshold used is 1e-5.
         Otherwise, "mean" is used by default.
 
+    max_features : int or None, optional
+        The maximum number of features to select among those scoring
+        above ``threshold``. To select based only on ``max_features``,
+        set ``threshold=-np.inf``.
+
     Returns
     -------
     selected_variables : array-like, shape = [n_features]
@@ -108,6 +113,7 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
 
     selector_model = _return_estimator_from_pipeline(base_estimator)
     variable_selector = SelectFromModel(estimator=selector_model,
                                         threshold=threshold,
+                                        max_features=max_features,
                                         prefit=True)
     return variable_selector.get_support()
@@ -187,6 +193,9 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
     threshold : float.
         Threshold defining the minimum cutoff value for the stability scores.
 
+    max_features : int or None, optional
+        The maximum number of features to select among those scoring above ``threshold``.
+
     bootstrap_func : str or callable fun (default=bootstrap_without_replacement)
         The function used to subsample the data. This parameter can be:
             - A string, which must be one of
@@ -208,6 +217,11 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
         or implicitly (e.g, Lasso), the threshold used is 1e-5.
         Otherwise, "mean" is used by default.
 
+    bootstrap_max_features : int or None, optional
+        The maximum number of features to select among those scoring above
+        ``bootstrap_threshold``. To select based only on
+        ``bootstrap_max_features``, set ``bootstrap_threshold=-np.inf``.
+
     verbose : integer.
         Controls the verbosity: the higher, the more messages.
 
@@ -253,10 +267,13 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
        of the Royal Statistical Society: Series B (Statistical Methodology),
        75(1), pp.55-80.
""" - def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C', - lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100, - sample_fraction=0.5, threshold=0.6, bootstrap_func=bootstrap_without_replacement, - bootstrap_threshold=None, verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', + + def __init__(self, base_estimator=LogisticRegression(penalty='l1', solver='liblinear'), + lambda_name='C', lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100, + sample_fraction=0.5, threshold=0.6, max_features=None, + bootstrap_func=bootstrap_without_replacement, + bootstrap_threshold=None, bootstrap_max_features=None, + verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', random_state=None): self.base_estimator = base_estimator self.lambda_name = lambda_name @@ -264,8 +281,10 @@ def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name= self.n_bootstrap_iterations = n_bootstrap_iterations self.sample_fraction = sample_fraction self.threshold = threshold + self.max_features = max_features self.bootstrap_func = bootstrap_func self.bootstrap_threshold = bootstrap_threshold + self.bootstrap_max_features = bootstrap_max_features self.verbose = verbose self.n_jobs = n_jobs self.pre_dispatch = pre_dispatch @@ -282,6 +301,9 @@ def _validate_input(self): if not isinstance(self.threshold, float) or not (0.0 < self.threshold <= 1.0): raise ValueError('threshold should be a float in (0, 1], got %s' % self.threshold) + if self.max_features is not None and (not isinstance(self.max_features, int) or self.max_features < 1): + raise ValueError('max_features should be a positive int, got %s' % self.max_features) + if self.lambda_name not in self.base_estimator.get_params().keys(): raise ValueError('lambda_name is set to %s, but base_estimator %s ' 'does not have a parameter ' @@ -340,7 +362,8 @@ def fit(self, X, y): y=y[subsample], lambda_name=self.lambda_name, lambda_value=lambda_value, - threshold=self.bootstrap_threshold) + threshold=self.bootstrap_threshold, + max_features=self.bootstrap_max_features) for subsample in bootstrap_samples) stability_scores[:, idx] = np.vstack(selected_variables).mean(axis=0) @@ -348,7 +371,7 @@ def fit(self, X, y): self.stability_scores_ = stability_scores return self - def get_support(self, indices=False, threshold=None): + def get_support(self, indices=False, threshold=None, max_features=None): """Get a mask, or integer index, of the features selected Parameters @@ -361,6 +384,9 @@ def get_support(self, indices=False, threshold=None): Threshold defining the minimum cutoff value for the stability scores. + max_features : int or None, optional + The maximum number of features selected scoring above ``threshold``. 
+
         Returns
         -------
         support : array
@@ -377,8 +403,24 @@ def get_support(self, indices=False, threshold=None):
             raise ValueError('threshold should be a float in (0, 1], '
                              'got %s' % self.threshold)
 
-        cutoff = self.threshold if threshold is None else threshold
-        mask = (self.stability_scores_.max(axis=1) > cutoff)
+        n_features = self.stability_scores_.shape[0]
+        if max_features is not None and (not isinstance(max_features, int)
+                                         or not (1 <= max_features <= n_features)):
+            raise ValueError('max_features should be an int in [1, %s], '
+                             'got %s' % (n_features, max_features))
+
+        threshold_cutoff = self.threshold if threshold is None else threshold
+        mask = (self.stability_scores_.max(axis=1) > threshold_cutoff)
+
+        max_features_cutoff = self.max_features if max_features is None else max_features
+        if max_features_cutoff is not None:
+            # Count, per feature, how many lambda values score above the
+            # threshold, and keep only the top max_features_cutoff features.
+            exceed_counts = (self.stability_scores_ > threshold_cutoff).sum(axis=1)
+            if max_features_cutoff < (exceed_counts > 0).sum():
+                feature_indices = (-exceed_counts).argsort()[:max_features_cutoff]
+                mask = np.zeros(n_features, dtype=bool)
+                mask[feature_indices] = True
 
         return mask if not indices else np.where(mask)[0]
 
diff --git a/stability_selection/tests/test_common.py b/stability_selection/tests/test_common.py
index 7ef0995..b65083d 100644
--- a/stability_selection/tests/test_common.py
+++ b/stability_selection/tests/test_common.py
@@ -71,3 +71,8 @@ def test_sample_fraction():
 @raises(ValueError)
 def test_lambda_name():
     StabilitySelection(lambda_name='n_estimators')._validate_input()
+
+
+@raises(ValueError)
+def test_max_features_zero():
+    StabilitySelection(max_features=0)._validate_input()
diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py
index 4193d29..1498a7d 100644
--- a/stability_selection/tests/test_stability_selection.py
+++ b/stability_selection/tests/test_stability_selection.py
@@ -44,7 +44,7 @@ def _generate_dummy_classification_data(p=1000, n=1000, k=5,
 def test_stability_selection_classification():
     n, p, k = 1000, 1000, 5
 
-    X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
     selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1)
     selector.fit(X, y)
 
@@ -59,7 +59,7 @@ def test_stability_selection_classification():
 def test_stability_selection_regression():
     n, p, k = 500, 1000, 5
 
-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)
 
     base_estimator = Pipeline([
         ('scaler', StandardScaler()),
@@ -81,7 +81,7 @@ def test_stability_selection_regression():
 def test_with_complementary_pairs_bootstrap():
     n, p, k = 500, 1000, 5
 
-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)
 
     base_estimator = Pipeline([
         ('scaler', StandardScaler()),
@@ -104,7 +104,7 @@ def test_with_complementary_pairs_bootstrap():
 def test_with_stratified_bootstrap():
     n, p, k = 1000, 1000, 5
 
-    X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
     selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1,
                                   bootstrap_func='stratified')
     selector.fit(X, y)
@@ -117,7 +117,7 @@ def test_with_stratified_bootstrap():
 def test_different_shape():
     n, p, k = 100, 200, 5
 
-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)
 
     base_estimator = Pipeline([
         ('scaler', StandardScaler()),
@@ -136,7 +136,7 @@ def test_different_shape():
 def test_no_features():
     n, p, k = 100, 200, 0
 
-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)
 
     base_estimator = Pipeline([
         ('scaler', StandardScaler()),
@@ -154,10 +154,51 @@ def test_no_features():
                        np.empty(0).reshape((X.shape[0], 0)))
 
 
+def test_stability_selection_max_features():
+    n, p, k = 1000, 1000, 5
+    lambda_grid = np.logspace(-5, -1, 25)
+
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
+    selector = StabilitySelection(lambda_grid=lambda_grid, max_features=1)
+    selector.fit(X, y)
+    X_r = selector.transform(X)
+    assert X_r.shape == (n, 1)
+
+    selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k)
+    selector.fit(X, y)
+    X_r = selector.transform(X)
+    assert X_r.shape == (n, k)
+
+    selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k + 1)
+    selector.fit(X, y)
+    X_r = selector.transform(X)
+    assert X_r.shape == (n, k)
+
+
+@raises(ValueError)
+def test_get_support_max_features_low():
+    n, p, k = 500, 200, 5
+
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
+    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25))
+    selector.fit(X, y)
+    selector.get_support(max_features=0)
+
+
+@raises(ValueError)
+def test_get_support_max_features_high():
+    n, p, k = 500, 200, 5
+
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
+    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25))
+    selector.fit(X, y)
+    selector.get_support(max_features=p + 1)
+
+
 def test_stability_plot():
     n, p, k = 500, 200, 5
 
-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)
 
     base_estimator = Pipeline([
         ('scaler', StandardScaler()),
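
A minimal usage sketch of the new ``max_features`` cap follows. This is illustrative and not part of the patch: it assumes the patched package is importable as ``stability_selection``, and the synthetic data and hyperparameters are arbitrary.

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    from stability_selection import StabilitySelection

    # Synthetic binary classification data: 200 samples, 50 features,
    # of which only the first 3 drive the response.
    rng = np.random.RandomState(0)
    X = rng.randn(200, 50)
    y = (X[:, :3].sum(axis=1) > 0).astype(int)

    selector = StabilitySelection(
        base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        lambda_name='C',
        lambda_grid=np.logspace(-3, 0, 10),
        n_bootstrap_iterations=20,
        max_features=3,  # cap on the number of selected features
        random_state=0,
    )
    selector.fit(X, y)

    # At most 3 features survive, even if more of them clear `threshold`.
    print(selector.get_support(indices=True))
    print(selector.transform(X).shape)  # e.g. (200, 3)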