This repository has been archived by the owner on Dec 6, 2023. It is now read-only.

Support for max_features #27

Open
wants to merge 6 commits into master
60 changes: 50 additions & 10 deletions stability_selection/stability_selection.py
@@ -63,10 +63,10 @@ def _bootstrap_generator(n_bootstrap_iterations, bootstrap_func, y,


 def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
-                          threshold=None):
+                          threshold=None, max_features=None):
     """
     Fits base_estimator on a bootstrap sample of the original data,
-    and returns a mas of the variables that are selected by the fitted model.
+    and returns a mask of the variables that are selected by the fitted model.

     Parameters
     ----------
@@ -95,6 +95,11 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
         or implicitly (e.g, Lasso), the threshold used is 1e-5.
         Otherwise, "mean" is used by default.

+    max_features : int or None, optional
+        The maximum number of features selected scoring above ``threshold``.
+        To disable ``threshold`` and only select based on ``max_features``,
+        set ``threshold=-np.inf``.
+
     Returns
     -------
     selected_variables : array-like, shape = [n_features]
@@ -108,6 +113,7 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value,
     selector_model = _return_estimator_from_pipeline(base_estimator)
     variable_selector = SelectFromModel(estimator=selector_model,
                                         threshold=threshold,
+                                        max_features=max_features,
                                         prefit=True)
     return variable_selector.get_support()

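Note: ``max_features`` is handed straight to scikit-learn's ``SelectFromModel``, which keeps at most that many of the features scoring above ``threshold``; with ``threshold=-np.inf`` the cap becomes the only criterion. A minimal standalone sketch of the behaviour this change relies on (the data and estimator below are illustrative, not part of this PR):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=20, n_informative=5,
                               random_state=0)
    model = LogisticRegression(penalty='l1', solver='liblinear', C=0.5).fit(X, y)

    # threshold=-np.inf disables the coefficient cutoff, so at most
    # max_features features are kept, ranked by importance.
    selector = SelectFromModel(model, threshold=-np.inf, max_features=3,
                               prefit=True)
    print(selector.get_support().sum())  # at most 3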
@@ -187,6 +193,9 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
     threshold : float.
         Threshold defining the minimum cutoff value for the stability scores.

+    max_features : int or None, optional
+        The maximum number of features selected scoring above ``threshold``.
+
     bootstrap_func : str or callable fun (default=bootstrap_without_replacement)
         The function used to subsample the data. This parameter can be:
             - A string, which must be one of
@@ -208,6 +217,11 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
         or implicitly (e.g, Lasso), the threshold used is 1e-5.
         Otherwise, "mean" is used by default.

+    bootstrap_max_features : int or None, optional
+        The maximum number of features selected scoring above ``bootstrap_threshold``.
+        To disable ``bootstrap_threshold`` and only select based on
+        ``bootstrap_max_features``, set ``bootstrap_threshold=-np.inf``.
+
     verbose : integer.
         Controls the verbosity: the higher, the more messages.

@@ -253,19 +267,24 @@ class StabilitySelection(BaseEstimator, TransformerMixin):
        of the Royal Statistical Society: Series B (Statistical Methodology),
        75(1), pp.55-80.
     """
-    def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C',
-                 lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100,
-                 sample_fraction=0.5, threshold=0.6, bootstrap_func=bootstrap_without_replacement,
-                 bootstrap_threshold=None, verbose=0, n_jobs=1, pre_dispatch='2*n_jobs',
+
+    def __init__(self, base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
+                 lambda_name='C', lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100,
+                 sample_fraction=0.5, threshold=0.6, max_features=None,
+                 bootstrap_func=bootstrap_without_replacement,
+                 bootstrap_threshold=None, bootstrap_max_features=None,
+                 verbose=0, n_jobs=1, pre_dispatch='2*n_jobs',
                  random_state=None):
         self.base_estimator = base_estimator
         self.lambda_name = lambda_name
         self.lambda_grid = lambda_grid
         self.n_bootstrap_iterations = n_bootstrap_iterations
         self.sample_fraction = sample_fraction
         self.threshold = threshold
+        self.max_features = max_features
         self.bootstrap_func = bootstrap_func
         self.bootstrap_threshold = bootstrap_threshold
+        self.bootstrap_max_features = bootstrap_max_features
         self.verbose = verbose
         self.n_jobs = n_jobs
         self.pre_dispatch = pre_dispatch
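The two new parameters act at different stages: ``bootstrap_max_features`` caps how many features each individual bootstrap fit may select (it is forwarded to ``SelectFromModel`` inside ``_fit_bootstrap_sample``), while ``max_features`` caps the final set returned by ``get_support``/``transform`` once stability scores have been aggregated. A usage sketch, assuming this branch is installed (the data is made up):

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from stability_selection import StabilitySelection

    # Made-up data: 3 informative features out of 50.
    rng = np.random.RandomState(0)
    X = rng.randn(200, 50)
    y = (X[:, :3].sum(axis=1) + 0.1 * rng.randn(200) > 0).astype(int)

    selector = StabilitySelection(
        base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        lambda_name='C', lambda_grid=np.logspace(-3, 0, 10),
        n_bootstrap_iterations=20,
        max_features=3,             # cap on the final selection
        bootstrap_max_features=10)  # cap per bootstrap fit
    selector.fit(X, y)
    print(selector.get_support(indices=True))  # at most 3 indices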
@@ -282,6 +301,9 @@ def _validate_input(self):
         if not isinstance(self.threshold, float) or not (0.0 < self.threshold <= 1.0):
             raise ValueError('threshold should be a float in (0, 1], got %s' % self.threshold)

+        if self.max_features is not None and (not isinstance(self.max_features, int) or self.max_features < 1):
+            raise ValueError('max_features should be a positive int, got %s' % self.max_features)
+
         if self.lambda_name not in self.base_estimator.get_params().keys():
             raise ValueError('lambda_name is set to %s, but base_estimator %s '
                              'does not have a parameter '
@@ -340,15 +362,16 @@ def fit(self, X, y):
                                                y=y[subsample],
                                                lambda_name=self.lambda_name,
                                                lambda_value=lambda_value,
-                                               threshold=self.bootstrap_threshold)
+                                               threshold=self.bootstrap_threshold,
+                                               max_features=self.bootstrap_max_features)
                 for subsample in bootstrap_samples)

             stability_scores[:, idx] = np.vstack(selected_variables).mean(axis=0)

         self.stability_scores_ = stability_scores
         return self

-    def get_support(self, indices=False, threshold=None):
+    def get_support(self, indices=False, threshold=None, max_features=None):
         """Get a mask, or integer index, of the features selected

         Parameters
@@ -361,6 +384,9 @@ def get_support(self, indices=False, threshold=None):
             Threshold defining the minimum cutoff value for the
             stability scores.

+        max_features : int or None, optional
+            The maximum number of features selected scoring above ``threshold``.
+
         Returns
         -------
         support : array
@@ -377,8 +403,22 @@ def get_support(self, indices=False, threshold=None):
             raise ValueError('threshold should be a float in (0, 1], '
                              'got %s' % self.threshold)

-        cutoff = self.threshold if threshold is None else threshold
-        mask = (self.stability_scores_.max(axis=1) > cutoff)
+        n_features = self.stability_scores_.shape[0]
+        if max_features is not None and (not isinstance(max_features, int)
+                                         or not (1 <= max_features <= n_features)):
+            raise ValueError('max_features should be an int in [1, %s], '
+                             'got %s' % (n_features, max_features))
+
+        threshold_cutoff = self.threshold if threshold is None else threshold
+        mask = (self.stability_scores_.max(axis=1) > threshold_cutoff)
+
+        max_features_cutoff = self.max_features if max_features is None else max_features
+        if max_features_cutoff is not None:
+            exceed_counts = (self.stability_scores_ > threshold_cutoff).sum(axis=1)
+            if max_features_cutoff < (exceed_counts > 0).sum():
+                feature_indices = (-exceed_counts).argsort()[:max_features_cutoff]
+                mask = np.zeros(n_features, dtype=bool)
+                mask[feature_indices] = True

         return mask if not indices else np.where(mask)[0]
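A worked sketch of the selection rule ``get_support`` now applies, with made-up stability scores: when more features clear the threshold than ``max_features`` allows, features are ranked by how many lambda values push their score above the cutoff, and only the top ``max_features`` survive:

    import numpy as np

    # Toy stability scores: 4 features x 3 lambda values (made-up numbers).
    stability_scores = np.array([[0.9, 0.8, 0.7],   # above 0.6 for 3 lambdas
                                 [0.7, 0.5, 0.4],   # above 0.6 for 1 lambda
                                 [0.8, 0.7, 0.5],   # above 0.6 for 2 lambdas
                                 [0.3, 0.2, 0.1]])  # never above 0.6
    threshold_cutoff, max_features_cutoff = 0.6, 2

    mask = stability_scores.max(axis=1) > threshold_cutoff             # 3 features pass
    exceed_counts = (stability_scores > threshold_cutoff).sum(axis=1)  # [3, 1, 2, 0]
    if max_features_cutoff < (exceed_counts > 0).sum():                # 2 < 3
        feature_indices = (-exceed_counts).argsort()[:max_features_cutoff]
        mask = np.zeros(stability_scores.shape[0], dtype=bool)
        mask[feature_indices] = True
    print(np.where(mask)[0])  # [0 2]: the two features above threshold most often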
5 changes: 5 additions & 0 deletions stability_selection/tests/test_common.py
@@ -71,3 +71,8 @@ def test_sample_fraction():
 @raises(ValueError)
 def test_lambda_name():
     StabilitySelection(lambda_name='n_estimators')._validate_input()
+
+
+@raises(ValueError)
+def test_max_features_zero():
+    StabilitySelection(max_features=0)._validate_input()
55 changes: 48 additions & 7 deletions stability_selection/tests/test_stability_selection.py
@@ -44,7 +44,7 @@ def _generate_dummy_classification_data(p=1000, n=1000, k=5,
 def test_stability_selection_classification():
     n, p, k = 1000, 1000, 5

-    X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
     selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1)
     selector.fit(X, y)

@@ -59,7 +59,7 @@ def test_stability_selection_classification():
 def test_stability_selection_regression():
     n, p, k = 500, 1000, 5

-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

     base_estimator = Pipeline([
         ('scaler', StandardScaler()),
@@ -81,7 +81,7 @@ def test_stability_selection_regression():
 def test_with_complementary_pairs_bootstrap():
     n, p, k = 500, 1000, 5

-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

     base_estimator = Pipeline([
         ('scaler', StandardScaler()),
@@ -104,7 +104,7 @@ def test_with_complementary_pairs_bootstrap():
 def test_with_stratified_bootstrap():
     n, p, k = 1000, 1000, 5

-    X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
     selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1,
                                   bootstrap_func='stratified')
     selector.fit(X, y)
@@ -117,7 +117,7 @@ def test_with_stratified_bootstrap():
 def test_different_shape():
     n, p, k = 100, 200, 5

-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

     base_estimator = Pipeline([
         ('scaler', StandardScaler()),
@@ -136,7 +136,7 @@ def test_different_shape():
 def test_no_features():
     n, p, k = 100, 200, 0

-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

     base_estimator = Pipeline([
         ('scaler', StandardScaler()),
@@ -154,10 +154,51 @@ def test_no_features():
                         np.empty(0).reshape((X.shape[0], 0)))


+def test_stability_selection_max_features():
+    n, p, k = 1000, 1000, 5
+    lambda_grid = np.logspace(-5, -1, 25)
+
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
+    selector = StabilitySelection(lambda_grid=lambda_grid, max_features=1)
+    selector.fit(X, y)
+    X_r = selector.transform(X)
+    assert(X_r.shape == (n, 1))
+
+    selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k)
+    selector.fit(X, y)
+    X_r = selector.transform(X)
+    assert(X_r.shape == (n, k))
+
+    selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k + 1)
+    selector.fit(X, y)
+    X_r = selector.transform(X)
+    assert(X_r.shape == (n, k))
+
+
+@raises(ValueError)
+def test_get_support_max_features_low():
+    n, p, k = 500, 200, 5
+
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
+    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25))
+    selector.fit(X, y)
+    selector.get_support(max_features=0)
+
+
+@raises(ValueError)
+def test_get_support_max_features_high():
+    n, p, k = 500, 200, 5
+
+    X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)
+    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25))
+    selector.fit(X, y)
+    selector.get_support(max_features=p + 1)
+
+
 def test_stability_plot():
     n, p, k = 500, 200, 5

-    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
+    X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k)

     base_estimator = Pipeline([
         ('scaler', StandardScaler()),