From ba38b79506fe0b05ca7d02793e4be1c75d6d4594 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Sat, 9 Sep 2017 04:39:56 -0400
Subject: [PATCH] best and parsimonious features for sfs (#240)

---
 docs/sources/CHANGELOG.md                          |  5 +-
 .../SequentialFeatureSelector.ipynb                | 17 +++++-
 .../sequential_feature_selector.py                 | 52 +++++++++++++++----
 .../tests/test_sequential_feature_selector.py      | 42 +++++++++++----
 4 files changed, 94 insertions(+), 22 deletions(-)

diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md
index 1f3ed638e..bfad492cf 100755
--- a/docs/sources/CHANGELOG.md
+++ b/docs/sources/CHANGELOG.md
@@ -17,12 +17,13 @@ The CHANGELOG for the current development version is available at
 
 ##### New Features
 
 - Added a `mlxtend.evaluate.bootstrap` that implements the ordinary nonparametric bootstrap to bootstrap a single statistic (for example, the mean, median, R^2 of a regression fit, and so forth) [#232](https://github.com/rasbt/mlxtend/pull/232)
+- `SequentialFeatureSelector`'s `k_features` now accepts a string argument "best" or "parsimonious" for more "automated" feature selection. For instance, if "best" is provided, the feature selector will return the feature subset with the best cross-validation performance. If "parsimonious" is provided as an argument, the smallest feature subset that is within one standard error of the cross-validation performance will be selected. [#238](https://github.com/rasbt/mlxtend/pull/238)
 
 ##### Changes
 
-- `SequentialFeatureSelector` now uses `np.nanmean` over normal mean to support scorers that may return `np.nan` [#211](https://github.com/rasbt/mlxtend/pull/211), via [mrkaiser](https://github.com/mrkaiser))
+- `SequentialFeatureSelector` now uses `np.nanmean` over normal mean to support scorers that may return `np.nan` [#211](https://github.com/rasbt/mlxtend/pull/211) (via [mrkaiser](https://github.com/mrkaiser))
 - The `skip_if_stuck` parameter was removed from `SequentialFeatureSelector` in favor of a more efficient implementation comparing the conditional inclusion/exclusion results (in the floating versions) to the performances of previously sampled feature sets that were cached [#237](https://github.com/rasbt/mlxtend/pull/237)
-- `ExhaustiveFeatureSelector` was modified to consume substantially less memory [#195](https://github.com/rasbt/mlxtend/pull/195), via [Adam Erickson](https://github.com/adam-erickson))
+- `ExhaustiveFeatureSelector` was modified to consume substantially less memory [#195](https://github.com/rasbt/mlxtend/pull/195) (via [Adam Erickson](https://github.com/adam-erickson))
 
 ##### Bug Fixes
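To make the new `k_features` strings from the CHANGELOG entry above concrete, here is a minimal usage sketch (not part of the patch itself); the estimator, dataset, and `cv` settings are arbitrary stand-ins borrowed from the test suite, and only the `k_features='best'` / `k_features='parsimonious'` strings are the new API:

```python
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
knn = KNeighborsClassifier(n_neighbors=4)

# "best": return the feature subset with the best CV score
sfs_best = SFS(knn, k_features='best', forward=True, cv=5).fit(X, y)

# "parsimonious": return the smallest subset whose CV score lies
# within the tolerance of the best score
sfs_pars = SFS(knn, k_features='parsimonious', forward=True, cv=5).fit(X, y)

print(sfs_best.k_feature_idx_, sfs_best.k_score_)
print(sfs_pars.k_feature_idx_, sfs_pars.k_score_)
```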
diff --git a/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb
index b2a01b667..6aba0d3b4 100644
--- a/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb
+++ b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb
@@ -1451,7 +1451,7 @@
     "- `estimator` : scikit-learn classifier or regressor\n",
     "\n",
     "\n",
-    "- `k_features` : int or tuple (new in 0.4.2) (default: 1)\n",
+    "- `k_features` : int or tuple or str (default: 1)\n",
     "\n",
     "    Number of features to select,\n",
     "    where k_features < the full feature set.\n",
@@ -1460,6 +1460,12 @@
     "    min and max that scored highest in cross-validation. For example,\n",
     "    the tuple (1, 4) will return any combination from\n",
     "    1 up to 4 features instead of a fixed number of features k.\n",
+    "    New in 0.8.0: A string argument \"best\" or \"parsimonious\".\n",
+    "    If \"best\" is provided, the feature selector will return the\n",
+    "    feature subset with the best cross-validation performance.\n",
+    "    If \"parsimonious\" is provided as an argument, the smallest\n",
+    "    feature subset that is within one standard error of the\n",
+    "    cross-validation performance will be selected.\n",
     "\n",
     "- `forward` : bool (default: True)\n",
     "\n",
@@ -1670,6 +1676,15 @@
     "    s = f.read()\n",
     "print(s)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py
index 6d7044262..54aec9ce2 100644
--- a/mlxtend/feature_selection/sequential_feature_selector.py
+++ b/mlxtend/feature_selection/sequential_feature_selector.py
@@ -43,7 +43,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
     Parameters
     ----------
     estimator : scikit-learn classifier or regressor
-    k_features : int or tuple (new in 0.4.2) (default: 1)
+    k_features : int or tuple or str (default: 1)
         Number of features to select,
         where k_features < the full feature set.
         New in 0.4.2: A tuple containing a min and max value can be provided,
@@ -51,6 +51,12 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
         min and max that scored highest in cross-validation. For example,
         the tuple (1, 4) will return any combination from
         1 up to 4 features instead of a fixed number of features k.
+        New in 0.8.0: A string argument "best" or "parsimonious".
+        If "best" is provided, the feature selector will return the
+        feature subset with the best cross-validation performance.
+        If "parsimonious" is provided as an argument, the smallest
+        feature subset that is within one standard error of the
+        cross-validation performance will be selected.
     forward : bool (default: True)
         Forward selection if True,
         backward selection otherwise
@@ -179,11 +185,11 @@ def fit(self, X, y):
             self : object
 
         """
-        if not isinstance(self.k_features, int) and\
-           not isinstance(self.k_features, tuple):
-            raise AttributeError('k_features must be a positive integer'
-                                 ' or tuple')
+        if not isinstance(self.k_features, int) and\
+           not isinstance(self.k_features, tuple)\
+           and not isinstance(self.k_features, str):
+            raise AttributeError('k_features must be a positive integer'
+                                 ', tuple, or string')
 
         if isinstance(self.k_features, int) and (self.k_features < 1 or
                                                  self.k_features > X.shape[1]):
@@ -208,8 +214,22 @@ def fit(self, X, y):
             raise AttributeError('The min k_features value must be smaller'
                                  ' than the max k_features value.')
 
-        if isinstance(self.k_features, tuple):
+        if isinstance(self.k_features, tuple) or\
+                isinstance(self.k_features, str):
             select_in_range = True
+
+            if isinstance(self.k_features, str):
+                if self.k_features not in {'best', 'parsimonious'}:
+                    raise AttributeError('If a string argument is provided, '
+                                         'it must be "best" or "parsimonious"')
+                else:
+                    min_k = 1
+                    max_k = X.shape[1]
+
+            else:
+                min_k = self.k_features[0]
+                max_k = self.k_features[1]
+
         else:
             select_in_range = False
             k_to_select = self.k_features
@@ -218,12 +238,12 @@ def fit(self, X, y):
         orig_set = set(range(X.shape[1]))
         if self.forward:
             if select_in_range:
-                k_to_select = self.k_features[1]
+                k_to_select = max_k
             k_idx = ()
             k = 0
         else:
             if select_in_range:
-                k_to_select = self.k_features[0]
+                k_to_select = min_k
             k_idx = tuple(range(X.shape[1]))
             k = len(k_idx)
             k_idx, k_score = _calc_score(self, X, y, k_idx)
@@ -318,9 +338,11 @@ def fit(self, X, y):
             sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')
 
         if select_in_range:
+            max_score = float('-inf')
 
-            max_score = float('-inf')
             for k in self.subsets_:
-                if k < self.k_features[0] or k > self.k_features[1]:
+                if k < min_k or k > max_k:
                     continue
                 if self.subsets_[k]['avg_score'] > max_score:
                     max_score = self.subsets_[k]['avg_score']
@@ -328,6 +350,18 @@ def fit(self, X, y):
             k_score = max_score
             k_idx = self.subsets_[best_subset]['feature_idx']
 
+            if self.k_features == 'parsimonious':
+                for k in self.subsets_:
+                    if k >= best_subset:
+                        continue
+                    if self.subsets_[k]['avg_score'] >= (
+                            max_score - np.std(self.subsets_[k]['cv_scores']) /
+                            self.subsets_[k]['cv_scores'].shape[0]):
+                        max_score = self.subsets_[k]['avg_score']
+                        best_subset = k
+                k_score = max_score
+                k_idx = self.subsets_[best_subset]['feature_idx']
+
         self.k_feature_idx_ = k_idx
         self.k_score_ = k_score
         self.subsets_plus_ = dict()
diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py
index 73cfcc53e..1e3b00ceb 100644
--- a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py
+++ b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py
@@ -83,10 +83,10 @@ def test_kfeatures_type_2():
     X = iris.data
     y = iris.target
     knn = KNeighborsClassifier()
-    expect = 'k_features must be a positive integer or tuple'
+    expect = 'k_features must be a positive integer, tuple, or string'
     sfs = SFS(estimator=knn,
               verbose=0,
-              k_features='abc')
+              k_features=set())
     assert_raises(AttributeError,
                   expect,
                   sfs.fit,
@@ -458,10 +458,6 @@ def test_regression_in_range():
 
 
 def test_clone_params_fail():
-    iris = load_iris()
-    X = iris.data
-    y = iris.target
-
     if sys.version_info >= (3, 0):
         objtype = 'class'
     else:
@@ -504,7 +500,6 @@ def test_clone_params_pass():
 def test_transform_not_fitted():
     iris = load_iris()
     X = iris.data
-    y = iris.target
 
     knn = KNeighborsClassifier(n_neighbors=4)
     sfs1 = SFS(knn,
@@ -525,9 +520,6 @@ def test_transform_not_fitted():
 
 
 def test_get_metric_dict_not_fitted():
-    iris = load_iris()
-    X = iris.data
-    y = iris.target
 
     knn = KNeighborsClassifier(n_neighbors=4)
     sfs1 = SFS(knn,
@@ -642,3 +634,33 @@ def test_max_feature_subset_size_in_tuple_range():
 
     sfs = sfs.fit(X, y)
     assert len(sfs.k_feature_idx_) == 5
+
+
+def test_max_feature_subset_best():
+    boston = load_boston()
+    X, y = boston.data, boston.target
+    lr = LinearRegression()
+
+    sfs = SFS(lr,
+              k_features='best',
+              forward=True,
+              floating=False,
+              cv=10)
+
+    sfs = sfs.fit(X, y)
+    assert sfs.k_feature_idx_ == (1, 3, 5, 7, 8, 9, 10, 11, 12)
+
+
+def test_max_feature_subset_parsimonious():
+    boston = load_boston()
+    X, y = boston.data, boston.target
+    lr = LinearRegression()
+
+    sfs = SFS(lr,
+              k_features='parsimonious',
+              forward=True,
+              floating=False,
+              cv=10)
+
+    sfs = sfs.fit(X, y)
+    assert sfs.k_feature_idx_ == (10, 11, 12, 5)
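For reference, the `parsimonious` selection rule that the patch adds to `fit()` can be replayed in isolation. The sketch below runs it on a hand-made `subsets_`-style dict (all scores are invented for illustration). Note that the tolerance term in the patch is `np.std(cv_scores) / n_scores`, the standard deviation divided by the number of CV scores; since that divides by `n` rather than `sqrt(n)`, it is a stricter threshold than the classic one-standard-error rule:

```python
import numpy as np

# Toy stand-in for SequentialFeatureSelector.subsets_ after fitting;
# keys are subset sizes, values hold the CV scores of the best subset
# of that size (all numbers invented for illustration).
subsets = {
    1: {'feature_idx': (2,), 'cv_scores': np.array([0.79, 0.81, 0.83])},
    2: {'feature_idx': (0, 2), 'cv_scores': np.array([0.900, 0.908, 0.916])},
    3: {'feature_idx': (0, 1, 2), 'cv_scores': np.array([0.90, 0.91, 0.92])},
}
for k in subsets:
    subsets[k]['avg_score'] = np.nanmean(subsets[k]['cv_scores'])

# "best": the subset size with the highest average CV score
best_subset = max(subsets, key=lambda k: subsets[k]['avg_score'])
max_score = subsets[best_subset]['avg_score']

# "parsimonious": walk over the smaller subsets and accept one whose
# average score lies within the tolerance below the current best,
# mirroring the loop added to fit() in the patch above
for k in subsets:
    if k >= best_subset:
        continue
    tol = (np.std(subsets[k]['cv_scores']) /
           subsets[k]['cv_scores'].shape[0])
    if subsets[k]['avg_score'] >= max_score - tol:
        max_score = subsets[k]['avg_score']
        best_subset = k

print(best_subset, subsets[best_subset]['feature_idx'])  # -> 2 (0, 2)
```

Here the three-feature subset scores best on average (0.91), but the two-feature subset's average (0.908) falls within its tolerance of roughly 0.002, so the smaller subset is selected.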