Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/rasbt/mlxtend
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Sep 9, 2017
2 parents 1ca0b74 + ba38b79 commit c09c23b
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1451,7 +1451,7 @@
"- `estimator` : scikit-learn classifier or regressor\n",
"\n",
"\n",
"- `k_features` : int or tuple (new in 0.4.2) (default: 1)\n",
"- `k_features` : int or tuple or str (default: 1)\n",
"\n",
" Number of features to select,\n",
" where k_features < the full feature set.\n",
Expand All @@ -1460,6 +1460,12 @@
" min and max that scored highest in cross-validation. For example,\n",
" the tuple (1, 4) will return any combination from\n",
" 1 up to 4 features instead of a fixed number of features k.\n",
" New in 0.8.0: A string argument \"best\" or \"parsimonious\".\n",
" If \"best\" is provided, the feature selector will return the\n",
" feature subset with the best cross-validation performance.\n",
" If \"parsimonious\" is provided as an argument, the smallest\n",
" feature subset that is within one standard error of the\n",
" best cross-validation performance will be selected.\n",
"\n",
"- `forward` : bool (default: True)\n",
"\n",
Expand Down Expand Up @@ -1670,6 +1676,15 @@
" s = f.read()\n",
"print(s)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
52 changes: 43 additions & 9 deletions mlxtend/feature_selection/sequential_feature_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,20 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
Parameters
----------
estimator : scikit-learn classifier or regressor
k_features : int or tuple (new in 0.4.2) (default: 1)
k_features : int or tuple or str (default: 1)
Number of features to select,
where k_features < the full feature set.
New in 0.4.2: A tuple containing a min and max value can be provided,
and the SFS will return the feature combination between
min and max that scored highest in cross-validation. For example,
the tuple (1, 4) will return any combination from
1 up to 4 features instead of a fixed number of features k.
New in 0.8.0: A string argument "best" or "parsimonious".
If "best" is provided, the feature selector will return the
feature subset with the best cross-validation performance.
If "parsimonious" is provided as an argument, the smallest
feature subset that is within one standard error of the
best cross-validation performance will be selected.
forward : bool (default: True)
Forward selection if True,
backward selection otherwise
Expand Down Expand Up @@ -179,11 +185,11 @@ def fit(self, X, y):
self : object
"""

if not isinstance(self.k_features, int) and\
not isinstance(self.k_features, tuple):
raise AttributeError('k_features must be a positive integer'
' or tuple')
not isinstance(self.k_features, tuple)\
and not isinstance(self.k_features, str):
raise AttributeError('k_features must be a positive integer'
', tuple, or string')

if isinstance(self.k_features, int) and (self.k_features < 1 or
self.k_features > X.shape[1]):
Expand All @@ -208,8 +214,22 @@ def fit(self, X, y):
raise AttributeError('The min k_features value must be smaller'
' than the max k_features value.')

if isinstance(self.k_features, tuple):
if isinstance(self.k_features, tuple) or\
isinstance(self.k_features, str):

select_in_range = True

if isinstance(self.k_features, str):
if self.k_features not in {'best', 'parsimonious'}:
raise AttributeError('If a string argument is provided, '
'it must be "best" or "parsimonious"')
else:
min_k = 1
max_k = X.shape[1]
else:
min_k = self.k_features[0]
max_k = self.k_features[1]

else:
select_in_range = False
k_to_select = self.k_features
Expand All @@ -218,12 +238,12 @@ def fit(self, X, y):
orig_set = set(range(X.shape[1]))
if self.forward:
if select_in_range:
k_to_select = self.k_features[1]
k_to_select = max_k
k_idx = ()
k = 0
else:
if select_in_range:
k_to_select = self.k_features[0]
k_to_select = min_k
k_idx = tuple(range(X.shape[1]))
k = len(k_idx)
k_idx, k_score = _calc_score(self, X, y, k_idx)
Expand Down Expand Up @@ -318,16 +338,30 @@ def fit(self, X, y):
sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')

if select_in_range:
max_score = float('-inf')

max_score = float('-inf')
for k in self.subsets_:
if k < self.k_features[0] or k > self.k_features[1]:
if k < min_k or k > max_k:
continue
if self.subsets_[k]['avg_score'] > max_score:
max_score = self.subsets_[k]['avg_score']
best_subset = k
k_score = max_score
k_idx = self.subsets_[best_subset]['feature_idx']

if self.k_features == 'parsimonious':
for k in self.subsets_:
if k >= best_subset:
continue
if self.subsets_[k]['avg_score'] >= (
max_score - np.std(self.subsets_[k]['cv_scores']) /
self.subsets_[k]['cv_scores'].shape[0]):
max_score = self.subsets_[k]['avg_score']
best_subset = k
k_score = max_score
k_idx = self.subsets_[best_subset]['feature_idx']

self.k_feature_idx_ = k_idx
self.k_score_ = k_score
self.subsets_plus_ = dict()
Expand Down
42 changes: 32 additions & 10 deletions mlxtend/feature_selection/tests/test_sequential_feature_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,10 @@ def test_kfeatures_type_2():
X = iris.data
y = iris.target
knn = KNeighborsClassifier()
expect = 'k_features must be a positive integer or tuple'
expect = 'k_features must be a positive integer, tuple, or string'
sfs = SFS(estimator=knn,
verbose=0,
k_features='abc')
k_features=set())
assert_raises(AttributeError,
expect,
sfs.fit,
Expand Down Expand Up @@ -458,10 +458,6 @@ def test_regression_in_range():


def test_clone_params_fail():
iris = load_iris()
X = iris.data
y = iris.target

if sys.version_info >= (3, 0):
objtype = 'class'
else:
Expand Down Expand Up @@ -504,7 +500,6 @@ def test_clone_params_pass():
def test_transform_not_fitted():
iris = load_iris()
X = iris.data
y = iris.target
knn = KNeighborsClassifier(n_neighbors=4)

sfs1 = SFS(knn,
Expand All @@ -525,9 +520,6 @@ def test_transform_not_fitted():


def test_get_metric_dict_not_fitted():
iris = load_iris()
X = iris.data
y = iris.target
knn = KNeighborsClassifier(n_neighbors=4)

sfs1 = SFS(knn,
Expand Down Expand Up @@ -642,3 +634,33 @@ def test_max_feature_subset_size_in_tuple_range():

sfs = sfs.fit(X, y)
assert len(sfs.k_feature_idx_) == 5


def test_max_feature_subset_best():
    """Fit forward SFS with k_features='best' on the Boston data.

    Uses a LinearRegression estimator with cv=10 and checks that the
    selected feature indices match the expected tuple.
    """
    dataset = load_boston()
    X = dataset.data
    y = dataset.target

    selector = SFS(LinearRegression(),
                   k_features='best',
                   forward=True,
                   floating=False,
                   cv=10)
    selector = selector.fit(X, y)

    expected_idx = (1, 3, 5, 7, 8, 9, 10, 11, 12)
    assert selector.k_feature_idx_ == expected_idx


def test_max_feature_subset_parsimonious():
    """Fit forward SFS with k_features='parsimonious' on the Boston data.

    Uses a LinearRegression estimator with cv=10 and checks that the
    selected feature indices match the expected tuple.
    """
    dataset = load_boston()
    X = dataset.data
    y = dataset.target

    selector = SFS(LinearRegression(),
                   k_features='parsimonious',
                   forward=True,
                   floating=False,
                   cv=10)
    selector = selector.fit(X, y)

    expected_idx = (10, 11, 12, 5)
    assert selector.k_feature_idx_ == expected_idx

0 comments on commit c09c23b

Please sign in to comment.