support for scorers that return nan; bugfix and additional unittest t… · scikit-learn/scikit-learn@02f002f · GitHub
[go: up one dir, main page]

Skip to content

Commit 02f002f

Browse files
committed
support for scorers that return nan; bugfix and additional unittest to select features in a specified range
1 parent 702c197 commit 02f002f

File tree

2 files changed

+29
-6
lines changed

2 files changed

+29
-6
lines changed

sklearn/feature_selection/sequential_feature_selector.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def fit(self, X, y):
183183

184184
if self.n_features_to_select[0] > self.n_features_to_select[1]:
185185
raise ValueError('The min n_features_to_select value must be'
186-
' larger than the max'
186+
' smaller than the max'
187187
' n_features_to_select value.')
188188

189189
if isinstance(self.n_features_to_select, tuple):
@@ -211,7 +211,7 @@ def fit(self, X, y):
211211
self.subsets_[k] = {
212212
'feature_subset_idx': k_idx,
213213
'cv_scores': k_score,
214-
'avg_score': k_score.mean()
214+
'avg_score': np.nanmean(k_score)
215215
}
216216

217217
best_subset = None
@@ -245,6 +245,8 @@ def fit(self, X, y):
245245
if select_in_range:
246246
max_score = float('-inf')
247247
for k in self.subsets_:
248+
if k < self.n_features_to_select[0] or k > self.n_features_to_select[1]:
249+
continue
248250
if self.subsets_[k]['avg_score'] > max_score:
249251
max_score = self.subsets_[k]['avg_score']
250252
best_subset = k
@@ -279,7 +281,7 @@ def _inclusion(self, orig_set, subset, X, y):
279281
for feature in remaining:
280282
new_subset = tuple(subset | {feature})
281283
cv_scores = self._calc_score(X, y, new_subset)
282-
all_avg_scores.append(cv_scores.mean())
284+
all_avg_scores.append(np.nanmean(cv_scores))
283285
all_cv_scores.append(cv_scores)
284286
all_subsets.append(new_subset)
285287
best = np.argmax(all_avg_scores)
@@ -299,7 +301,7 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None):
299301
if fixed_feature and fixed_feature not in set(p):
300302
continue
301303
cv_scores = self._calc_score(X, y, p)
302-
all_avg_scores.append(cv_scores.mean())
304+
all_avg_scores.append(np.nanmean(cv_scores))
303305
all_cv_scores.append(cv_scores)
304306
all_subsets.append(p)
305307
best = np.argmax(all_avg_scores)

sklearn/feature_selection/tests/test_sequential_feature_selector.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def test_kfeatures_type_5():
9393
y = iris.target
9494
knn = KNeighborsClassifier()
9595
expect = ('he min n_features_to_select value must be'
96-
' larger than the max n_features_to_select value.')
96+
' smaller than the max n_features_to_select value.')
9797
sfs = SFS(estimator=knn,
9898
n_features_to_select=(3, 1))
9999
assert_raise_message(ValueError, expect, sfs.fit, X, y)
@@ -221,7 +221,10 @@ def test_regression():
221221
assert round(sfs_r.score_, 4) == 0.2001
222222

223223

224-
def test_regression_in_tuplerange():
224+
def test_regression_in_tuplerange_forward():
225+
"""Test if selected features are within
226+
the selected tuple range when running forward selection
227+
"""
225228
boston = load_boston()
226229
X, y = boston.data, boston.target
227230
lr = LinearRegression()
@@ -234,6 +237,24 @@ def test_regression_in_tuplerange():
234237
assert round(sfs_r.score_, 4) == 0.2991, sfs_r.score_
235238

236239

240+
def test_regression_in_tuplerange_backward():
241+
"""Test if selected features are within
242+
the selected tuple range when running backward selection
243+
"""
244+
boston = load_boston()
245+
X, y = boston.data, boston.target
246+
lr = LinearRegression()
247+
248+
sfs_r = SFS(lr,
249+
n_features_to_select=(1, 5),
250+
forward=False,
251+
scoring='neg_mean_squared_error',
252+
cv=10)
253+
254+
sfs_r = sfs_r.fit(X, y)
255+
assert len(sfs_r.feature_subset_idx_) == 5
256+
257+
237258
def test_transform_not_fitted():
238259
iris = load_iris()
239260
X = iris.data

0 commit comments

Comments
 (0)
0