Merge pull request #2369 from larsmans/no-warnings-in-fs · scikit-learn/scikit-learn@9033baf · GitHub
[go: up one dir, main page]

Skip to content

Commit 9033baf

Browse files
committed
Merge pull request #2369 from larsmans/no-warnings-in-fs
[MRG] remove warnings in univariate feature selection
2 parents dcf827e + b46ea71 commit 9033baf

File tree

2 files changed

+16
-9
lines changed

2 files changed

+16
-9
lines changed

sklearn/feature_selection/tests/test_feature_select.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,17 @@ def test_tied_pvalues():
481481
assert_not_in(9998, Xt)
482482

483483

484+
def test_tied_scores():
    """Test for stable sorting in k-best with tied scores."""
    # All three training columns are identical, so the score function
    # sees the same data for every feature and the scores are tied.
    X_train = np.array([[0, 0, 0], [1, 1, 1]])
    y_train = [0, 1]

    for n_features in [1, 2, 3]:
        sel = SelectKBest(chi2, k=n_features).fit(X_train, y_train)
        # transform() takes a 2-D (n_samples, n_features) input, so the
        # single probe sample is passed as one row rather than a flat list.
        X_test = sel.transform([[0, 1, 2]])
        # The probe row holds each column's own index, so the output row
        # reveals which columns were kept: with tied scores the selection
        # must be the last ``n_features`` columns.
        assert_array_equal(X_test[0], np.arange(3)[-n_features:])
493+
494+
484495
def test_nans():
485496
"""Assert that SelectKBest and SelectPercentile can handle NaNs."""
486497
# First feature has zero variance to confuse f_classif (ANOVA) and

sklearn/feature_selection/univariate_selection.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -299,10 +299,6 @@ def fit(self, X, y):
299299
self.scores_, self.pvalues_ = self.score_func(X, y)
300300
self.scores_ = np.asarray(self.scores_)
301301
self.pvalues_ = np.asarray(self.pvalues_)
302-
if len(np.unique(self.pvalues_)) < len(self.pvalues_):
303-
warn("Duplicate p-values. Result may depend on feature ordering."
304-
"There are probably duplicate features, or you used a "
305-
"classification score for a regression task.")
306302
return self
307303

308304

@@ -315,10 +311,6 @@ def fit(self, X, y):
315311
self.scores_, self.pvalues_ = self.score_func(X, y)
316312
self.scores_ = np.asarray(self.scores_)
317313
self.pvalues_ = np.asarray(self.pvalues_)
318-
if len(np.unique(self.scores_)) < len(self.scores_):
319-
warn("Duplicate scores. Result may depend on feature ordering."
320-
"There are probably duplicate features, or you used a "
321-
"classification score for a regression task.")
322314
return self
323315

324316

@@ -428,7 +420,11 @@ def _get_support_mask(self):
428420
# from argsort, which we transform to a mask, which we probably
429421
# transform back to indices later.
430422
mask = np.zeros(scores.shape, dtype=bool)
431-
mask[np.argsort(scores)[-k:]] = 1
423+
424+
# Request a stable sort. Mergesort takes more memory (~40MB per
425+
# megafeature on x86-64), but blows heapsort out of the water in
426+
# terms of speed.
427+
mask[np.argsort(scores, kind="mergesort")[-k:]] = 1
432428
return mask
433429

434430

0 commit comments

Comments
 (0)
0