From 520e107dd1d3d1a7f3c755cfc63a4ef161156615 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Mon, 26 Sep 2016 10:23:57 +0800 Subject: [PATCH 01/14] fix selectFdr bug --- sklearn/feature_selection/univariate_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index d6ecf13974230..008265074f328 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -596,7 +596,7 @@ def _get_support_mask(self): n_features = len(self.pvalues_) sv = np.sort(self.pvalues_) selected = sv[sv <= float(self.alpha) / n_features - * np.arange(n_features)] + * (np.arange(n_features) + 1)] if selected.size == 0: return np.zeros_like(self.pvalues_, dtype=bool) return self.pvalues_ <= selected.max() From 4db45c88a7310471c8cf9aa43dad0db468031af6 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Mon, 26 Sep 2016 23:15:00 +0800 Subject: [PATCH 02/14] more tests --- sklearn/feature_selection/tests/test_feature_select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index d2a73334299d5..c6e49ce30c937 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -404,7 +404,7 @@ def single_fdr(alpha, n_informative, random_state): # FDR = E(FP / (TP + FP)) <= alpha false_discovery_rate = np.mean([single_fdr(alpha, n_informative, random_state) for - random_state in range(30)]) + random_state in range(100)]) assert_greater_equal(alpha, false_discovery_rate) # Make sure that the empirical false discovery rate increases From 3cab18df81c0c3e011218ca86e7d2d1c510cfc0c Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 27 Sep 2016 12:26:26 +0800 Subject: [PATCH 03/14] change position of operator --- sklearn/feature_selection/univariate_selection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index 0af9b81e0d2b2..627b4cc8b24d9 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -596,8 +596,8 @@ def _get_support_mask(self): n_features = len(self.pvalues_) sv = np.sort(self.pvalues_) - selected = sv[sv <= float(self.alpha) / n_features - * (np.arange(n_features) + 1)] + selected = sv[sv <= float(self.alpha) / n_features * + (np.arange(n_features) + 1)] if selected.size == 0: return np.zeros_like(self.pvalues_, dtype=bool) return self.pvalues_ <= selected.max() From 6cb1cc005e6f54fb93086b63f8b855f3fc3f7482 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Thu, 29 Sep 2016 10:39:51 +0800 Subject: [PATCH 04/14] mimor change --- sklearn/feature_selection/univariate_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index 627b4cc8b24d9..2ab7756df7eeb 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -597,7 +597,7 @@ def _get_support_mask(self): n_features = len(self.pvalues_) sv = np.sort(self.pvalues_) selected = sv[sv <= float(self.alpha) / n_features * - (np.arange(n_features) + 1)] + np.arange(1, n_features + 1)] if selected.size == 0: return np.zeros_like(self.pvalues_, dtype=bool) return self.pvalues_ <= selected.max() From a69d5845c93a844fd74aa1eea42b055f94f18e78 Mon Sep 17 00:00:00 2001 From: Peng Date: Mon, 17 Oct 2016 12:56:50 +0800 Subject: [PATCH 05/14] add test case --- .../tests/test_feature_select.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index c6e49ce30c937..9773bf27f8b32 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -371,6 +371,21 @@ def test_select_heuristics_regression(): assert_less(np.sum(support[5:] == 1), 3) +def test_select_fdr_chi2(): + # Test PR #7490 works + # chi2(X,y) + # scores = array([4., 0.71]), pvalues = array([0.046, 0.398]) + # for the master code + # the result of get_support of the following code is array([False, False]) + # by PR #7490, the result is array([True, False]) + X = np.array([[10,20],[20,20],[20,30]]) + y = np.array([[1],[0],[0]]) + univariate_filter = SelectFdr(chi2, alpha=0.1) + X_r = univariate_filter.fit(X, y).transform(X) + support = univariate_filter.get_support() + assert_array_equal(support, np.array([True, False])) + + def test_select_fdr_regression(): # Test that fdr heuristic actually has low FDR. def single_fdr(alpha, n_informative, random_state): From ac92743f1f0c801524c0c2e9ea1dda6901b4306b Mon Sep 17 00:00:00 2001 From: Peng Date: Mon, 17 Oct 2016 13:16:33 +0800 Subject: [PATCH 06/14] remove unused variable --- sklearn/feature_selection/tests/test_feature_select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 9773bf27f8b32..9b4c88840521c 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -381,7 +381,7 @@ def test_select_fdr_chi2(): X = np.array([[10,20],[20,20],[20,30]]) y = np.array([[1],[0],[0]]) univariate_filter = SelectFdr(chi2, alpha=0.1) - X_r = univariate_filter.fit(X, y).transform(X) + univariate_filter.fit(X, y).transform(X) support = univariate_filter.get_support() assert_array_equal(support, np.array([True, False])) From a5a9dcf6d9d785c3b60171baa031b2d642b9559b Mon Sep 17 00:00:00 2001 From: Peng Date: Mon, 17 Oct 2016 13:49:08 +0800 Subject: [PATCH 07/14] fix flake8 error --- sklearn/feature_selection/tests/test_feature_select.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 9b4c88840521c..20e674d4ba23a 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -378,8 +378,8 @@ def test_select_fdr_chi2(): # for the master code # the result of get_support of the following code is array([False, False]) # by PR #7490, the result is array([True, False]) - X = np.array([[10,20],[20,20],[20,30]]) - y = np.array([[1],[0],[0]]) + X = np.array([[10, 20], [20, 20], [20, 30]]) + y = np.array([[1], [0], [0]]) univariate_filter = SelectFdr(chi2, alpha=0.1) univariate_filter.fit(X, y).transform(X) support = univariate_filter.get_support() From 207d15f2b16270c32235d761e83fbb44d5628de8 Mon Sep 17 00:00:00 2001 From: Peng Date: Wed, 19 Oct 2016 09:34:50 +0800 Subject: [PATCH 08/14] add more test case --- .../tests/test_feature_select.py | 37 ++++++++++++++++--- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 20e674d4ba23a..be4a246262f8b 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -371,19 +371,44 @@ def test_select_heuristics_regression(): assert_less(np.sum(support[5:] == 1), 3) -def test_select_fdr_chi2(): +def test_select_artificial_data_ch2(): # Test PR #7490 works # chi2(X,y) # scores = array([4., 0.71]), pvalues = array([0.046, 0.398]) # for the master code - # the result of get_support of the following code is array([False, False]) + # the result of get_support of SelectFdr is array([False, False]) # by PR #7490, the result is array([True, False]) + # SelectKBest, SelectPercentile, SelectFpr, SelectFwe also tested X = np.array([[10, 20], [20, 20], [20, 30]]) y = np.array([[1], [0], [0]]) - univariate_filter = SelectFdr(chi2, alpha=0.1) - univariate_filter.fit(X, y).transform(X) - support = univariate_filter.get_support() - assert_array_equal(support, np.array([True, False])) + scores, pvalues = chi2(X, y) + assert_array_almost_equal(scores, np.array([4., 0.71428571])) + assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472])) + + filter_fdr = SelectFdr(chi2, alpha=0.1) + filter_fdr.fit(X, y).transform(X) + support_fdr = filter_fdr.get_support() + assert_array_equal(support_fdr, np.array([True, False])) + + filter_kbest = SelectKBest(chi2, k=1) + filter_kbest.fit(X, y).transform(X) + support_kbest = filter_kbest.get_support() + assert_array_equal(support_kbest, np.array([True, False])) + + filter_percentile = SelectPercentile(chi2, percentile=50) + filter_percentile.fit(X, y).transform(X) + support_percentile = filter_percentile.get_support() + assert_array_equal(support_percentile, np.array([True, False])) + + filter_fpr = SelectFpr(chi2, alpha=0.1) + filter_fpr.fit(X, y).transform(X) + support_fpr = filter_fpr.get_support() + assert_array_equal(support_fpr, np.array([True, False])) + + filter_fwe = SelectFwe(chi2, alpha=0.1) + filter_fwe.fit(X, y).transform(X) + support_fwe = filter_fwe.get_support() + assert_array_equal(support_fwe, np.array([True, False])) def test_select_fdr_regression(): From b315ad98ac5d088a9f14fbfd261fba1afdd02960 Mon Sep 17 00:00:00 2001 From: Peng Date: Wed, 19 Oct 2016 13:14:58 +0800 Subject: [PATCH 09/14] change function name --- .../tests/test_feature_select.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index be4a246262f8b..ead5d8d572d7e 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -371,14 +371,8 @@ def test_select_heuristics_regression(): assert_less(np.sum(support[5:] == 1), 3) -def test_select_artificial_data_ch2(): - # Test PR #7490 works - # chi2(X,y) - # scores = array([4., 0.71]), pvalues = array([0.046, 0.398]) - # for the master code - # the result of get_support of SelectFdr is array([False, False]) - # by PR #7490, the result is array([True, False]) - # SelectKBest, SelectPercentile, SelectFpr, SelectFwe also tested +def test_boundary_case_ch2(): + # Test boundary case, and always aiming to select 1 feature X = np.array([[10, 20], [20, 20], [20, 30]]) y = np.array([[1], [0], [0]]) scores, pvalues = chi2(X, y) @@ -386,27 +380,27 @@ def test_select_artificial_data_ch2(): assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472])) filter_fdr = SelectFdr(chi2, alpha=0.1) - filter_fdr.fit(X, y).transform(X) + filter_fdr.fit(X, y) support_fdr = filter_fdr.get_support() assert_array_equal(support_fdr, np.array([True, False])) filter_kbest = SelectKBest(chi2, k=1) - filter_kbest.fit(X, y).transform(X) + filter_kbest.fit(X, y) support_kbest = filter_kbest.get_support() assert_array_equal(support_kbest, np.array([True, False])) filter_percentile = SelectPercentile(chi2, percentile=50) - filter_percentile.fit(X, y).transform(X) + filter_percentile.fit(X, y) support_percentile = filter_percentile.get_support() assert_array_equal(support_percentile, np.array([True, False])) filter_fpr = SelectFpr(chi2, alpha=0.1) - filter_fpr.fit(X, y).transform(X) + filter_fpr.fit(X, y) support_fpr = filter_fpr.get_support() assert_array_equal(support_fpr, np.array([True, False])) filter_fwe = SelectFwe(chi2, alpha=0.1) - filter_fwe.fit(X, y).transform(X) + filter_fwe.fit(X, y) support_fwe = filter_fwe.get_support() assert_array_equal(support_fwe, np.array([True, False])) From 5a53fa729aa77a805eca136f0f2bd8cd2b25dddb Mon Sep 17 00:00:00 2001 From: Peng Date: Wed, 19 Oct 2016 14:13:26 +0800 Subject: [PATCH 10/14] add an entry in what_new --- doc/whats_new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index a4b775ec66d0a..aca2c63b0163b 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -44,6 +44,10 @@ Enhancements Bug fixes ......... + - :class:`sklearn.feature_selection.SelectFdr` now correctly works + (`#7490 `_) by + `Peng Meng`_. + - :class:`sklearn.manifold.LocallyLinearEmbedding` now correctly handles integer inputs (`#6282 `_) by From f61098ff0a68ae2b080ff52cdc93a6d1929c8dd3 Mon Sep 17 00:00:00 2001 From: Peng Date: Wed, 19 Oct 2016 20:16:43 +0800 Subject: [PATCH 11/14] change entry in whats_new --- doc/whats_new.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index aca2c63b0163b..9af7d71e3a8f9 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -44,7 +44,8 @@ Enhancements Bug fixes ......... - - :class:`sklearn.feature_selection.SelectFdr` now correctly works + - Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not + exactly implement Benjamini-Hochberg procedure (`#7490 `_) by `Peng Meng`_. From 9e76fa5f3806c66f83df762c3f01748c223874a4 Mon Sep 17 00:00:00 2001 From: Peng Date: Wed, 19 Oct 2016 20:32:48 +0800 Subject: [PATCH 12/14] change entry in whats_new --- doc/whats_new.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 9af7d71e3a8f9..5179c86f82ac9 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -45,7 +45,8 @@ Bug fixes ......... - Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not - exactly implement Benjamini-Hochberg procedure + exactly implement Benjamini-Hochberg procedure. It formerly may have + selected fewer features than it should. (`#7490 `_) by `Peng Meng`_. From 22de1f81c652470aa41fc15c9d96dd3372afe9dd Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Wed, 19 Oct 2016 23:58:52 +0800 Subject: [PATCH 13/14] minor change --- sklearn/feature_selection/tests/test_feature_select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index ead5d8d572d7e..842147c3dd870 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -372,7 +372,7 @@ def test_select_heuristics_regression(): def test_boundary_case_ch2(): - # Test boundary case, and always aiming to select 1 feature + # Test boundary case, and always aim to select 1 feature. X = np.array([[10, 20], [20, 20], [20, 30]]) y = np.array([[1], [0], [0]]) scores, pvalues = chi2(X, y) From d2be567b844b5c75a51b7ae56348154c7d820339 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Thu, 20 Oct 2016 09:56:16 +0800 Subject: [PATCH 14/14] add name as link --- doc/whats_new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 5179c86f82ac9..ff8c748716a54 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -4867,3 +4867,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Utkarsh Upadhyay: https://github.com/musically-ut .. _Eugene Chen: https://github.com/eyc88 + +.. _Peng Meng: https://github.com/mpjlu