From d0057294b66a27f5bdb7946e897d67ce0190be89 Mon Sep 17 00:00:00 2001 From: qinhanmin2014 Date: Sun, 11 Jun 2017 09:57:21 +0800 Subject: [PATCH 1/9] fix pca explained_variance_ --- sklearn/decomposition/pca.py | 6 +++--- sklearn/decomposition/tests/test_pca.py | 21 ++++++++++++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 9781efd57c71b..a4140de2d25b6 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -416,7 +416,7 @@ def _fit_full(self, X, n_components): components_ = V # Get variance explained by singular values - explained_variance_ = (S ** 2) / n_samples + explained_variance_ = (S ** 2) / (n_samples - 1) total_var = explained_variance_.sum() explained_variance_ratio_ = explained_variance_ / total_var singular_values_ = S.copy() # Store the singular values. @@ -495,8 +495,8 @@ def _fit_truncated(self, X, n_components, svd_solver): self.n_components_ = n_components # Get variance explained by singular values - self.explained_variance_ = (S ** 2) / n_samples - total_var = np.var(X, axis=0) + self.explained_variance_ = (S ** 2) / (n_samples - 1) + total_var = np.var(X, ddof=1, axis=0) self.explained_variance_ratio_ = \ self.explained_variance_ / total_var.sum() self.singular_values_ = S.copy() # Store the singular values. 
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index bb94ee100ae1a..0a7086012e10c 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -215,16 +215,31 @@ def test_explained_variance(): # compare to empirical variances X_pca = pca.transform(X) assert_array_almost_equal(pca.explained_variance_, - np.var(X_pca, axis=0)) + np.var(X_pca, ddof=1, axis=0)) X_pca = apca.transform(X) assert_array_almost_equal(apca.explained_variance_, - np.var(X_pca, axis=0)) + np.var(X_pca, ddof=1, axis=0)) X_rpca = rpca.transform(X) - assert_array_almost_equal(rpca.explained_variance_, np.var(X_rpca, axis=0), + assert_array_almost_equal(rpca.explained_variance_, + np.var(X_rpca, ddof=1, axis=0), decimal=1) + # Another way to run this part (according to the original definition) + # compare to empirical variances + # expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0] + # expected_result = sorted(expected_result, reverse=True)[:2] + # X_pca = pca.transform(X) + # assert_array_almost_equal(pca.explained_variance_, expected_result) + + # X_pca = apca.transform(X) + # assert_array_almost_equal(apca.explained_variance_, expected_result) + + # X_rpca = rpca.transform(X) + # assert_array_almost_equal(rpca.explained_variance_, + # expected_result, decimal=1) + # Same with correlated data X = datasets.make_classification(n_samples, n_features, n_informative=n_features-2, From bda1de1a77bc9dc380103aa8d3cb2b015710eac0 Mon Sep 17 00:00:00 2001 From: qinhanmin2014 Date: Sun, 11 Jun 2017 12:09:10 +0800 Subject: [PATCH 2/9] fix fit_transform --- sklearn/decomposition/pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index a4140de2d25b6..8fb5aba4ce372 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -346,7 +346,7 @@ def fit_transform(self, X, y=None): if self.whiten: # X_new = X * V / S * 
sqrt(n_samples) = U * sqrt(n_samples) - U *= sqrt(X.shape[0]) + U *= sqrt(X.shape[0] - 1) else: # X_new = X * V = U * S * V^T * V = U * S U *= S[:self.n_components_] From 0a46cb3129913577e21f85b402e3a4038d9223e3 Mon Sep 17 00:00:00 2001 From: qinhanmin2014 Date: Sun, 11 Jun 2017 12:12:14 +0800 Subject: [PATCH 3/9] fix test_whitening --- sklearn/decomposition/tests/test_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 0a7086012e10c..e9239f6ec68ee 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -174,7 +174,8 @@ def test_whitening(): X_whitened2 = pca.transform(X_) assert_array_almost_equal(X_whitened, X_whitened2) - assert_almost_equal(X_whitened.std(axis=0), np.ones(n_components), + assert_almost_equal(X_whitened.std(ddof=1, axis=0), + np.ones(n_components), decimal=6) assert_almost_equal(X_whitened.mean(axis=0), np.zeros(n_components)) From d63530126e7db22af8f3a070b4c3641b86a7331c Mon Sep 17 00:00:00 2001 From: qinhanmin2014 Date: Sun, 11 Jun 2017 15:57:35 +0800 Subject: [PATCH 4/9] fix IncrementalPCA --- sklearn/decomposition/incremental_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index e1806d1ef7616..9b23d1f16e1fd 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -251,7 +251,7 @@ def partial_fit(self, X, y=None, check_input=True): U, S, V = linalg.svd(X, full_matrices=False) U, V = svd_flip(U, V, u_based_decision=False) - explained_variance = S ** 2 / n_total_samples + explained_variance = S ** 2 / (n_total_samples - 1) explained_variance_ratio = S ** 2 / np.sum(col_var * n_total_samples) self.n_samples_seen_ = n_total_samples From 5d6f2661bc4f23409b77c06a2d57db790405c9e6 Mon Sep 17 00:00:00 2001 From: qinhanmin2014 Date: Mon, 12 Jun 
2017 16:47:30 +0800 Subject: [PATCH 5/9] uncomment the test --- sklearn/decomposition/tests/test_pca.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index e9239f6ec68ee..e90afc4c3466b 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -228,18 +228,17 @@ def test_explained_variance(): decimal=1) # Another way to run this part (according to the original definition) - # compare to empirical variances - # expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0] - # expected_result = sorted(expected_result, reverse=True)[:2] - # X_pca = pca.transform(X) - # assert_array_almost_equal(pca.explained_variance_, expected_result) + expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0] + expected_result = sorted(expected_result, reverse=True)[:2] + X_pca = pca.transform(X) + assert_array_almost_equal(pca.explained_variance_, expected_result) - # X_pca = apca.transform(X) - # assert_array_almost_equal(apca.explained_variance_, expected_result) + X_pca = apca.transform(X) + assert_array_almost_equal(apca.explained_variance_, expected_result) - # X_rpca = rpca.transform(X) - # assert_array_almost_equal(rpca.explained_variance_, - # expected_result, decimal=1) + X_rpca = rpca.transform(X) + assert_array_almost_equal(rpca.explained_variance_, + expected_result, decimal=1) # Same with correlated data X = datasets.make_classification(n_samples, n_features, From 71573fcb6ebc35b22a751d0ed7669070ecc784a4 Mon Sep 17 00:00:00 2001 From: qinhanmin2014 Date: Mon, 12 Jun 2017 17:19:54 +0800 Subject: [PATCH 6/9] improve test --- sklearn/decomposition/tests/test_pca.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index e90afc4c3466b..53dcb481334b9 100644 --- 
a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -214,29 +214,23 @@ def test_explained_variance(): rpca.explained_variance_ratio_, 1) # compare to empirical variances + expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0] + expected_result = sorted(expected_result, reverse=True)[:2] + X_pca = pca.transform(X) assert_array_almost_equal(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0)) + assert_array_almost_equal(pca.explained_variance_, expected_result) X_pca = apca.transform(X) assert_array_almost_equal(apca.explained_variance_, np.var(X_pca, ddof=1, axis=0)) + assert_array_almost_equal(apca.explained_variance_, expected_result) X_rpca = rpca.transform(X) assert_array_almost_equal(rpca.explained_variance_, np.var(X_rpca, ddof=1, axis=0), decimal=1) - - # Another way to run this part (according to the original definition) - expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0] - expected_result = sorted(expected_result, reverse=True)[:2] - X_pca = pca.transform(X) - assert_array_almost_equal(pca.explained_variance_, expected_result) - - X_pca = apca.transform(X) - assert_array_almost_equal(apca.explained_variance_, expected_result) - - X_rpca = rpca.transform(X) assert_array_almost_equal(rpca.explained_variance_, expected_result, decimal=1) From 8d13fcb0d3998455e63ff4c9fe454bea2ef672a0 Mon Sep 17 00:00:00 2001 From: qinhanmin2014 Date: Mon, 19 Jun 2017 22:14:56 +0800 Subject: [PATCH 7/9] make CI green --- sklearn/decomposition/pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 8fb5aba4ce372..c460f1e8ccfef 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -714,8 +714,8 @@ def _fit(self, X): n_iter=self.iterated_power, random_state=random_state) - self.explained_variance_ = exp_var = (S ** 2) / n_samples - full_var = np.var(X, axis=0).sum() + self.explained_variance_ = exp_var = (S ** 2) 
/ (n_samples - 1) + full_var = np.var(X, ddof=1, axis=0).sum() self.explained_variance_ratio_ = exp_var / full_var self.singular_values_ = S # Store the singular values. From 6496f5f3872bc6e6e5492a5e2f3864524305ce9c Mon Sep 17 00:00:00 2001 From: qinhanmin2014 Date: Wed, 21 Jun 2017 07:12:41 +0800 Subject: [PATCH 8/9] revert #7843 and add what's new --- doc/whats_new.rst | 6 ++++++ sklearn/decomposition/pca.py | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index d63b4d6115f30..05200f49358ed 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -372,6 +372,12 @@ Bug fixes - Add ``shuffle`` parameter to :func:`model_selection.train_test_split`. :issue:`#8845` by :user:`themrmax <themrmax>` + - Fixed the implementation of explained_variance_ + in :class:`decomposition.PCA`, + :class:`decomposition.RandomizedPCA` and + :class:`decomposition.IncrementalPCA`. + :issue:`#9105` by `Hanmin Qin <https://github.com/qinhanmin2014>`_. + API changes summary ------------------- diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index c460f1e8ccfef..6790c658bf7ef 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -285,12 +285,6 @@ class PCA(_BasePCA): >>> print(pca.singular_values_) # doctest: +ELLIPSIS [ 6.30061...] - Notes - ----- - PCA uses the maximum likelihood estimate of the eigenvalues, which does not - include the Bessel correction, though in practice this should rarely make a - difference in a machine learning context. 
- See also -------- KernelPCA From de70e9bd02a35c17c34c704d651fd4dfe1fae7e0 Mon Sep 17 00:00:00 2001 From: qinhanmin2014 Date: Wed, 21 Jun 2017 07:35:00 +0800 Subject: [PATCH 9/9] fix what's new --- doc/whats_new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 05200f49358ed..d1a8bd1d3ef6e 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -372,11 +372,11 @@ Bug fixes - Add ``shuffle`` parameter to :func:`model_selection.train_test_split`. :issue:`#8845` by :user:`themrmax <themrmax>` - - Fixed the implementation of explained_variance_ + - Fixed the implementation of `explained_variance_` in :class:`decomposition.PCA`, :class:`decomposition.RandomizedPCA` and :class:`decomposition.IncrementalPCA`. - :issue:`#9105` by `Hanmin Qin <https://github.com/qinhanmin2014>`_. + :issue:`9105` by `Hanmin Qin <https://github.com/qinhanmin2014>`_. API changes summary -------------------