8000 [MRG+2] Incorrect implementation of explained_variance_ in PCA (#9105) · scikit-learn/scikit-learn@2a36ff1 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2a36ff1

Browse files
qinhanmin2014 authored and amueller committed
[MRG+2] Incorrect implementation of explained_variance_ in PCA (#9105)
* fix pca explained_variance_ * fix fit_transform * fix test_whitening * fix IncrementalPCA * uncomment the test * improve test * make CI green * revert #7843 and add what's new * fix what's new
1 parent c2a42ef commit 2a36ff1

File tree

4 files changed

+26
-17
lines changed

4 files changed

+26
-17
lines changed

doc/whats_new.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,12 @@ Bug fixes
413413
by `Joel Nothman`_ and :user:`Jon Crall <Erotemic>`.
414414

415415

416+
- Fixed the implementation of `explained_variance_`
417+
in :class:`decomposition.PCA`,
418+
:class:`decomposition.RandomizedPCA` and
419+
:class:`decomposition.IncrementalPCA`.
420+
:issue:`9105` by `Hanmin Qin <https://github.com/qinhanmin2014>`_.
421+
416422
API changes summary
417423
-------------------
418424

sklearn/decomposition/incremental_pca.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ def partial_fit(self, X, y=None, check_input=True):
251251

252252
U, S, V = linalg.svd(X, full_matrices=False)
253253
U, V = svd_flip(U, V, u_based_decision=False)
254-
explained_variance = S ** 2 / n_total_samples
254+
explained_variance = S ** 2 / (n_total_samples - 1)
255255
explained_variance_ratio = S ** 2 / np.sum(col_var * n_total_samples)
256256

257257
self.n_samples_seen_ = n_total_samples

sklearn/decomposition/pca.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -285,12 +285,6 @@ class PCA(_BasePCA):
285285
>>> print(pca.singular_values_) # doctest: +ELLIPSIS
286286
[ 6.30061...]
287287
288-
Notes
289-
-----
290-
PCA uses the maximum likelihood estimate of the eigenvalues, which does not
291-
include the Bessel correction, though in practice this should rarely make a
292-
difference in a machine learning context.
293-
294288
See also
295289
--------
296290
KernelPCA
@@ -346,7 +340,7 @@ def fit_transform(self, X, y=None):
346340

347341
if self.whiten:
348342
# X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples)
349-
U *= sqrt(X.shape[0])
343+
U *= sqrt(X.shape[0] - 1)
350344
else:
351345
# X_new = X * V = U * S * V^T * V = U * S
352346
U *= S[:self.n_components_]
@@ -416,7 +410,7 @@ def _fit_full(self, X, n_components):
416410
components_ = V
417411

418412
# Get variance explained by singular values
419-
explained_variance_ = (S ** 2) / n_samples
413+
explained_variance_ = (S ** 2) / (n_samples - 1)
420414
total_var = explained_variance_.sum()
421415
explained_variance_ratio_ = explained_variance_ / total_var
422416
singular_values_ = S.copy() # Store the singular values.
@@ -495,8 +489,8 @@ def _fit_truncated(self, X, n_components, svd_solver):
495489
self.n_components_ = n_components
496490

497491
# Get variance explained by singular values
498-
self.explained_variance_ = (S ** 2) / n_samples
499-
total_var = np.var(X, axis=0)
492+
self.explained_variance_ = (S ** 2) / (n_samples - 1)
493+
total_var = np.var(X, ddof=1, axis=0)
500494
self.explained_variance_ratio_ = \
501495
self.explained_variance_ / total_var.sum()
502496
self.singular_values_ = S.copy() # Store the singular values.
@@ -714,8 +708,8 @@ def _fit(self, X):
714708
n_iter=self.iterated_power,
715709
random_state=random_state)
716710

717-
self.explained_variance_ = exp_var = (S ** 2) / n_samples
718-
full_var = np.var(X, axis=0).sum()
711+
self.explained_variance_ = exp_var = (S ** 2) / (n_samples - 1)
712+
full_var = np.var(X, ddof=1, axis=0).sum()
719713
self.explained_variance_ratio_ = exp_var / full_var
720714
self.singular_values_ = S # Store the singular values.
721715

sklearn/decomposition/tests/test_pca.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,8 @@ def test_whitening():
174174
X_whitened2 = pca.transform(X_)
175175
assert_array_almost_equal(X_whitened, X_whitened2)
176176

177-
assert_almost_equal(X_whitened.std(axis=0), np.ones(n_components),
177+
assert_almost_equal(X_whitened.std(ddof=1, axis=0),
178+
np.ones(n_components),
178179
decimal=6)
179180
assert_almost_equal(X_whitened.mean(axis=0), np.zeros(n_components))
180181

@@ -213,17 +214,25 @@ def test_explained_variance():
213214
rpca.explained_variance_ratio_, 1)
214215

215216
# compare to empirical variances
217+
expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0]
218+
expected_result = sorted(expected_result, reverse=True)[:2]
219+
216220
X_pca = pca.transform(X)
217221
assert_array_almost_equal(pca.explained_variance_,
218-
np.var(X_pca, axis=0))
222+
np.var(X_pca, ddof=1, axis=0))
223+
assert_array_almost_equal(pca.explained_variance_, expected_result)
219224

220225
X_pca = apca.transform(X)
221226
assert_array_almost_equal(apca.explained_variance_,
222-
np.var(X_pca, axis=0))
227+
np.var(X_pca, ddof=1, axis=0))
228+
assert_array_almost_equal(apca.explained_variance_, expected_result)
223229

224230
X_rpca = rpca.transform(X)
225-
assert_array_almost_equal(rpca.explained_variance_, np.var(X_rpca, axis=0),
231+
assert_array_almost_equal(rpca.explained_variance_,
232+
np.var(X_rpca, ddof=1, axis=0),
226233
decimal=1)
234+
assert_array_almost_equal(rpca.explained_variance_,
235+
expected_result, decimal=1)
227236

228237
# Same with correlated data
229238
X = datasets.make_classification(n_samples, n_features,

0 commit comments

Comments (0)