scikit-learn
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
Lines changed: 28 additions & 17 deletions
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
Lines changed: 48 additions & 5 deletions
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
Lines changed: 45 additions & 10 deletions
@@ -27,6 +27,7 @@
 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
+from ..utils.sparsefuncs import mean_variance_axis
 
 
 def _assess_dimension_(spectrum, rank, n_samples, n_features):
@@ -370,14 +371,8 @@ def fit_transform(self, X, y=None):
 
     def _fit(self, X):
         """Dispatch to the right submethod depending on the chosen solver."""
-
-        # Raise an error for sparse input.
-        # This is more informative than the generic one raised by check_array.
-        if issparse(X):
-            raise TypeError('PCA does not support sparse input. See '
-                            'TruncatedSVD for a possible alternative.')
-
-        X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True,
+        X = check_array(X, accept_sparse=['csr', 'csc'],
+                        dtype=[np.float64, np.float32], ensure_2d=True,
                         copy=self.copy)
 
         # Handle n_components==None
@@ -392,15 +387,24 @@ def _fit(self, X):
         # Handle svd_solver
         self._fit_svd_solver = self.svd_solver
         if self._fit_svd_solver == 'auto':
+            # Sparse data can only be handled with the randomized solver
+            if issparse(X):
+                self._fit_svd_solver = 'randomized'
             # Small problem or n_components == 'mle', just call full PCA
-            if max(X.shape) <= 500 or n_components == 'mle':
+            elif max(X.shape) <= 500 or n_components == 'mle':
                 self._fit_svd_solver = 'full'
             elif n_components >= 1 and n_components < .8 * min(X.shape):
                 self._fit_svd_solver = 'randomized'
             # This is also the case of n_components in (0,1)
             else:
                 self._fit_svd_solver = 'full'
 
+        # Ensure we don't try to call arpack or full on a sparse matrix
+        if issparse(X) and self._fit_svd_solver != 'randomized':
+            raise ValueError(
+                'only the randomized solver supports sparse matrices'
+            )
+
         # Call different fits for either full or truncated SVD
         if self._fit_svd_solver == 'full':
             return self._fit_full(X, n_components)
@@ -503,11 +507,15 @@ def _fit_truncated(self, X, n_components, svd_solver):
 
         random_state = check_random_state(self.random_state)
 
-        # Center data
-        self.mean_ = np.mean(X, axis=0)
-        X -= self.mean_
+        if issparse(X):
+            self.mean_, total_var = mean_variance_axis(X, axis=0, ddof=1)
+        else:
+            self.mean_ = np.mean(X, axis=0)
+            total_var = np.var(X, axis=0, ddof=1)
 
         if svd_solver == 'arpack':
+            # Center data
+            X -= self.mean_
             # random init solution, as ARPACK does it internally
             v0 = random_state.uniform(-1, 1, size=min(X.shape))
             U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0)
@@ -519,18 +527,21 @@ def _fit_truncated(self, X, n_components, svd_solver):
 
         elif svd_solver == 'randomized':
             # sign flipping is done inside
-            U, S, V = randomized_svd(X, n_components=n_components,
-                                     n_iter=self.iterated_power,
-                                     flip_sign=True,
-                                     random_state=random_state)
+            U, S, V = randomized_svd(
+                X,
+                n_components=n_components,
+                n_iter=self.iterated_power,
+                flip_sign=True,
+                subtract_mean=True,
+                random_state=random_state,
+            )
 
         self.n_samples_, self.n_features_ = n_samples, n_features
         self.components_ = V
         self.n_components_ = n_components
 
         # Get variance explained by singular values
         self.explained_variance_ = (S ** 2) / (n_samples - 1)
-        total_var = np.var(X, ddof=1, axis=0)
         self.explained_variance_ratio_ = \
             self.explained_variance_ / total_var.sum()
         self.singular_values_ = S.copy()  # Store the singular values.
 
@@ -307,7 +307,7 @@ def test_singular_values():
     rpca.fit(X_hat)
     assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
     assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14)
-    assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14)
+    assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 2)
 
 
 def test_pca_check_projection():
@@ -683,15 +683,58 @@ def test_svd_solver_auto():
     assert_array_almost_equal(pca.components_, pca_test.components_)
 
 
-@pytest.mark.parametrize('svd_solver', solver_list)
-def test_pca_sparse_input(svd_solver):
+def test_pca_sparse_input_randomized_solver():
+    rng = np.random.RandomState(0)
+    n_samples = 100
+    n_features = 80
+
+    # The randomized method produces larger errors when the feature means
+    # are far from the origin, so use data centered well away from zero
+    X = rng.normal(1000, 20, (n_samples, n_features))
+
+    X_sp = sp.sparse.csr_matrix(X)
+    assert sp.sparse.issparse(X_sp)
+
+    # Compute the complete decomposition on the dense matrix
+    pca = PCA(n_components=3, svd_solver='full', random_state=rng).fit(X)
+    # And compute a randomized decomposition on the sparse matrix. Increase the
+    # number of power iterations to account for the non-zero means
+    pca_sp = PCA(
+        n_components=3,
+        svd_solver='randomized',
+        random_state=rng,
+        iterated_power=20,
+    ).fit(X_sp)
+
+    # Ensure the singular values are close to the exact singular values
+    assert_array_almost_equal(pca_sp.singular_values_, pca.singular_values_, 5)
+
+    # Ensure that the basis is close to the true basis
+    X_pca = pca.transform(X)
+    X_sppca = pca_sp.transform(X)
+    assert_array_almost_equal(X_sppca, X_pca, 2)
+
+
+@pytest.mark.parametrize('svd_solver', ['full', 'arpack'])
+def test_pca_sparse_input_bad_solvers(svd_solver):
     X = np.random.RandomState(0).rand(5, 4)
     X = sp.sparse.csr_matrix(X)
-    assert(sp.sparse.issparse(X))
+    assert sp.sparse.issparse(X)
 
     pca = PCA(n_components=3, svd_solver=svd_solver)
 
-    assert_raises(TypeError, pca.fit, X)
+    assert_raises(ValueError, pca.fit, X)
+
+
+def test_pca_auto_solver_selects_randomized_solver_for_sparse_matrices():
+    X = np.random.RandomState(0).rand(5, 4)
+    X = sp.sparse.csr_matrix(X)
+    assert sp.sparse.issparse(X)
+
+    pca = PCA(n_components=3, svd_solver='auto')
+    pca.fit(X)
+
+    assert_equal(pca._fit_svd_solver, 'randomized')
 
 
 def test_pca_bad_solver():
 
@@ -147,6 +147,7 @@ def safe_sparse_dot(a, b, dense_output=False):
 
 def randomized_range_finder(A, size, n_iter,
                             power_iteration_normalizer='auto',
+                            subtract_mean=False,
                             random_state=None):
     """Computes an orthonormal matrix whose range approximates the range of A.
 
@@ -171,6 +172,13 @@ def randomized_range_finder(A, size, n_iter,
 
         .. versionadded:: 0.18
 
+    subtract_mean : bool, optional (default=False)
+        Whether the column means of `A` should be subtracted after each
+        multiplication by `A`. This is equivalent to multiplying by a
+        centered `A` without ever having to center it explicitly. This is
+        especially useful for performing PCA on large sparse matrices, which
+        then do not need to be centered.
+
     random_state : int, RandomState instance or None, optional (default=None)
         The seed of the pseudo random number generator to use when shuffling
         the data.  If int, random_state is the seed used by the random number
@@ -211,28 +219,39 @@ def randomized_range_finder(A, size, n_iter,
         else:
             power_iteration_normalizer = 'LU'
 
+    if subtract_mean:
+        c = np.mean(A, axis=0).reshape((1, -1))  # column means, 1 x n
+        applyA = lambda X: safe_sparse_dot(A, X) - safe_sparse_dot(c, X)
+        applyAT = lambda X: safe_sparse_dot(A.T, X) - \
+                            safe_sparse_dot(c.T, X.sum(axis=0).reshape((1, -1)))
+    else:
+        applyA = lambda X: safe_sparse_dot(A, X)
+        applyAT = lambda X: safe_sparse_dot(A.T, X)
+
+    Q = applyA(Q)
+
     # Perform power iterations with Q to further 'imprint' the top
     # singular vectors of A in Q
     for i in range(n_iter):
         if power_iteration_normalizer == 'none':
-            Q = safe_sparse_dot(A, Q)
-            Q = safe_sparse_dot(A.T, Q)
+            Q = applyAT(Q)
+            Q = applyA(Q)
         elif power_iteration_normalizer == 'LU':
-            Q, _ = linalg.lu(safe_sparse_dot(A, Q), permute_l=True)
-            Q, _ = linalg.lu(safe_sparse_dot(A.T, Q), permute_l=True)
+            Q, _ = linalg.lu(applyAT(Q), permute_l=True)
+            Q, _ = linalg.lu(applyA(Q), permute_l=True)
         elif power_iteration_normalizer == 'QR':
-            Q, _ = linalg.qr(safe_sparse_dot(A, Q), mode='economic')
-            Q, _ = linalg.qr(safe_sparse_dot(A.T, Q), mode='economic')
+            Q, _ = linalg.qr(applyAT(Q), mode='economic')
+            Q, _ = linalg.qr(applyA(Q), mode='economic')
 
     # Sample the range of A using by linear projection of Q
     # Extract an orthonormal basis
-    Q, _ = linalg.qr(safe_sparse_dot(A, Q), mode='economic')
+    Q, _ = linalg.qr(Q, mode='economic')
     return Q
 
 
 def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
                    power_iteration_normalizer='auto', transpose='auto',
-                   flip_sign=True, random_state=0):
+                   flip_sign=True, subtract_mean=False, random_state=0):
     """Computes a truncated randomized SVD
 
     Parameters
@@ -283,6 +302,13 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
         set to `True`, the sign ambiguity is resolved by making the largest
         loadings for each component in the left singular vectors positive.
 
+    subtract_mean : bool, optional (default=False)
+        Whether the mean of `M` should be subtracted after each multiplication
+        by the `M` matrix. This is equivalent to multiplying matrices by a
+        centered `M` without ever having to explicitly center. This is
+        especially useful for performing PCA on large sparse matrices, so they
+        do not need to be centered.
+
     random_state : int, RandomState instance or None, optional (default=None)
         The seed of the pseudo random number generator to use when shuffling
         the data.  If int, random_state is the seed used by the random number
@@ -333,11 +359,20 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
         # this implementation is a bit faster with smaller shape[1]
         M = M.T
 
-    Q = randomized_range_finder(M, n_random, n_iter,
-                                power_iteration_normalizer, random_state)
+    Q = randomized_range_finder(
+        M,
+        size=n_random,
+        n_iter=n_iter,
+        power_iteration_normalizer=power_iteration_normalizer,
+        subtract_mean=subtract_mean,
+        random_state=random_state,
+    )
 
     # project M to the (k + p) dimensional space using the basis vectors
     B = safe_sparse_dot(Q.T, M)
+    if subtract_mean:
+        c = M.mean(axis=0).reshape((1, -1))
+        B -= np.dot(c.T, Q.sum(axis=0).reshape((1, -1))).T
 
     # compute the SVD on the thin matrix: (k + p) wide
     Uhat, s, V = linalg.svd(B, full_matrices=False)