scikit-learn/scikit-learn, commit fe3d3a8
Add randomized solver for PCA that does not require centering
1 parent 633e3ca commit fe3d3a8

File tree: 3 files changed, +213 -23 lines changed

sklearn/decomposition/pca.py

Lines changed: 22 additions & 15 deletions
@@ -24,9 +24,10 @@
 from .base import _BasePCA
 from ..utils import check_random_state
 from ..utils import check_array
-from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
+from ..utils.extmath import fast_logdet, randomized_pca, svd_flip
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
+from ..utils.sparsefuncs import mean_variance_axis


 def _assess_dimension_(spectrum, rank, n_samples, n_features):

@@ -370,14 +371,8 @@ def fit_transform(self, X, y=None):

     def _fit(self, X):
         """Dispatch to the right submethod depending on the chosen solver."""
-        # Raise an error for sparse input.
-        # This is more informative than the generic one raised by check_array.
-        if issparse(X):
-            raise TypeError('PCA does not support sparse input. See '
-                            'TruncatedSVD for a possible alternative.')
-
-        X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True,
+        X = check_array(X, accept_sparse=['csr', 'csc'],
+                        dtype=[np.float64, np.float32], ensure_2d=True,
                         copy=self.copy)

         # Handle n_components==None

@@ -392,15 +387,24 @@ def _fit(self, X):
         # Handle svd_solver
         self._fit_svd_solver = self.svd_solver
         if self._fit_svd_solver == 'auto':
+            # Sparse data can only be handled with the randomized solver
+            if issparse(X):
+                self._fit_svd_solver = 'randomized'
             # Small problem or n_components == 'mle', just call full PCA
-            if max(X.shape) <= 500 or n_components == 'mle':
+            elif max(X.shape) <= 500 or n_components == 'mle':
                 self._fit_svd_solver = 'full'
             elif n_components >= 1 and n_components < .8 * min(X.shape):
                 self._fit_svd_solver = 'randomized'
             # This is also the case of n_components in (0,1)
             else:
                 self._fit_svd_solver = 'full'

+        # Ensure we don't try to call arpack or full on a sparse matrix
+        if issparse(X) and self._fit_svd_solver != 'randomized':
+            raise ValueError(
+                'only the randomized solver supports sparse matrices'
+            )
+
         # Call different fits for either full or truncated SVD
         if self._fit_svd_solver == 'full':
             return self._fit_full(X, n_components)

@@ -503,11 +507,15 @@ def _fit_truncated(self, X, n_components, svd_solver):

         random_state = check_random_state(self.random_state)

-        # Center data
-        self.mean_ = np.mean(X, axis=0)
-        X -= self.mean_
+        if issparse(X):
+            self.mean_, total_var = mean_variance_axis(X, axis=0, ddof=1)
+        else:
+            self.mean_ = np.mean(X, axis=0)
+            total_var = np.var(X, axis=0, ddof=1)

         if svd_solver == 'arpack':
+            # Center data
+            X -= self.mean_
             # random init solution, as ARPACK does it internally
             v0 = random_state.uniform(-1, 1, size=min(X.shape))
             U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0)

@@ -519,7 +527,7 @@ def _fit_truncated(self, X, n_components, svd_solver):

         elif svd_solver == 'randomized':
             # sign flipping is done inside
-            U, S, V = randomized_svd(X, n_components=n_components,
+            U, S, V = randomized_pca(X, n_components=n_components,
                                      n_iter=self.iterated_power,
                                      flip_sign=True,
                                      random_state=random_state)

@@ -530,7 +538,6 @@ def _fit_truncated(self, X, n_components, svd_solver):

         # Get variance explained by singular values
         self.explained_variance_ = (S ** 2) / (n_samples - 1)
-        total_var = np.var(X, ddof=1, axis=0)
         self.explained_variance_ratio_ = \
             self.explained_variance_ / total_var.sum()
         self.singular_values_ = S.copy()  # Store the singular values.

sklearn/decomposition/tests/test_pca.py

Lines changed: 44 additions & 8 deletions
@@ -14,6 +14,7 @@
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import ignore_warnings
 from sklearn.utils.testing import assert_less
+from sklearn.utils.testing import assert_allclose

 from sklearn import datasets
 from sklearn.decomposition import PCA

@@ -260,11 +261,11 @@ def test_singular_values():
               random_state=rng).fit(X)
     apca = PCA(n_components=2, svd_solver='arpack',
                random_state=rng).fit(X)
-    rpca = PCA(n_components=2, svd_solver='randomized',
+    rpca = PCA(n_components=2, svd_solver='randomized', iterated_power=40,
                random_state=rng).fit(X)
     assert_array_almost_equal(pca.singular_values_, apca.singular_values_, 12)
-    assert_array_almost_equal(pca.singular_values_, rpca.singular_values_, 1)
-    assert_array_almost_equal(apca.singular_values_, rpca.singular_values_, 1)
+    assert_array_almost_equal(pca.singular_values_, rpca.singular_values_, 12)
+    assert_array_almost_equal(apca.singular_values_, rpca.singular_values_, 12)

     # Compare to the Frobenius norm
     X_pca = pca.transform(X)

@@ -283,7 +284,7 @@ def test_singular_values():
     assert_array_almost_equal(apca.singular_values_,
                               np.sqrt(np.sum(X_apca**2.0, axis=0)), 12)
     assert_array_almost_equal(rpca.singular_values_,
-                              np.sqrt(np.sum(X_rpca**2.0, axis=0)), 2)
+                              np.sqrt(np.sum(X_rpca**2.0, axis=0)), 12)

     # Set the singular values and see what we get back
     rng = np.random.RandomState(0)

@@ -305,6 +306,7 @@ def test_singular_values():
     pca.fit(X_hat)
     apca.fit(X_hat)
     rpca.fit(X_hat)
+
     assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
     assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14)
     assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14)

@@ -683,15 +685,49 @@ def test_svd_solver_auto():
     assert_array_almost_equal(pca.components_, pca_test.components_)


-@pytest.mark.parametrize('svd_solver', solver_list)
-def test_pca_sparse_input(svd_solver):
+def test_pca_sparse_input_randomized_solver():
+    rng = np.random.RandomState(0)
+    n_samples = 100
+    n_features = 80
+
+    X = rng.binomial(1, 0.1, (n_samples, n_features))
+    X_sp = sp.sparse.csr_matrix(X)
+
+    # Compute the complete decomposition on the dense matrix
+    pca = PCA(n_components=3, svd_solver='randomized',
+              random_state=0).fit(X)
+    # And compute a randomized decomposition on the sparse matrix. Increase
+    # the number of power iterations to account for the non-zero means
+    pca_sp = PCA(n_components=3, svd_solver='randomized',
+                 random_state=0).fit(X_sp)
+
+    # Ensure the singular values are close to the exact singular values
+    assert_allclose(pca_sp.singular_values_, pca.singular_values_)
+
+    # Ensure that the basis is close to the true basis
+    X_pca = pca.transform(X)
+    X_sppca = pca_sp.transform(X)
+    assert_allclose(X_sppca, X_pca)
+
+
+@pytest.mark.parametrize('svd_solver', ['full', 'arpack'])
+def test_pca_sparse_input_bad_solvers(svd_solver):
     X = np.random.RandomState(0).rand(5, 4)
     X = sp.sparse.csr_matrix(X)
-    assert(sp.sparse.issparse(X))

     pca = PCA(n_components=3, svd_solver=svd_solver)

-    assert_raises(TypeError, pca.fit, X)
+    assert_raises(ValueError, pca.fit, X)
+
+
+def test_pca_auto_solver_selects_randomized_solver_for_sparse_matrices():
+    X = np.random.RandomState(0).rand(5, 4)
+    X = sp.sparse.csr_matrix(X)
+
+    pca = PCA(n_components=3, svd_solver='auto')
+    pca.fit(X)
+
+    assert pca._fit_svd_solver == 'randomized'


 def test_pca_bad_solver():
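
The dense/sparse equivalence these tests assert can also be exercised directly against the new extmath helper. A standalone sketch, assuming randomized_pca is importable from sklearn.utils.extmath once this commit is applied (the data and tolerances are illustrative):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.extmath import randomized_pca

rng = np.random.RandomState(0)
X = rng.binomial(1, 0.1, (100, 80)).astype(np.float64)
X_sp = sp.csr_matrix(X)

# Same seed for both calls, so the random range-finding matrices match
U, s, V = randomized_pca(X, n_components=3, random_state=0)
U_sp, s_sp, V_sp = randomized_pca(X_sp, n_components=3, random_state=0)

# Dense and sparse inputs should agree to floating-point accuracy;
# compare absolute values of V to sidestep any residual sign ambiguity
assert np.allclose(s, s_sp)
assert np.allclose(np.abs(V), np.abs(V_sp))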

sklearn/utils/extmath.py

Lines changed: 147 additions & 0 deletions
@@ -360,6 +360,153 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
     return U[:, :n_components], s[:n_components], V[:n_components, :]


+def _normalize_power_iteration(x, power_iteration_normalizer):
+    """Normalize the matrix when doing power iterations for stability."""
+    if power_iteration_normalizer == "none":
+        return x
+    elif power_iteration_normalizer == "LU":
+        Q, _ = linalg.lu(x, permute_l=True)
+        return Q
+    elif power_iteration_normalizer == "QR":
+        Q, _ = linalg.qr(x, mode="economic")
+        return Q
+    else:
+        raise ValueError("Unrecognized normalization method `%s`" %
+                         power_iteration_normalizer)
+
+
+def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
+                   power_iteration_normalizer="auto", flip_sign=True,
+                   random_state=0):
+    """Compute a truncated randomized PCA decomposition.
+
+    Parameters
+    ----------
+    A : ndarray or sparse matrix
+        Matrix to decompose.
+
+    n_components : int
+        Number of singular values and vectors to extract.
+
+    n_oversamples : int (default is 10)
+        Additional number of random vectors used to sample the range of A
+        so as to ensure proper conditioning. The total number of random
+        vectors used to find the range of A is n_components + n_oversamples.
+        A smaller number can improve speed but can negatively impact the
+        quality of the approximation of the singular vectors and values.
+
+    n_iter : int or 'auto' (default is 'auto')
+        Number of power iterations. It can be used to deal with very noisy
+        problems. When 'auto', it is set to 4, unless `n_components` is
+        small (< .1 * min(A.shape)), in which case `n_iter` is set to 7.
+        This improves precision with few components.
+
+    power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none'
+        Whether the power iterations are normalized with step-by-step
+        QR factorization (the slowest but most accurate), 'none'
+        (the fastest but numerically unstable when `n_iter` is large, e.g.
+        typically 5 or larger), or 'LU' factorization (numerically stable
+        but can lose slightly in accuracy). The 'auto' mode applies no
+        normalization if `n_iter` <= 2 and switches to LU otherwise.
+
+    flip_sign : boolean (True by default)
+        The output of a singular value decomposition is only unique up to a
+        permutation of the signs of the singular vectors. If `flip_sign` is
+        set to `True`, the sign ambiguity is resolved by making the largest
+        loadings for each component in the left singular vectors positive.
+
+    random_state : int, RandomState instance or None, optional (default=0)
+        The seed of the pseudo random number generator to use when sampling
+        the random vectors. If int, random_state is the seed used by the
+        random number generator; if RandomState instance, random_state is
+        the random number generator; if None, the random number generator
+        is the RandomState instance used by `np.random`.
+
+    Notes
+    -----
+    This algorithm finds a (usually very good) approximate truncated
+    principal component analysis decomposition using randomized methods to
+    speed up the computations. It is particularly useful on large, sparse
+    matrices since this implementation does not require centering the
+    original matrix (centering would densify potentially large sparse
+    matrices, leading to memory issues). For a further speed-up, `n_iter`
+    can be set <= 2 (at the cost of a loss of precision).
+
+    References
+    ----------
+    * Algorithm 971: An implementation of a randomized algorithm for
+      principal component analysis
+      Li, Huamin, et al. 2017
+
+    """
+    # Accept int seeds and None, as randomized_svd does
+    random_state = check_random_state(random_state)
+
+    if n_iter == "auto":
+        # If the number of iterations is not explicitly specified, adjust
+        # n_iter. 7 was found a good compromise for PCA. See #5299
+        n_iter = 7 if n_components < .1 * min(A.shape) else 4
+
+    # Deal with "auto" mode
+    if power_iteration_normalizer == "auto":
+        if n_iter <= 2:
+            power_iteration_normalizer = "none"
+        else:
+            power_iteration_normalizer = "LU"
+
+    n_samples, n_features = A.shape
+
+    # Row vector of column means: centering A amounts to subtracting the
+    # rank-one matrix np.ones((n_samples, 1)).dot(c), so every product with
+    # the centered matrix below is written as a product with A plus a
+    # rank-one correction, keeping A sparse throughout
+    c = np.atleast_2d(A.mean(axis=0))
+
+    if n_samples >= n_features:
+        Q = random_state.normal(size=(n_features,
+                                      n_components + n_oversamples))
+        if A.dtype.kind == "f":
+            # Ensure f32 is preserved as f32
+            Q = Q.astype(A.dtype, copy=False)
+
+        Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
+
+        # Normalized power iterations
+        for _ in range(n_iter):
+            Q = safe_sparse_dot(A.T, Q) - \
+                safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
+            Q = _normalize_power_iteration(Q, power_iteration_normalizer)
+            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
+            Q = _normalize_power_iteration(Q, power_iteration_normalizer)
+
+        Q, _ = linalg.qr(Q, mode="economic")
+
+        QA = safe_sparse_dot(A.T, Q) - \
+            safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
+        R, s, V = linalg.svd(QA.T, full_matrices=False)
+        U = Q.dot(R)
+
+    else:  # n_features > n_samples
+        Q = random_state.normal(size=(n_samples,
+                                      n_components + n_oversamples))
+        if A.dtype.kind == "f":
+            # Ensure f32 is preserved as f32
+            Q = Q.astype(A.dtype, copy=False)
+
+        Q = safe_sparse_dot(A.T, Q) - \
+            safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
+
+        # Normalized power iterations
+        for _ in range(n_iter):
+            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
+            Q = _normalize_power_iteration(Q, power_iteration_normalizer)
+            Q = safe_sparse_dot(A.T, Q) - \
+                safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
+            Q = _normalize_power_iteration(Q, power_iteration_normalizer)
+
+        Q, _ = linalg.qr(Q, mode="economic")
+
+        QA = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
+        U, s, R = linalg.svd(QA, full_matrices=False)
+        V = R.dot(Q.T)
+
+    if flip_sign:
+        U, V = svd_flip(U, V)
+
+    return U[:, :n_components], s[:n_components], V[:n_components, :]
+
+
 def weighted_mode(a, w, axis=0):
     """Returns an array of the weighted modal (most common) value in a
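
The implementation above never forms the centered matrix explicitly: every product with it is rewritten as a product with A plus a rank-one correction built from the row vector of column means c. A few lines of NumPy verify that identity (a standalone sketch of the algebra, not code from the commit):

import numpy as np

rng = np.random.RandomState(0)
A = rng.rand(6, 4)
c = np.atleast_2d(A.mean(axis=0))      # (1, n_features) row of column means
ones = np.ones((A.shape[0], 1))

# Explicitly centered matrix: subtract the rank-one matrix ones.dot(c)
B = A - ones.dot(c)

# Forward product B.dot(Q): subtract the broadcast row vector c.dot(Q)
Q = rng.normal(size=(4, 3))
assert np.allclose(B.dot(Q), A.dot(Q) - c.dot(Q))

# Transposed product B.T.dot(P): subtract the outer product of the
# column means with the column sums of P
P = rng.normal(size=(6, 3))
assert np.allclose(B.T.dot(P),
                   A.T.dot(P) - c.T.dot(P.sum(axis=0)[None, :]))

Because only A itself ever appears in a matrix product, sparse inputs stay sparse through all the power iterations.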
