@@ -103,15 +103,10 @@ class PCA(_BasePCA):
     """Principal component analysis (PCA)

     Linear dimensionality reduction using Singular Value Decomposition of the
-    data and keeping only the most significant singular vectors to project the
-    data to a lower dimensional space.
+    data to project it to a lower dimensional space.

-    This implementation uses the scipy.linalg implementation of the singular
-    value decomposition. It only works for dense arrays and is not scalable to
-    large dimensional data.
-
-    The time complexity of this implementation is ``O(n ** 3)`` assuming
-    n ~ n_samples ~ n_features.
+    It uses the scipy.linalg implementation of the SVD or a randomized SVD
+    by the method of Halko et al. 2009, whichever is the most efficient.

     Read more in the :ref:`User Guide <PCA>`.

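Note for reviewers: both decomposition paths named in the new docstring already exist; a minimal standalone sketch (toy data, illustrative names) comparing them:

import numpy as np
from scipy import linalg
from sklearn.utils.extmath import randomized_svd

X = np.random.RandomState(0).randn(100, 5)
Xc = X - X.mean(axis=0)                       # PCA centers the data first

U, S, V = linalg.svd(Xc, full_matrices=False)          # exact, 'full' path
_, Sr, _ = randomized_svd(Xc, n_components=2,
                          n_iter=3, random_state=0)    # 'randomized' path (Halko et al.)

print(np.allclose(S[:2], Sr))  # leading singular values agree on this toy problem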
@@ -128,6 +123,7 @@ class PCA(_BasePCA):
         if ``0 < n_components < 1`` and svd_solver == 'full', select the number
         of components such that the amount of variance that needs to be
         explained is greater than the percentage specified by n_components
+        n_components cannot be equal to n_features for svd_solver == 'arpack'.

     copy : bool (default True)
         If False, data passed to fit are overwritten and running
@@ -146,22 +142,22 @@ class PCA(_BasePCA):

     svd_solver : string (default 'auto')
         The algorithm that runs SVD
-        if svd_solver == 'full', run exact SVD and select the components as
+        if svd_solver == 'full', run exact SVD and select the components by
         postprocessing
         if svd_solver == 'arpack', run SVD truncated to n_components calling
        `scipy.sparse.linalg.svds`. 0 < n_components < X.shape[1] (strictly)
         if svd_solver == 'randomized', run randomized SVD by the method of
         Halko et al.
-        if svd_solver == 'auto'
+        if svd_solver == 'auto':
            if n_components >= .8 * min(n_samples, n_features), run with 'full'
            otherwise 'randomized'

-    tol : float >= 0, optional (deaful .0)
-        Tolerance for singular values computed by svd_solver == 'arpack'.
+    tol : float >= 0, optional (default .0)
+        Tolerance for singular values computed by svd_solver == 'arpack'.

     iterated_power : int >= 0, optional (default 3)
-        Number of iterations for the power method computed by svd_solver ==
-        'randomized'.
+        Number of iterations for the power method computed by
+        svd_solver == 'randomized'.

     random_state : int or RandomState instance or None (default None)
         Pseudo Random Number generator seed control. If None, use the
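The 'auto' rule documented above can be restated as a standalone helper (a sketch mirroring the policy implemented in _fit below; the function name is illustrative):

def choose_solver(n_components, n_samples, n_features):
    # Large component counts go to the exact solver, small ones to the
    # randomized solver, per the docstring above.
    if n_components >= .8 * min(n_samples, n_features):
        return 'full'
    return 'randomized'

print(choose_solver(2, 100, 5))   # 'randomized', since 2 < 0.8 * 5
print(choose_solver(5, 100, 5))   # 'full', since 5 >= 0.8 * 5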
@@ -185,9 +181,10 @@ class PCA(_BasePCA):
         Per-feature empirical mean, estimated from the training set.

     n_components_ : int
-        The estimated number of components. Relevant when n_components is set
-        to 'mle' or a number between 0 and 1 to select using explained
-        variance.
+        The estimated number of components. When n_components is set
+        to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this
+        number is estimated from input data. Otherwise it equals the parameter
+        n_components, or n_features if n_components is None.

     noise_variance_ : float
         The estimated noise covariance following the Probabilistic PCA model
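To illustrate the reworded attribute, assuming this diff is applied (toy data; the count selected by a float n_components depends on the data):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(50, 10)

print(PCA(n_components=0.9, svd_solver='full').fit(X).n_components_)  # estimated from data
print(PCA(n_components=3).fit(X).n_components_)                       # 3, echoes the parameter
print(PCA().fit(X).n_components_)                                     # 10, i.e. n_features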
@@ -198,7 +195,7 @@ class PCA(_BasePCA):

     References
     -----
-    For n_components= 'mle', this class uses the method of `Thomas P. Minka:
+    For n_components == 'mle', this class uses the method of `Thomas P. Minka:
     Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604`

     Implements the probabilistic PCA model from:
@@ -207,19 +204,21 @@ class PCA(_BasePCA):
     via the score and score_samples methods.
     See http://www.miketipping.com/papers/met-mppca.pdf

-    Due to implementation subtleties of the Singular Value Decomposition (SVD),
-    which is used in this implementation, running fit twice on the same matrix
-    can lead to principal components with signs flipped (change in direction).
-    For this reason, it is important to always use the same estimator object to
-    transform data in a consistent fashion.
+    For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`.

-    [Halko2009] `Finding structure with randomness: Stochastic algorithms
+    For svd_solver == 'randomized', see:
+    `Finding structure with randomness: Stochastic algorithms
     for constructing approximate matrix decompositions Halko, et al., 2009
     (arXiv:909)`
-
-    [MRT] `A randomized algorithm for the decomposition of matrices
+    `A randomized algorithm for the decomposition of matrices
     Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert`

+    Due to implementation subtleties of the Singular Value Decomposition (SVD),
+    which is used in this implementation, running it twice on the same matrix
+    can lead to principal components with signs flipped (change in direction).
+    For this reason, it is important to always use the same estimator object to
+    transform data in a consistent fashion.
+
     Examples
     --------
     >>> import numpy as np
@@ -232,19 +231,26 @@ class PCA(_BasePCA):
     >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
     [ 0.99244...  0.00755...]

-
-    >>> pca = PCA(n_components=2, svd_solver='randomized')
+    >>> pca = PCA(n_components=2, svd_solver='full')
     >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     PCA(copy=True, iterated_power=3, n_components=2, random_state=None,
-      svd_solver='randomized', tol=0.0, whiten=False)
+      svd_solver='full', tol=0.0, whiten=False)
     >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
     [ 0.99244...  0.00755...]

+    >>> pca = PCA(n_components=1, svd_solver='arpack')
+    >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    PCA(copy=True, iterated_power=3, n_components=1, random_state=None,
+      svd_solver='arpack', tol=0.0, whiten=False)
+    >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
+    [ 0.99244...]
+
     See also
     --------
     KernelPCA
     SparsePCA
     TruncatedSVD
+    IncrementalPCA
     """

     def __init__(self, n_components=None, copy=True, whiten=False,
@@ -302,55 +308,45 @@ def fit_transform(self, X, y=None):
         return U

     def _fit(self, X):
-        """Fit the model on X
-
-        Parameters
-        ----------
-        X: array-like, shape (n_samples, n_features)
-            Training vector, where n_samples in the number of samples and
-            n_features is the number of features.
-
-        Returns
-        -------
-        U, s, V : ndarrays
-            The SVD of the input data, copied and centered when
-            requested.
+        """Dispatch the actual fitting to _fit_full or _fit_truncated, after
+        handling the svd_solver='auto' policy.
         """
         X = check_array(X, dtype=[np.float32, np.float64], ensure_2d=True,
                         copy=self.copy)
-        self.n_samples_, self.n_features_ = X.shape

-        # Handle n_components
+        # Handle n_components==None
         if self.n_components is None:
-            self.n_components_ = self.n_features_
+            n_components = X.shape[1]
         else:
-            self.n_components_ = self.n_components

         # Handle svd_solver
         svd_solver = self.svd_solver
         if svd_solver == 'auto':
-            if self.n_components_ < .8 * min(X.shape):
+            if n_components < .8 * min(X.shape):
                 svd_solver = 'randomized'
             else:
                 svd_solver = 'full'

         # Call different fits, whether we compute full or truncated SVD
         if svd_solver == 'full':
-            return self._fit_full(X)
+            return self._fit_full(X, n_components)
         elif svd_solver in ['arpack', 'randomized']:
-            return self._fit_truncated(X, svd_solver)
+            return self._fit_truncated(X, n_components, svd_solver)

-    def _fit_full(self, X):
+    def _fit_full(self, X, n_components):
         """Fit the model by computing full SVD on X
         """
-        n_samples, n_features = self.n_samples_, self.n_features_
-        n_components = self.n_components_
+        n_samples, n_features = X.shape
+
         if n_components == 'mle':
             if n_samples < n_features:
                 raise ValueError("n_components='mle' is only supported "
                                  "if n_samples >= n_features")
         elif not 0 <= n_components <= n_features:
-            raise ValueError("n_components=%r invalid. See the documentation")
+            raise ValueError("n_components=%r must be between 0 and "
+                             "n_features=%r with svd_solver='full'"
+                             % (n_components, n_features))

         # Center data
         self.mean_ = np.mean(X, axis=0)
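With the diff applied, the sharper 'full' validation message can be exercised directly (sketch):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(20, 5)
try:
    PCA(n_components=7, svd_solver='full').fit(X)
except ValueError as e:
    print(e)  # n_components=7 must be between 0 and n_features=5 with svd_solver='full'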
@@ -381,6 +377,7 @@ def _fit_full(self, X):
         else:
             self.noise_variance_ = 0.

+        self.n_samples_, self.n_features_ = n_samples, n_features
         self.components_ = components_[:n_components]
         self.n_components_ = n_components
         self.explained_variance_ = explained_variance_[:n_components]
@@ -389,18 +386,24 @@ def _fit_full(self, X):

         return U, S, V

-    def _fit_truncated(self, X, svd_solver):
+    def _fit_truncated(self, X, n_components, svd_solver):
         """Fit the model by computing truncated SVD (by Arpack or randomized)
         on X
         """
-        n_samples, n_features = self.n_samples_, self.n_features_
-        n_components = self.n_components_
-        if not 1 <= n_components <= n_features:
-            raise ValueError("n_components=%r invalid for svd_solver='%s'"
+        n_samples, n_features = X.shape
+
+        if type(n_components) == str:
+            raise ValueError("n_components=%r cannot be a string "
+                             "with svd_solver='%s'"
                              % (n_components, svd_solver))
+        elif not 1 <= n_components <= n_features:
+            raise ValueError("n_components=%r must be between 1 and "
+                             "n_features=%r with svd_solver='%s'"
+                             % (n_components, n_features, svd_solver))
         elif svd_solver == 'arpack' and n_components == n_features:
-            raise ValueError("n_components=%r invalid for svd_solver='%s'"
-                             % (n_components, svd_solver))
+            raise ValueError("n_components=%r must be strictly less than "
+                             "n_features=%r with svd_solver='%s'"
+                             % (n_components, n_features, svd_solver))

         # Center data
         self.mean_ = np.mean(X, axis=0)
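Likewise, the two new truncated-solver branches (sketch, assuming the diff is applied):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(20, 5)

for bad in (PCA(n_components='mle', svd_solver='arpack'),   # strings are rejected
            PCA(n_components=5, svd_solver='arpack')):      # must be < n_features
    try:
        bad.fit(X)
    except ValueError as e:
        print(e)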
@@ -418,7 +421,9 @@ def _fit_truncated(self, X, svd_solver):
                                        n_iter=self.iterated_power,
                                        random_state=random_state)

+        self.n_samples_, self.n_features_ = n_samples, n_features
         self.components_ = V
+        self.n_components_ = n_components

         # Get variance explained by singular values
         self.explained_variance_ = (S ** 2) / n_samples
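The bookkeeping above rests on the identity sum(S ** 2) / n_samples == total variance of the centered data; a standalone numpy sanity check:

import numpy as np
from scipy import linalg

X = np.random.RandomState(0).randn(100, 5)
Xc = X - X.mean(axis=0)
_, S, _ = linalg.svd(Xc, full_matrices=False)

explained_variance = (S ** 2) / 100  # what the fit stores
print(np.allclose(explained_variance.sum(), Xc.var(axis=0).sum()))  # True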
@@ -433,37 +438,6 @@ def _fit_truncated(self, X, svd_solver):

         return U, S, V

-    def get_precision(self):
-        """Compute data precision matrix with the generative model.
-
-        Equals the inverse of the covariance but computed with
-        the matrix inversion lemma for efficiency.
-
-        Returns
-        -------
-        precision : array, shape=(n_features, n_features)
-            Estimated precision of data.
-        """
-        n_features = self.n_features_
-
-        # handle corner cases first
-        if self.n_components_ == 0:
-            return np.eye(n_features) / self.noise_variance_
-        if self.n_components_ == n_features:
-            return linalg.inv(self.get_covariance())
-
-        # Get precision using matrix inversion lemma
-        components_ = self.components_
-        exp_var = self.explained_variance_
-        exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.)
-        precision = np.dot(components_, components_.T) / self.noise_variance_
-        precision.flat[::len(precision) + 1] += 1. / exp_var_diff
-        precision = np.dot(components_.T,
-                           np.dot(linalg.inv(precision), components_))
-        precision /= -(self.noise_variance_ ** 2)
-        precision.flat[::len(precision) + 1] += 1. / self.noise_variance_
-        return precision
-
     def score_samples(self, X):
         """Return the log-likelihood of each sample

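get_precision is deleted here, presumably because it now lives on the _BasePCA base class. For reviewers, a standalone numpy check that its matrix-inversion-lemma shortcut equals a direct inverse of the probabilistic PCA covariance (illustrative values):

import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
Q, _ = linalg.qr(rng.randn(5, 2), mode='economic')
W = Q.T                                 # components_: 2 orthonormal rows
exp_var = np.array([3., 2.])            # explained_variance_
sigma2 = 0.5                            # noise_variance_

# Covariance under the probabilistic PCA model
cov = W.T.dot(np.diag(exp_var - sigma2)).dot(W) + sigma2 * np.eye(5)

# Same steps as the removed method
small = W.dot(W.T) / sigma2
small.flat[::small.shape[0] + 1] += 1. / (exp_var - sigma2)
precision = W.T.dot(linalg.inv(small)).dot(W)
precision /= -(sigma2 ** 2)
precision.flat[::precision.shape[0] + 1] += 1. / sigma2

print(np.allclose(precision, linalg.inv(cov)))  # True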
@@ -514,7 +488,8 @@ def score(self, X, y=None):


 @deprecated("it will be removed in 0.19. Use PCA(svd_solver='randomized') "
-            "instead")
+            "instead. The new implementation DOES NOT store "
+            "whitened components_. Apply transform to get them.")
 def RandomizedPCA(n_components=None, copy=True, iterated_power=3,
                   whiten=False, random_state=None):
     return PCA(n_components=n_components, copy=copy, whiten=whiten,
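For downstream users, the migration the new deprecation message asks for (a sketch; whiten=True is where behavior changes, since whitened components are no longer stored):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(100, 5)

# Before: pca = RandomizedPCA(n_components=2, whiten=True).fit(X)
pca = PCA(n_components=2, svd_solver='randomized', whiten=True,
          random_state=0).fit(X)
X_whitened = pca.transform(X)  # whitening is applied in transform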