docstring fixed · scikit-learn/scikit-learn@669f136 · GitHub

Commit 669f136

giorgiop committed: docstring fixed
1 parent 3125d33 commit 669f136

File tree: 2 files changed, +106 −111 lines changed


sklearn/decomposition/pca.py

Lines changed: 67 additions & 92 deletions
```diff
@@ -103,15 +103,10 @@ class PCA(_BasePCA):
     """Principal component analysis (PCA)
 
     Linear dimensionality reduction using Singular Value Decomposition of the
-    data and keeping only the most significant singular vectors to project the
-    data to a lower dimensional space.
+    data to project it to a lower dimensional space.
 
-    This implementation uses the scipy.linalg implementation of the singular
-    value decomposition. It only works for dense arrays and is not scalable to
-    large dimensional data.
-
-    The time complexity of this implementation is ``O(n ** 3)`` assuming
-    n ~ n_samples ~ n_features.
+    It uses the scipy.linalg implementation of the SVD or a randomized SVD
+    by the method of Halko et al. 2009, depending on which is the most efficient.
 
     Read more in the :ref:`User Guide <PCA>`.
```
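For orientation, the randomized path referenced in the new description is the Halko-style solver that `_fit_truncated` delegates to further down in this diff. A minimal standalone sketch of that primitive (parameter choices here are illustrative, not taken from the commit):

```python
# Standalone use of the Halko et al. randomized SVD primitive that the
# 'randomized' path delegates to (see the randomized_svd call later in
# this diff). Shapes and parameters here are illustrative.
import numpy as np
from sklearn.utils.extmath import randomized_svd

X = np.random.RandomState(0).randn(100, 20)
X = X - X.mean(axis=0)            # PCA centers the data before the SVD

U, S, V = randomized_svd(X, n_components=5, n_iter=3, random_state=0)
print(U.shape, S.shape, V.shape)  # (100, 5) (5,) (5, 20)
```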
```diff
@@ -128,6 +123,7 @@ class PCA(_BasePCA):
         if ``0 < n_components < 1`` and svd_solver == 'full', select the number
         of components such that the amount of variance that needs to be
         explained is greater than the percentage specified by n_components
+        n_components cannot be equal to n_features for svd_solver == 'arpack'.
 
     copy : bool (default False)
         If False, data passed to fit are overwritten and running
```
```diff
@@ -146,22 +142,22 @@ class PCA(_BasePCA):
 
     svd_solver : string (default 'auto')
         The algorithm that runs SVD
-        if svd_solver == 'full', run exact SVD and select the components as
+        if svd_solver == 'full', run exact SVD and select the components by
         postprocessing
         if svd_solver == 'arpack', run SVD truncated to n_components calling
         `scipy.sparse.linalg.svds`. 0 < n_components < X.shape[1] (stricly)
         if svd_solver == 'randomized', run randomized SVD by the method of
         Halko et al.
-        if svd_solver == 'auto'
+        if svd_solver == 'auto':
         if n_components >= .8 * min(n_samples, n_features), run with 'full'
         otherwise 'randomized'
 
-    tol : float >= 0, optional (deaful .0)
-        Tolerance for singular values computed by svd_solver == 'arpack'.
+    tol : float >= 0, optional (default .0)
+        Tolerance for singular values computed by svd_solver=='arpack'.
 
     iterated_power : int >= 0, optional (default 3)
-        Number of iterations for the power method computed by svd_solver ==
-        'randomized'.
+        Number of iterations for the power method computed by
+        svd_solver=='randomized'.
 
     random_state : int or RandomState instance or None (default None)
         Pseudo Random Number generator seed control. If None, use the
```
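The 'auto' rule documented above is just a size test on `n_components` against the smaller data dimension. A minimal sketch of the policy (the helper name `choose_svd_solver` is hypothetical, not part of this commit):

```python
# Minimal sketch of the documented 'auto' policy; choose_svd_solver is a
# hypothetical helper, not part of the commit.
def choose_svd_solver(n_components, n_samples, n_features):
    # 'full' when many components are requested, 'randomized' otherwise
    if n_components >= .8 * min(n_samples, n_features):
        return 'full'
    return 'randomized'

print(choose_svd_solver(2, 6, 2))        # 'full'       (2 >= 0.8 * 2)
print(choose_svd_solver(10, 1000, 500))  # 'randomized' (10 < 0.8 * 500)
```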
```diff
@@ -185,9 +181,10 @@ class PCA(_BasePCA):
         Per-feature empirical mean, estimated from the training set.
 
     n_components_ : int
-        The estimated number of components. Relevant when n_components is set
-        to 'mle' or a number between 0 and 1 to select using explained
-        variance.
+        The estimated number of components. When n_components is set
+        to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this
+        number is estimated from input data. Otherwise it equals the parameter
+        n_components, or n_features if n_components is None.
 
     noise_variance_ : float
         The estimated noise covariance following the Probabilistic PCA model
```
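The clarified `n_components_` semantics are easy to observe after fitting. A hedged illustration of the behaviour described by the new docstring text:

```python
# Hedged illustration of the n_components_ semantics described above.
import numpy as np
from sklearn.decomposition import PCA

X = np.array([[-1., -1.], [-2., -1.], [-3., -2.],
              [1., 1.], [2., 1.], [3., 2.]])

pca = PCA(svd_solver='full').fit(X)      # n_components is None
print(pca.n_components_)                 # 2, i.e. n_features

pca = PCA(n_components=0.99, svd_solver='full').fit(X)
print(pca.n_components_)                 # estimated from explained variance
                                         # (1 on this toy X: one component
                                         # already explains >99% of variance)
```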
```diff
@@ -198,7 +195,7 @@ class PCA(_BasePCA):
 
     References
     -----
-    For n_components='mle', this class uses the method of `Thomas P. Minka:
+    For n_components == 'mle', this class uses the method of `Thomas P. Minka:
     Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604`
 
     Implements the probabilistic PCA model from:
```
```diff
@@ -207,19 +204,21 @@ class PCA(_BasePCA):
     via the score and score_samples methods.
     See http://www.miketipping.com/papers/met-mppca.pdf
 
-    Due to implementation subtleties of the Singular Value Decomposition (SVD),
-    which is used in this implementation, running fit twice on the same matrix
-    can lead to principal components with signs flipped (change in direction).
-    For this reason, it is important to always use the same estimator object to
-    transform data in a consistent fashion.
+    For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`.
 
-    [Halko2009] `Finding structure with randomness: Stochastic algorithms
+    For svd_solver == 'randomized', see:
+    `Finding structure with randomness: Stochastic algorithms
     for constructing approximate matrix decompositions Halko, et al., 2009
     (arXiv:909)`
-
-    [MRT] `A randomized algorithm for the decomposition of matrices
+    `A randomized algorithm for the decomposition of matrices
     Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert`
 
+    Due to implementation subtleties of the Singular Value Decomposition (SVD),
+    which is used in this implementation, running it twice on the same matrix
+    can lead to principal components with signs flipped (change in direction).
+    For this reason, it is important to always use the same estimator object to
+    transform data in a consistent fashion.
+
     Examples
     --------
     >>> import numpy as np
```
```diff
@@ -232,19 +231,26 @@ class PCA(_BasePCA):
     >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
     [ 0.99244... 0.00755...]
 
-
-    >>> pca = PCA(n_components=2, svd_solver='randomized')
+    >>> pca = PCA(n_components=2, svd_solver='full')
     >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     PCA(copy=True, iterated_power=3, n_components=2, random_state=None,
-      svd_solver='randomized', tol=0.0, whiten=False)
+      svd_solver='full', tol=0.0, whiten=False)
     >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
     [ 0.99244... 0.00755...]
 
+    >>> pca = PCA(n_components=1, svd_solver='arpack')
+    >>> pca.fit(X)
+    PCA(copy=True, iterated_power=3, n_components=1, random_state=None,
+      svd_solver='arpack', tol=0.0, whiten=False)
+    >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
+    [ 0.99244...]
+
     See also
     --------
     KernelPCA
     SparsePCA
     TruncatedSVD
+    IncrementalPCA
     """
 
     def __init__(self, n_components=None, copy=True, whiten=False,
```
```diff
@@ -302,55 +308,45 @@ def fit_transform(self, X, y=None):
         return U
 
     def _fit(self, X):
-        """Fit the model on X
-
-        Parameters
-        ----------
-        X: array-like, shape (n_samples, n_features)
-            Training vector, where n_samples in the number of samples and
-            n_features is the number of features.
-
-        Returns
-        -------
-        U, s, V : ndarrays
-            The SVD of the input data, copied and centered when
-            requested.
+        """Dispatch the actual fitting to _fit_full and _fit_truncated, after
+        handling the svd_solver='auto' policy.
         """
         X = check_array(X, dtype=[np.float32, np.float64], ensure_2d=True,
                         copy=self.copy)
-        self.n_samples_, self.n_features_ = X.shape
 
-        # Handle n_components
+        # Handle n_components==None
         if self.n_components is None:
-            self.n_components_ = self.n_features_
+            n_components = X.shape[1]
         else:
-            self.n_components_ = self.n_components
+            n_components = self.n_components
 
         # Handle svd_solver
         svd_solver = self.svd_solver
         if svd_solver == 'auto':
-            if self.n_components_ < .8 * min(X.shape):
+            if n_components < .8 * min(X.shape):
                 svd_solver = 'randomized'
             else:
                 svd_solver = 'full'
 
         # Call different fits, whether we compute full or truncated SVD
         if svd_solver == 'full':
-            return self._fit_full(X)
+            return self._fit_full(X, n_components)
         elif svd_solver in ['arpack', 'randomized']:
-            return self._fit_truncated(X, svd_solver)
+            return self._fit_truncated(X, n_components, svd_solver)
 
-    def _fit_full(self, X):
+    def _fit_full(self, X, n_components):
         """Fit the model by computing full SVD on X
         """
-        n_samples, n_features = self.n_samples_, self.n_features_
-        n_components = self.n_components_
+        n_samples, n_features = X.shape
+
         if n_components == 'mle':
             if n_samples < n_features:
                 raise ValueError("n_components='mle' is only supported "
                                  "if n_samples >= n_features")
         elif not 0 <= n_components <= n_features:
-            raise ValueError("n_components=%r invalid. See the documentation")
+            raise ValueError("n_components=%r must be between 0 and "
+                             "n_features=%r with svd_solver='full'"
+                             % (n_components, n_features))
 
         # Center data
         self.mean_ = np.mean(X, axis=0)
```
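As the validation above makes explicit, 'mle' is only meaningful on the full-SVD path and requires `n_samples >= n_features`. A hedged usage sketch:

```python
# 'mle' dimensionality selection on the full-SVD path (hedged sketch;
# requires n_samples >= n_features per the check above).
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(50, 5)
pca = PCA(n_components='mle', svd_solver='full').fit(X)
print(pca.n_components_)   # some k <= 5, chosen by Minka's MLE criterion
```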
```diff
@@ -381,6 +377,7 @@ def _fit_full(self, X):
         else:
             self.noise_variance_ = 0.
 
+        self.n_samples_, self.n_features_ = n_samples, n_features
         self.components_ = components_[:n_components]
         self.n_components_ = n_components
         self.explained_variance_ = explained_variance_[:n_components]
```
```diff
@@ -389,18 +386,24 @@ def _fit_full(self, X):
 
         return U, S, V
 
-    def _fit_truncated(self, X, svd_solver):
+    def _fit_truncated(self, X, n_components, svd_solver):
         """Fit the model by computing truncated SVD (by Arpack or randomized)
         on X
         """
-        n_samples, n_features = self.n_samples_, self.n_features_
-        n_components = self.n_components_
-        if not 1 <= n_components <= n_features:
-            raise ValueError("n_components=%r invalid for svd_solver='%s'"
+        n_samples, n_features = X.shape
+
+        if type(n_components) == str:
+            raise ValueError("n_components=%r cannot be a string "
+                             "with svd_solver='%s'"
                              % (n_components, svd_solver))
+        elif not 1 <= n_components <= n_features:
+            raise ValueError("n_components=%r must be between 1 and "
+                             "n_features=%r with svd_solver='%s'"
+                             % (n_components, n_features, svd_solver))
         elif svd_solver == 'arpack' and n_components == n_features:
-            raise ValueError("n_components=%r invalid for svd_solver='%s'"
-                             % (n_components, svd_solver))
+            raise ValueError("n_components=%r must be strictly less than "
+                             "n_features=%r with svd_solver='%s'"
+                             % (n_components, n_features, svd_solver))
 
         # Center data
         self.mean_ = np.mean(X, axis=0)
```
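The reworded errors are straightforward to trigger; a hedged sketch exercising the checks above (exact messages follow the format strings in the diff):

```python
# Exercising the new validation in _fit_truncated (hedged sketch).
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(10, 4)

try:
    PCA(n_components='mle', svd_solver='arpack').fit(X)
except ValueError as e:
    print(e)   # ... cannot be a string with svd_solver='arpack'

try:
    PCA(n_components=4, svd_solver='arpack').fit(X)  # == n_features
except ValueError as e:
    print(e)   # ... must be strictly less than n_features ...
```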
```diff
@@ -418,7 +421,9 @@ def _fit_truncated(self, X, svd_solver):
                                 n_iter=self.iterated_power,
                                 random_state=random_state)
 
+        self.n_samples_, self.n_features_ = n_samples, n_features
         self.components_ = V
+        self.n_components_ = n_components
 
         # Get variance explained by singular values
         self.explained_variance_ = (S ** 2) / n_samples
```
```diff
@@ -433,37 +438,6 @@ def _fit_truncated(self, X, svd_solver):
 
         return U, S, V
 
-    def get_precision(self):
-        """Compute data precision matrix with the generative model.
-
-        Equals the inverse of the covariance but computed with
-        the matrix inversion lemma for efficiency.
-
-        Returns
-        -------
-        precision : array, shape=(n_features, n_features)
-            Estimated precision of data.
-        """
-        n_features = self.n_features_
-
-        # handle corner cases first
-        if self.n_components_ == 0:
-            return np.eye(n_features) / self.noise_variance_
-        if self.n_components_ == n_features:
-            return linalg.inv(self.get_covariance())
-
-        # Get precision using matrix inversion lemma
-        components_ = self.components_
-        exp_var = self.explained_variance_
-        exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.)
-        precision = np.dot(components_, components_.T) / self.noise_variance_
-        precision.flat[::len(precision) + 1] += 1. / exp_var_diff
-        precision = np.dot(components_.T,
-                           np.dot(linalg.inv(precision), components_))
-        precision /= -(self.noise_variance_ ** 2)
-        precision.flat[::len(precision) + 1] += 1. / self.noise_variance_
-        return precision
-
     def score_samples(self, X):
         """Return the log-likelihood of each sample
 
```
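The `get_precision` override is dropped here; presumably the shared matrix-inversion-lemma implementation now lives on `_BasePCA` (this diff alone does not show where it moved). Wherever it lives, the identity it computes is easy to sanity-check, assuming `get_precision` and `get_covariance` remain available on fitted `PCA` objects:

```python
# Sanity check: the precision from the matrix inversion lemma should equal
# the direct inverse of the model covariance (assumes get_precision is
# still reachable on PCA, e.g. inherited from _BasePCA).
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(100, 3)
pca = PCA(n_components=2, svd_solver='full').fit(X)

assert np.allclose(pca.get_precision(),
                   np.linalg.inv(pca.get_covariance()))
```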
```diff
@@ -514,7 +488,8 @@ def score(self, X, y=None):
 
 
 @deprecated("it will be removed in 0.19. Use PCA(svd_solver='randomized') "
-            "instead ")
+            "instead. The new implementation DOES NOT store "
+            "whitened components_. Apply transform to get them.")
 def RandomizedPCA(n_components=None, copy=True, iterated_power=3,
                   whiten=False, random_state=None):
     return PCA(n_components=n_components, copy=copy, whiten=whiten,
```
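The deprecation message implies a one-line migration. A hedged sketch, with `transform` standing in for the whitened `components_` the old class used to store:

```python
# Migration implied by the deprecation above: the new solver does not bake
# whitening into components_, so apply transform to get whitened output.
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(20, 5)

# Before: RandomizedPCA(n_components=2, whiten=True).fit(X)
pca = PCA(n_components=2, svd_solver='randomized', whiten=True).fit(X)
X_whitened = pca.transform(X)   # whitening applied here
```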
