@@ -103,15 +103,10 @@ class PCA(_BasePCA):
     """Principal component analysis (PCA)

     Linear dimensionality reduction using Singular Value Decomposition of the
-    data and keeping only the most significant singular vectors to project the
-    data to a lower dimensional space.
+    data to project it to a lower dimensional space.

-    This implementation uses the scipy.linalg implementation of the singular
-    value decomposition. It only works for dense arrays and is not scalable to
-    large dimensional data.
-
-    The time complexity of this implementation is ``O(n ** 3)`` assuming
-    n ~ n_samples ~ n_features.
+    It uses the scipy.linalg implementation of the SVD or a randomized SVD
+    by the method of Halko et al. 2009, whichever is the most efficient.

     Read more in the :ref:`User Guide <PCA>`.

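Note for reviewers: both decomposition paths named in the new docstring already exist; a minimal standalone sketch (toy data, illustrative names) comparing them:

import numpy as np
from scipy import linalg
from sklearn.utils.extmath import randomized_svd

X = np.random.RandomState(0).randn(100, 5)
Xc = X - X.mean(axis=0)                       # PCA centers the data first

U, S, V = linalg.svd(Xc, full_matrices=False)          # exact, 'full' path
_, Sr, _ = randomized_svd(Xc, n_components=2,
                          n_iter=3, random_state=0)    # 'randomized' path (Halko et al.)

print(np.allclose(S[:2], Sr))  # leading singular values agree on this toy problem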
@@ -128,6 +123,7 @@ class PCA(_BasePCA):
         if ``0 < n_components < 1`` and svd_solver == 'full', select the number
         of components such that the amount of variance that needs to be
         explained is greater than the percentage specified by n_components
+        n_components cannot be equal to n_features for svd_solver == 'arpack'.

     copy : bool (default True)
         If False, data passed to fit are overwritten and running
@@ -146,22 +142,22 @@ class PCA(_BasePCA):

     svd_solver : string (default 'auto')
         The algorithm that runs SVD
-        if svd_solver == 'full', run exact SVD and select the components as
+        if svd_solver == 'full', run exact SVD and select the components by
         postprocessing
         if svd_solver == 'arpack', run SVD truncated to n_components calling
        `scipy.sparse.linalg.svds`. 0 < n_components < X.shape[1] (strictly)
         if svd_solver == 'randomized', run randomized SVD by the method of
         Halko et al.
-        if svd_solver == 'auto'
+        if svd_solver == 'auto':
            if n_components >= .8 * min(n_samples, n_features), run with 'full'
            otherwise 'randomized'

-    tol : float >= 0, optional (deaful .0)
-        Tolerance for singular values computed by svd_solver == 'arpack'.
+    tol : float >= 0, optional (default .0)
+        Tolerance for singular values computed by svd_solver == 'arpack'.

     iterated_power : int >= 0, optional (default 3)
-        Number of iterations for the power method computed by svd_solver ==
-        'randomized'.
+        Number of iterations for the power method computed by
+        svd_solver == 'randomized'.

     random_state : int or RandomState instance or None (default None)
         Pseudo Random Number generator seed control. If None, use the
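The 'auto' rule documented above can be restated as a standalone helper (a sketch mirroring the policy implemented in _fit below; the function name is illustrative):

def choose_solver(n_components, n_samples, n_features):
    # Large component counts go to the exact solver, small ones to the
    # randomized solver, per the docstring above.
    if n_components >= .8 * min(n_samples, n_features):
        return 'full'
    return 'randomized'

print(choose_solver(2, 100, 5))   # 'randomized', since 2 < 0.8 * 5
print(choose_solver(5, 100, 5))   # 'full', since 5 >= 0.8 * 5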
@@ -185,9 +181,10 @@ class PCA(_BasePCA):
         Per-feature empirical mean, estimated from the training set.

     n_components_ : int
-        The estimated number of components. Relevant when n_components is set
-        to 'mle' or a number between 0 and 1 to select using explained
-        variance.
+        The estimated number of components. When n_components is set
+        to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this
+        number is estimated from input data. Otherwise it equals the parameter
+        n_components, or n_features if n_components is None.

     noise_variance_ : float
         The estimated noise covariance following the Probabilistic PCA model
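To illustrate the reworded attribute, assuming this diff is applied (toy data; the count selected by a float n_components depends on the data):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(50, 10)

print(PCA(n_components=0.9, svd_solver='full').fit(X).n_components_)  # estimated from data
print(PCA(n_components=3).fit(X).n_components_)                       # 3, echoes the parameter
print(PCA().fit(X).n_components_)                                     # 10, i.e. n_features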
@@ -198,7 +195,7 @@ class PCA(_BasePCA):

     References
     -----
-    For n_components= 'mle', this class uses the method of `Thomas P. Minka:
+    For n_components == 'mle', this class uses the method of `Thomas P. Minka:
     Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604`

     Implements the probabilistic PCA model from:
@@ -207,19 +204,21 @@ class PCA(_BasePCA):
     via the score and score_samples methods.
     See http://www.miketipping.com/papers/met-mppca.pdf

-    Due to implementation subtleties of the Singular Value Decomposition (SVD),
-    which is used in this implementation, running fit twice on the same matrix
-    can lead to principal components with signs flipped (change in direction).
-    For this reason, it is important to always use the same estimator object to
-    transform data in a consistent fashion.
+    For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`.

-    [Halko2009] `Finding structure with randomness: Stochastic algorithms
+    For svd_solver == 'randomized', see:
+    `Finding structure with randomness: Stochastic algorithms
     for constructing approximate matrix decompositions Halko, et al., 2009
     (arXiv:909)`
-
-    [MRT] `A randomized algorithm for the decomposition of matrices
+    `A randomized algorithm for the decomposition of matrices
     Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert`

+    Due to implementation subtleties of the Singular Value Decomposition (SVD),
+    which is used in this implementation, running it twice on the same matrix
+    can lead to principal components with signs flipped (change in direction).
+    For this reason, it is important to always use the same estimator object to
+    transform data in a consistent fashion.
+
     Examples
     --------
     >>> import numpy as np
@@ -232,19 +231,26 @@ class PCA(_BasePCA):
     >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
     [ 0.99244...  0.00755...]

-
-    >>> pca = PCA(n_components=2, svd_solver='randomized')
+    >>> pca = PCA(n_components=2, svd_solver='full')
     >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     PCA(copy=True, iterated_power=3, n_components=2, random_state=None,
-      svd_solver='randomized', tol=0.0, whiten=False)
+      svd_solver='full', tol=0.0, whiten=False)
     >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
     [ 0.99244...  0.00755...]

+    >>> pca = PCA(n_components=1, svd_solver='arpack')
+    >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    PCA(copy=True, iterated_power=3, n_components=1, random_state=None,
+      svd_solver='arpack', tol=0.0, whiten=False)
+    >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
+    [ 0.99244...]
+
     See also
     --------
     KernelPCA
     SparsePCA
     TruncatedSVD
+    IncrementalPCA
     """

     def __init__(self, n_components=None, copy=True, whiten=False,
@@ -302,55 +308,45 @@ def fit_transform(self, X, y=None):
         return U

     def _fit(self, X):
-        """Fit the model on X
-
-        Parameters
-        ----------
-        X: array-like, shape (n_samples, n_features)
-            Training vector, where n_samples in the number of samples and
-            n_features is the number of features.
-
-        Returns
-        -------
-        U, s, V : ndarrays
-            The SVD of the input data, copied and centered when
-            requested.
+        """Dispatch the actual fitting to _fit_full or _fit_truncated, after
+        handling the svd_solver='auto' policy.
         """
         X = check_array(X, dtype=[np.float32, np.float64], ensure_2d=True,
                         copy=self.copy)
-        self.n_samples_, self.n_features_ = X.shape

-        # Handle n_components
+        # Handle n_components==None
         if self.n_components is None:
-            self.n_components_ = self.n_features_
+            n_components = X.shape[1]
         else:
-            self.n_components_ = self.n_components

         # Handle svd_solver
         svd_solver = self.svd_solver
         if svd_solver == 'auto':
-            if self.n_components_ < .8 * min(X.shape):
+            if n_components < .8 * min(X.shape):
                 svd_solver = 'randomized'
             else:
                 svd_solver = 'full'

         # Call different fits, whether we compute full or truncated SVD
         if svd_solver == 'full':
-            return self._fit_full(X)
+            return self._fit_full(X, n_components)
         elif svd_solver in ['arpack', 'randomized']:
-            return self._fit_truncated(X, svd_solver)
+            return self._fit_truncated(X, n_components, svd_solver)

-    def _fit_full(self, X):
+    def _fit_full(self, X, n_components):
         """Fit the model by computing full SVD on X
         """
-        n_samples, n_features = self.n_samples_, self.n_features_
-        n_components = self.n_components_
+        n_samples, n_features = X.shape
+
         if n_components == 'mle':
             if n_samples < n_features:
                 raise ValueError("n_components='mle' is only supported "
                                  "if n_samples >= n_features")
         elif not 0 <= n_components <= n_features:
-            raise ValueError("n_components=%r invalid. See the documentation")
+            raise ValueError("n_components=%r must be between 0 and "
+                             "n_features=%r with svd_solver='full'"
+                             % (n_components, n_features))

         # Center data
         self.mean_ = np.mean(X, axis=0)
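With the diff applied, the sharper 'full' validation message can be exercised directly (sketch):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(20, 5)
try:
    PCA(n_components=7, svd_solver='full').fit(X)
except ValueError as e:
    print(e)  # n_components=7 must be between 0 and n_features=5 with svd_solver='full'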
@@ -381,6 +377,7 @@ def _fit_full(self, X):
         else:
             self.noise_variance_ = 0.

+        self.n_samples_, self.n_features_ = n_samples, n_features
         self.components_ = components_[:n_components]
         self.n_components_ = n_components
         self.explained_variance_ = explained_variance_[:n_components]
@@ -389,18 +386,24 @@ def _fit_full(self, X):

         return U, S, V

-    def _fit_truncated(self, X, svd_solver):
+    def _fit_truncated(self, X, n_components, svd_solver):
         """Fit the model by computing truncated SVD (by Arpack or randomized)
         on X
         """
-        n_samples, n_features = self.n_samples_, self.n_features_
-        n_components = self.n_components_
-        if not 1 <= n_components <= n_features:
-            raise ValueError("n_components=%r invalid for svd_solver='%s'"
+        n_samples, n_features = X.shape
+
+        if type(n_components) == str:
+            raise ValueError("n_components=%r cannot be a string "
+                             "with svd_solver='%s'"
                              % (n_components, svd_solver))
+        elif not 1 <= n_components <= n_features:
+            raise ValueError("n_components=%r must be between 1 and "
+                             "n_features=%r with svd_solver='%s'"
+                             % (n_components, n_features, svd_solver))
         elif svd_solver == 'arpack' and n_components == n_features:
-            raise ValueError("n_components=%r invalid for svd_solver='%s'"
-                             % (n_components, svd_solver))
+            raise ValueError("n_components=%r must be strictly less than "
+                             "n_features=%r with svd_solver='%s'"
+                             % (n_components, n_features, svd_solver))

         # Center data
         self.mean_ = np.mean(X, axis=0)
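Likewise, the two new truncated-solver branches (sketch, assuming the diff is applied):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(20, 5)

for bad in (PCA(n_components='mle', svd_solver='arpack'),   # strings are rejected
            PCA(n_components=5, svd_solver='arpack')):      # must be < n_features
    try:
        bad.fit(X)
    except ValueError as e:
        print(e)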
@@ -418,7 +421,9 @@ def _fit_truncated(self, X, svd_solver):
                                        n_iter=self.iterated_power,
                                        random_state=random_state)

+        self.n_samples_, self.n_features_ = n_samples, n_features
         self.components_ = V
+        self.n_components_ = n_components

         # Get variance explained by singular values
         self.explained_variance_ = (S ** 2) / n_samples
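The bookkeeping above rests on the identity sum(S ** 2) / n_samples == total variance of the centered data; a standalone numpy sanity check:

import numpy as np
from scipy import linalg

X = np.random.RandomState(0).randn(100, 5)
Xc = X - X.mean(axis=0)
_, S, _ = linalg.svd(Xc, full_matrices=False)

explained_variance = (S ** 2) / 100  # what the fit stores
print(np.allclose(explained_variance.sum(), Xc.var(axis=0).sum()))  # True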
@@ -433,37 +438,6 @@ def _fit_truncated(self, X, svd_solver):

         return U, S, V

-    def get_precision(self):
-        """Compute data precision matrix with the generative model.
-
-        Equals the inverse of the covariance but computed with
-        the matrix inversion lemma for efficiency.
-
-        Returns
-        -------
-        precision : array, shape=(n_features, n_features)
-            Estimated precision of data.
-        """
-        n_features = self.n_features_
-
-        # handle corner cases first
-        if self.n_components_ == 0:
-            return np.eye(n_features) / self.noise_variance_
-        if self.n_components_ == n_features:
-            return linalg.inv(self.get_covariance())
-
-        # Get precision using matrix inversion lemma
-        components_ = self.components_
-        exp_var = self.explained_variance_
-        exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.)
-        precision = np.dot(components_, components_.T) / self.noise_variance_
-        precision.flat[::len(precision) + 1] += 1. / exp_var_diff
-        precision = np.dot(components_.T,
-                           np.dot(linalg.inv(precision), components_))
-        precision /= -(self.noise_variance_ ** 2)
-        precision.flat[::len(precision) + 1] += 1. / self.noise_variance_
-        return precision
-
     def score_samples(self, X):
         """Return the log-likelihood of each sample

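get_precision is deleted here, presumably because it now lives on the _BasePCA base class. For reviewers, a standalone numpy check that its matrix-inversion-lemma shortcut equals a direct inverse of the probabilistic PCA covariance (illustrative values):

import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
Q, _ = linalg.qr(rng.randn(5, 2), mode='economic')
W = Q.T                                 # components_: 2 orthonormal rows
exp_var = np.array([3., 2.])            # explained_variance_
sigma2 = 0.5                            # noise_variance_

# Covariance under the probabilistic PCA model
cov = W.T.dot(np.diag(exp_var - sigma2)).dot(W) + sigma2 * np.eye(5)

# Same steps as the removed method
small = W.dot(W.T) / sigma2
small.flat[::small.shape[0] + 1] += 1. / (exp_var - sigma2)
precision = W.T.dot(linalg.inv(small)).dot(W)
precision /= -(sigma2 ** 2)
precision.flat[::precision.shape[0] + 1] += 1. / sigma2

print(np.allclose(precision, linalg.inv(cov)))  # True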
@@ -514,7 +488,8 @@ def score(self, X, y=None):


 @deprecated("it will be removed in 0.19. Use PCA(svd_solver='randomized') "
-            "instead")
+            "instead. The new implementation DOES NOT store "
+            "whitened components_. Apply transform to get them.")
 def RandomizedPCA(n_components=None, copy=True, iterated_power=3,
                   whiten=False, random_state=None):
     return PCA(n_components=n_components, copy=copy, whiten=whiten,
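For downstream users, the migration the new deprecation message asks for (a sketch; whiten=True is where behavior changes, since whitened components are no longer stored):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(100, 5)

# Before: pca = RandomizedPCA(n_components=2, whiten=True).fit(X)
pca = PCA(n_components=2, svd_solver='randomized', whiten=True,
          random_state=0).fit(X)
X_whitened = pca.transform(X)  # whitening is applied in transform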