@@ -360,6 +360,153 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
360
360
return U [:, :n_components ], s [:n_components ], V [:n_components , :]
361
361
362
362
363
+ def _normalize_power_iteration (x , power_iteration_normalizer ):
364
+ """Normalize the matrix when doing power iterations for stability."""
365
+ if power_iteration_normalizer == "none" :
366
+ return x
367
+ elif power_iteration_normalizer == "LU" :
368
+ Q , _ = linalg .lu (x , permute_l = True )
369
+ return Q
370
+ elif power_iteration_normalizer == "QR" :
371
+ Q , _ = linalg .qr (x , mode = "economic" )
372
+ return Q
373
+ else :
374
+ raise ValueError ("Unrecognized normalization method `%s`" %
375
+ power_iteration_normalizer )
376
+
377
+
378
def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
                   power_iteration_normalizer="auto", flip_sign=True,
                   random_state=0):
    """Computes a truncated randomized PCA decomposition.

    Parameters
    ----------
    A : ndarray or sparse matrix
        Matrix to decompose.

    n_components : int
        Number of singular values and vectors to extract.

    n_oversamples : int (default is 10)
        Additional number of random vectors to sample the range of A so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of A is n_components + n_oversamples. A
        smaller number can improve speed but can negatively impact the
        quality of approximation of singular vectors and singular values.

    n_iter : int or 'auto' (default is 'auto')
        Number of power iterations. It can be used to deal with very noisy
        problems. When 'auto', it is set to 4, unless `n_components` is
        small (< .1 * min(A.shape)), in which case `n_iter` is set to 7.
        This improves precision with few components.

        .. versionchanged:: 0.18

    power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none'
        Whether the power iterations are normalized with step-by-step
        QR factorization (the slowest but most accurate), 'none'
        (the fastest but numerically unstable when `n_iter` is large, e.g.
        typically 5 or larger), or 'LU' factorization (numerically stable
        but can lose slightly in accuracy). The 'auto' mode applies no
        normalization if `n_iter` <= 2 and switches to LU otherwise.

        .. versionadded:: 0.18

    flip_sign : boolean, (True by default)
        The output of a singular value decomposition is only unique up to a
        permutation of the signs of the singular vectors. If `flip_sign` is
        set to `True`, the sign ambiguity is resolved by making the largest
        loadings for each component in the left singular vectors positive.

    random_state : int, RandomState instance or None, optional (default=0)
        The seed of the pseudo random number generator used to sample the
        random projection vectors. If int, random_state is the seed used by
        the random number generator; if RandomState instance, it is used
        directly; if None, a freshly seeded RandomState is used.

    Returns
    -------
    U, s, V : ndarray
        Truncated left singular vectors (n_samples x n_components),
        singular values (n_components,) and right singular vectors
        (n_components x n_features) of the centered data.

    Notes
    -----
    This algorithm finds a (usually very good) approximate truncated principal
    component analysis decomposition using randomized methods to speed up the
    computations. It is particulary useful on large, sparse matrices since this
    implementation doesn't require centering the original matrix (which would
    center and therefore densify potentially large sparse matrices, leading to
    memory issues). In order to obtain further speed up, `n_iter` can be set
    <=2 (at the cost of loss of precision).

    References
    ----------
    * Algorithm 971: An implementation of a randomized algorithm for principal
      component analysis
      Li, Huamin, et al. 2017

    """
    if n_iter == "auto":
        # 7 iterations was found to be a good compromise for PCA when few
        # components are requested. See sklearn #5299.
        n_iter = 7 if n_components < .1 * min(A.shape) else 4

    # Resolve the "auto" normalization mode before entering the loop.
    if power_iteration_normalizer == "auto":
        if n_iter <= 2:
            power_iteration_normalizer = "none"
        else:
            power_iteration_normalizer = "LU"

    # Bug fix: the docstring advertises int seeds and None (and the default
    # is the int 0), but the sampling below requires a RandomState instance;
    # previously the documented default crashed on `0.normal(...)`.
    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)

    n_samples, n_features = A.shape

    # Row vector (1 x n_features) of column means. Centering is applied
    # implicitly through the rank-one corrections in the products below,
    # so a sparse A is never densified.
    c = np.atleast_2d(A.mean(axis=0))

    if n_samples >= n_features:
        Q = random_state.normal(size=(n_features, n_components + n_oversamples))
        if A.dtype.kind == "f":
            # Ensure f32 is preserved as f32
            Q = Q.astype(A.dtype, copy=False)

        # Sample the range of the (implicitly centered) A.
        Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q = _normalize_power_iteration(Q, power_iteration_normalizer)
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q = _normalize_power_iteration(Q, power_iteration_normalizer)

        Q, _ = linalg.qr(Q, mode="economic")

        # Project the centered A onto the orthonormal basis and compute the
        # SVD of the small projected matrix.
        QA = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
        R, s, V = linalg.svd(QA.T, full_matrices=False)
        U = Q.dot(R)

    else:  # n_features > n_samples
        Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
        if A.dtype.kind == "f":
            # Ensure f32 is preserved as f32
            Q = Q.astype(A.dtype, copy=False)

        # Sample the range of the (implicitly centered) A.T.
        Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q = _normalize_power_iteration(Q, power_iteration_normalizer)
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q = _normalize_power_iteration(Q, power_iteration_normalizer)

        Q, _ = linalg.qr(Q, mode="economic")

        QA = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
        U, s, R = linalg.svd(QA, full_matrices=False)
        V = R.dot(Q.T)

    if flip_sign:
        # Resolve the sign ambiguity of the decomposition deterministically.
        U, V = svd_flip(U, V)

    return U[:, :n_components], s[:n_components], V[:n_components, :]
508
+
509
+
363
510
def weighted_mode (a , w , axis = 0 ):
364
511
"""Returns an array of the weighted modal (most common) value in a
365
512
0 commit comments