@@ -147,6 +147,7 @@ def safe_sparse_dot(a, b, dense_output=False):
147
147
148
148
def randomized_range_finder (A , size , n_iter ,
149
149
power_iteration_normalizer = 'auto' ,
150
+ subtract_mean = False ,
150
151
random_state = None ):
151
152
"""Computes an orthonormal matrix whose range approximates the range of A.
152
153
@@ -171,6 +172,13 @@ def randomized_range_finder(A, size, n_iter,
171
172
172
173
.. versionadded:: 0.18
173
174
175
+ subtract_mean : bool
176
+ Whether the mean of `A` should be subtracted after each multiplication
177
+ by the `A` matrix. This is equivalent to multiplying matrices by a
178
+ centered `A` without ever having to explicitly center. This is
179
+ especially useful for performing PCA on large sparse matrices, so they
180
+ do not need to be centered.
181
+
174
182
random_state : int, RandomState instance or None, optional (default=None)
175
183
The seed of the pseudo random number generator to use when shuffling
176
184
the data. If int, random_state is the seed used by the random number
@@ -211,28 +219,39 @@ def randomized_range_finder(A, size, n_iter,
211
219
else :
212
220
power_iteration_normalizer = 'LU'
213
221
222
+ if subtract_mean :
223
+ c = np .mean (A , axis = 0 ).reshape ((1 , - 1 ))
224
+ applyA = lambda X : safe_sparse_dot (A , X ) - safe_sparse_dot (c , X )
225
+ applyAT = lambda X : safe_sparse_dot (A .T , X ) - \
226
+ safe_sparse_dot (c .T , Q .sum (axis = 0 ).reshape ((1 , - 1 )))
227
+ else :
228
+ applyA = lambda X : safe_sparse_dot (A , X )
229
+ applyAT = lambda X : safe_sparse_dot (A .T , X )
230
+
231
+ Q = applyA (Q )
232
+
214
233
# Perform power iterations with Q to further 'imprint' the top
215
234
# singular vectors of A in Q
216
235
for i in range (n_iter ):
217
236
if power_iteration_normalizer == 'none' :
218
- Q = safe_sparse_dot ( A , Q )
219
- Q = safe_sparse_dot ( A . T , Q )
237
+ Q = applyAT ( Q )
238
+ Q = applyA ( Q )
220
239
elif power_iteration_normalizer == 'LU' :
221
- Q , _ = linalg .lu (safe_sparse_dot ( A , Q ), permute_l = True )
222
- Q , _ = linalg .lu (safe_sparse_dot ( A . T , Q ), permute_l = True )
240
+ Q , _ = linalg .lu (applyAT ( Q ), permute_l = True )
241
+ Q , _ = linalg .lu (applyA ( Q ), permute_l = True )
223
242
elif power_iteration_normalizer == 'QR' :
224
- Q , _ = linalg .qr (safe_sparse_dot ( A , Q ), mode = 'economic' )
225
- Q , _ = linalg .qr (safe_sparse_dot ( A . T , Q ), mode = 'economic' )
243
+ Q , _ = linalg .qr (applyAT ( Q ), mode = 'economic' )
244
+ Q , _ = linalg .qr (applyA ( Q ), mode = 'economic' )
226
245
227
246
# Sample the range of A using by linear projection of Q
228
247
# Extract an orthonormal basis
229
- Q , _ = linalg .qr (safe_sparse_dot ( A , Q ) , mode = 'economic' )
248
+ Q , _ = linalg .qr (Q , mode = 'economic' )
230
249
return Q
231
250
232
251
233
252
def randomized_svd (M , n_components , n_oversamples = 10 , n_iter = 'auto' ,
234
253
power_iteration_normalizer = 'auto' , transpose = 'auto' ,
235
- flip_sign = True , random_state = 0 ):
254
+ flip_sign = True , subtract_mean = False , random_state = 0 ):
236
255
"""Computes a truncated randomized SVD
237
256
238
257
Parameters
@@ -283,6 +302,13 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
283
302
set to `True`, the sign ambiguity is resolved by making the largest
284
303
loadings for each component in the left singular vectors positive.
285
304
305
+ subtract_mean : bool
306
+ Whether the mean of `A` should be subtracted after each multiplication
307
+ by the `A` matrix. This is equivalent to multiplying matrices by a
308
+ centered `A` without ever having to explicitly center. This is
309
+ especially useful for performing PCA on large sparse matrices, so they
310
+ do not need to be centered.
311
+
286
312
random_state : int, RandomState instance or None, optional (default=None)
287
313
The seed of the pseudo random number generator to use when shuffling
288
314
the data. If int, random_state is the seed used by the random number
@@ -333,11 +359,20 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
333
359
# this implementation is a bit faster with smaller shape[1]
334
360
M = M .T
335
361
336
- Q = randomized_range_finder (M , n_random , n_iter ,
337
- power_iteration_normalizer , random_state )
362
+ Q = randomized_range_finder (
363
+ M ,
364
+ size = n_random ,
365
+ n_iter = n_iter ,
366
+ power_iteration_normalizer = power_iteration_normalizer ,
367
+ subtract_mean = subtract_mean ,
368
+ random_state = random_state ,
369
+ )
338
370
339
371
# project M to the (k + p) dimensional space using the basis vectors
340
372
B = safe_sparse_dot (Q .T , M )
373
+ if subtract_mean :
374
+ c = M .mean (axis = 0 ).reshape ((1 , - 1 ))
375
+ B -= np .dot (c .T , Q .sum (axis = 0 ).reshape ((1 , - 1 ))).T
341
376
342
377
# compute the SVD on the thin matrix: (k + p) wide
343
378
Uhat , s , V = linalg .svd (B , full_matrices = False )
0 commit comments