@@ -5,7 +5,7 @@
 # License: BSD 3 clause

 import numpy as np
-from scipy import linalg
+from scipy import linalg, sparse

 from .base import _BasePCA
 from ..utils import check_array, gen_batches
@@ -21,11 +21,13 @@ class IncrementalPCA(_BasePCA):
     but not scaled for each feature before applying the SVD.

     Depending on the size of the input data, this algorithm can be much more
-    memory efficient than a PCA.
+    memory efficient than a PCA, and allows sparse input.

     This algorithm has constant memory complexity, on the order
-    of ``batch_size``, enabling use of np.memmap files without loading the
-    entire file into memory.
+    of ``batch_size * n_features``, enabling use of np.memmap files without
+    loading the entire file into memory. For sparse matrices, the input
+    is converted to dense in batches (in order to be able to subtract the
+    mean) which avoids storing the entire dense matrix at any one time.

     The computational overhead of each SVD is
     ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples
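A minimal sketch of the batching idea this docstring describes (the `sparse.random` data and the loop below are illustrative, not part of the diff): only one dense batch of shape `(batch_size, n_features)` exists at a time, which keeps memory constant while still allowing the mean subtraction that sparse formats cannot express.

```python
import numpy as np
from scipy import sparse
from sklearn.utils import gen_batches

# Illustrative sparse input; any CSR/CSC/LIL matrix would do.
X = sparse.random(10000, 50, density=0.01, format='csr', random_state=0)
batch_size = 200

for batch in gen_batches(X.shape[0], batch_size):
    X_dense = X[batch].toarray()     # densify only this batch
    X_dense -= X_dense.mean(axis=0)  # mean subtraction requires dense data
    # ...feed X_dense to the incremental SVD update...
```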
@@ -104,13 +106,15 @@ class IncrementalPCA(_BasePCA):
     --------
     >>> from sklearn.datasets import load_digits
     >>> from sklearn.decomposition import IncrementalPCA
+    >>> from scipy import sparse
     >>> X, _ = load_digits(return_X_y=True)
     >>> transformer = IncrementalPCA(n_components=7, batch_size=200)
     >>> # either partially fit on smaller batches of data
     >>> transformer.partial_fit(X[:100, :])
     IncrementalPCA(batch_size=200, n_components=7)
     >>> # or let the fit function itself divide the data into batches
-    >>> X_transformed = transformer.fit_transform(X)
+    >>> X_sparse = sparse.csr_matrix(X)
+    >>> X_transformed = transformer.fit_transform(X_sparse)
     >>> X_transformed.shape
     (1797, 7)

@@ -167,7 +171,7 @@ def fit(self, X, y=None):

         Parameters
         ----------
-        X : array-like, shape (n_samples, n_features)
+        X : array-like or sparse matrix, shape (n_samples, n_features)
             Training data, where n_samples is the number of samples and
             n_features is the number of features.

@@ -188,7 +192,8 @@ def fit(self, X, y=None):
         self.singular_values_ = None
         self.noise_variance_ = None

-        X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
+        X = check_array(X, accept_sparse=['csr', 'csc', 'lil'],
+                        copy=self.copy, dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape

         if self.batch_size is None:
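For reference, `check_array` with a list-valued `accept_sparse` passes the listed formats through and converts any other sparse format to the first entry of the list. A quick illustration of that behavior (the `coo_matrix` input is just an example):

```python
import numpy as np
from scipy import sparse
from sklearn.utils import check_array

X_coo = sparse.coo_matrix(np.eye(3))
X_checked = check_array(X_coo, accept_sparse=['csr', 'csc', 'lil'],
                        dtype=[np.float64, np.float32])
print(X_checked.format)  # 'csr': COO is converted to the first accepted format
```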
@@ -198,7 +203,10 @@ def fit(self, X, y=None):

         for batch in gen_batches(n_samples, self.batch_size_,
                                  min_batch_size=self.n_components or 0):
-            self.partial_fit(X[batch], check_input=False)
+            X_batch = X[batch]
+            if sparse.issparse(X_batch):
+                X_batch = X_batch.toarray()
+            self.partial_fit(X_batch, check_input=False)

         return self

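The `min_batch_size=self.n_components or 0` argument above is what guarantees that every batch, including the last one, has at least `n_components` rows, as the SVD update requires. A quick illustration of that `gen_batches` behavior:

```python
from sklearn.utils import gen_batches

# With 10 samples and batch_size=3, the 1-sample remainder is folded into
# the previous batch rather than emitted as an undersized final batch.
print(list(gen_batches(10, 3, min_batch_size=2)))
# [slice(0, 3, None), slice(3, 6, None), slice(6, 10, None)]
```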
@@ -221,6 +229,11 @@ def partial_fit(self, X, y=None, check_input=True):
             Returns the instance itself.
         """
         if check_input:
+            if sparse.issparse(X):
+                raise TypeError(
+                    "IncrementalPCA.partial_fit does not support "
+                    "sparse input. Either convert data to dense "
+                    "or use IncrementalPCA.fit to do so in batches.")
             X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape
         if not hasattr(self, 'components_'):
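A sketch of the resulting behavior (the toy `np.eye` data is illustrative): sparse input to `partial_fit` now fails fast instead of crashing later in an operation that assumes dense arrays.

```python
import numpy as np
from scipy import sparse
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=2)
X_sparse = sparse.csr_matrix(np.eye(4))
try:
    ipca.partial_fit(X_sparse)
except TypeError as exc:
    print(exc)  # points the user at IncrementalPCA.fit for batched conversion
```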
@@ -274,7 +287,7 @@ def partial_fit(self, X, y=None, check_input=True):
                 np.sqrt((self.n_samples_seen_ * n_samples) /
                         n_total_samples) * (self.mean_ - col_batch_mean)
             X = np.vstack((self.singular_values_.reshape((-1, 1)) *
-                          self.components_, X, mean_correction))
+                           self.components_, X, mean_correction))

         U, S, V = linalg.svd(X, full_matrices=False)
         U, V = svd_flip(U, V, u_based_decision=False)
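For context on the (unchanged) logic around this whitespace fix: the `mean_correction` row exists because stacking two mean-centered blocks loses the cross term between their means. A small numerical check of that identity, on toy data not taken from the diff:

```python
import numpy as np

rng = np.random.RandomState(0)
A, B = rng.randn(30, 5), rng.randn(20, 5)  # "seen" block and new batch
n_a, n_b = len(A), len(B)

def scatter(Z):
    Zc = Z - Z.mean(axis=0)
    return Zc.T @ Zc

# Combined centered scatter = per-block scatters + rank-one mean correction.
corr = np.sqrt(n_a * n_b / (n_a + n_b)) * (A.mean(axis=0) - B.mean(axis=0))
lhs = scatter(np.vstack((A, B)))
rhs = scatter(A) + scatter(B) + np.outer(corr, corr)
print(np.allclose(lhs, rhs))  # True
```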
@@ -295,3 +308,42 @@ def partial_fit(self, X, y=None, check_input=True):
         else:
             self.noise_variance_ = 0.
         return self
+
+    def transform(self, X):
+        """Apply dimensionality reduction to X.
+
+        X is projected on the first principal components previously extracted
+        from a training set, using minibatches of size batch_size if X is
+        sparse.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            New data, where n_samples is the number of samples
+            and n_features is the number of features.
+
+        Returns
+        -------
+        X_new : array-like, shape (n_samples, n_components)
+
+        Examples
+        --------
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import IncrementalPCA
+        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2],
+        ...               [1, 1], [2, 1], [3, 2]])
+        >>> ipca = IncrementalPCA(n_components=2, batch_size=3)
+        >>> ipca.fit(X)
+        IncrementalPCA(batch_size=3, n_components=2)
+        >>> ipca.transform(X) # doctest: +SKIP
+        """
+        if sparse.issparse(X):
+            n_samples = X.shape[0]
+            output = []
+            for batch in gen_batches(n_samples, self.batch_size_,
+                                     min_batch_size=self.n_components or 0):
+                output.append(super().transform(X[batch].toarray()))
+            return np.vstack(output)
+        else:
+            return super().transform(X)
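A hedged usage sketch of the new sparse path in `transform` (it mirrors the class doctest; the output shape assumes `load_digits` data): the projection runs over dense minibatches, so a fully dense copy of `X` is never materialized.

```python
from scipy import sparse
from sklearn.datasets import load_digits
from sklearn.decomposition import IncrementalPCA

X, _ = load_digits(return_X_y=True)
X_sparse = sparse.csr_matrix(X)

ipca = IncrementalPCA(n_components=7, batch_size=200).fit(X_sparse)
X_new = ipca.transform(X_sparse)  # each batch is densified with .toarray()
print(X_new.shape)                # (1797, 7)
```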