ENH allow sparse input to incremental PCA (#13960) · scikit-learn/scikit-learn@df7dd83 · GitHub

Commit df7dd83

scottgigante authored and jnothman committed
ENH allow sparse input to incremental PCA (#13960)
1 parent e2a69b7 commit df7dd83

File tree

4 files changed (+113, -15 lines)


doc/modules/decomposition.rst

Lines changed: 2 additions & 1 deletion
@@ -74,7 +74,8 @@ out-of-core Principal Component Analysis either by:

 * Using its ``partial_fit`` method on chunks of data fetched sequentially
   from the local hard drive or a network database.

-* Calling its fit method on a memory mapped file using ``numpy.memmap``.
+* Calling its fit method on a sparse matrix or a memory mapped file using
+  ``numpy.memmap``.

 :class:`IncrementalPCA` only stores estimates of component and noise variances,
 in order to update ``explained_variance_ratio_`` incrementally. This is why
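The two out-of-core options described above can be sketched as follows. This is a minimal illustration, not part of the commit: the array shape, the chunk count, and the "data.npy" filename are placeholders.

import numpy as np
from sklearn.decomposition import IncrementalPCA

# Stand-in for a large on-disk array; in real use the chunks would be
# fetched from disk or a database rather than split from memory.
rng = np.random.RandomState(0)
np.save("data.npy", rng.rand(1000, 20))

# Option 1: feed chunks to partial_fit as they arrive.
ipca = IncrementalPCA(n_components=5)
for chunk in np.array_split(np.load("data.npy"), 10):
    ipca.partial_fit(chunk)

# Option 2: call fit on a memory-mapped file; batches are read lazily,
# so the full array is never loaded into memory at once.
X_mm = np.load("data.npy", mmap_mode="r")
ipca = IncrementalPCA(n_components=5, batch_size=100).fit(X_mm)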

doc/whats_new/v0.22.rst

Lines changed: 8 additions & 0 deletions
@@ -39,6 +39,14 @@ Changelog
    :pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
    where 123456 is the *pull request* number, not the issue number.

+:mod:`sklearn.decomposition`
+............................
+
+- |Enhancement| :class:`decomposition.IncrementalPCA` now accepts sparse
+  matrices as input, converting them to dense in batches, thereby avoiding
+  the need to store the entire dense matrix at once.
+  :pr:`13960` by :user:`Scott Gigante <scottgigante>`.
+
 :mod:`sklearn.ensemble`
 .......................
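As a rough usage sketch of the behavior this changelog entry describes (the shapes and density below are arbitrary, chosen only to keep the example small):

import numpy as np
from scipy import sparse
from sklearn.decomposition import IncrementalPCA

# Arbitrary sparse input; fit densifies only one batch at a time.
X = sparse.random(1000, 50, density=0.01, format='csr', random_state=0)
ipca = IncrementalPCA(n_components=10, batch_size=100)
X_reduced = ipca.fit_transform(X)  # X.toarray() is never built in full
print(X_reduced.shape)             # (1000, 10)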

sklearn/decomposition/incremental_pca.py

Lines changed: 61 additions & 9 deletions
@@ -5,7 +5,7 @@
 # License: BSD 3 clause

 import numpy as np
-from scipy import linalg
+from scipy import linalg, sparse

 from .base import _BasePCA
 from ..utils import check_array, gen_batches
@@ -21,11 +21,13 @@ class IncrementalPCA(_BasePCA):
     but not scaled for each feature before applying the SVD.

     Depending on the size of the input data, this algorithm can be much more
-    memory efficient than a PCA.
+    memory efficient than a PCA, and allows sparse input.

     This algorithm has constant memory complexity, on the order
-    of ``batch_size``, enabling use of np.memmap files without loading the
-    entire file into memory.
+    of ``batch_size * n_features``, enabling use of np.memmap files without
+    loading the entire file into memory. For sparse matrices, the input
+    is converted to dense in batches (in order to be able to subtract the
+    mean) which avoids storing the entire dense matrix at any one time.

     The computational overhead of each SVD is
     ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples
@@ -104,13 +106,15 @@ class IncrementalPCA(_BasePCA):
     --------
     >>> from sklearn.datasets import load_digits
     >>> from sklearn.decomposition import IncrementalPCA
+    >>> from scipy import sparse
     >>> X, _ = load_digits(return_X_y=True)
     >>> transformer = IncrementalPCA(n_components=7, batch_size=200)
     >>> # either partially fit on smaller batches of data
     >>> transformer.partial_fit(X[:100, :])
     IncrementalPCA(batch_size=200, n_components=7)
     >>> # or let the fit function itself divide the data into batches
-    >>> X_transformed = transformer.fit_transform(X)
+    >>> X_sparse = sparse.csr_matrix(X)
+    >>> X_transformed = transformer.fit_transform(X_sparse)
     >>> X_transformed.shape
     (1797, 7)

@@ -167,7 +171,7 @@ def fit(self, X, y=None):

         Parameters
         ----------
-        X : array-like, shape (n_samples, n_features)
+        X : array-like or sparse matrix, shape (n_samples, n_features)
             Training data, where n_samples is the number of samples and
             n_features is the number of features.

@@ -188,7 +192,8 @@ def fit(self, X, y=None):
         self.singular_values_ = None
         self.noise_variance_ = None

-        X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
+        X = check_array(X, accept_sparse=['csr', 'csc', 'lil'],
+                        copy=self.copy, dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape

         if self.batch_size is None:
@@ -198,7 +203,10 @@ def fit(self, X, y=None):

         for batch in gen_batches(n_samples, self.batch_size_,
                                  min_batch_size=self.n_components or 0):
-            self.partial_fit(X[batch], check_input=False)
+            X_batch = X[batch]
+            if sparse.issparse(X_batch):
+                X_batch = X_batch.toarray()
+            self.partial_fit(X_batch, check_input=False)

         return self

@@ -221,6 +229,11 @@ def partial_fit(self, X, y=None, check_input=True):
             Returns the instance itself.
         """
         if check_input:
+            if sparse.issparse(X):
+                raise TypeError(
+                    "IncrementalPCA.partial_fit does not support "
+                    "sparse input. Either convert data to dense "
+                    "or use IncrementalPCA.fit to do so in batches.")
             X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape
         if not hasattr(self, 'components_'):
@@ -274,7 +287,7 @@ def partial_fit(self, X, y=None, check_input=True):
                 np.sqrt((self.n_samples_seen_ * n_samples) /
                         n_total_samples) * (self.mean_ - col_batch_mean)
             X = np.vstack((self.singular_values_.reshape((-1, 1)) *
-                          self.components_, X, mean_correction))
+                           self.components_, X, mean_correction))

         U, S, V = linalg.svd(X, full_matrices=False)
         U, V = svd_flip(U, V, u_based_decision=False)
@@ -295,3 +308,42 @@ def partial_fit(self, X, y=None, check_input=True):
         else:
             self.noise_variance_ = 0.
         return self
+
+    def transform(self, X):
+        """Apply dimensionality reduction to X.
+
+        X is projected on the first principal components previously extracted
+        from a training set, using minibatches of size batch_size if X is
+        sparse.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            New data, where n_samples is the number of samples
+            and n_features is the number of features.
+
+        Returns
+        -------
+        X_new : array-like, shape (n_samples, n_components)
+
+        Examples
+        --------
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import IncrementalPCA
+        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2],
+        ...               [1, 1], [2, 1], [3, 2]])
+        >>> ipca = IncrementalPCA(n_components=2, batch_size=3)
+        >>> ipca.fit(X)
+        IncrementalPCA(batch_size=3, n_components=2)
+        >>> ipca.transform(X) # doctest: +SKIP
+        """
+        if sparse.issparse(X):
+            n_samples = X.shape[0]
+            output = []
+            for batch in gen_batches(n_samples, self.batch_size_,
+                                     min_batch_size=self.n_components or 0):
+                output.append(super().transform(X[batch].toarray()))
+            return np.vstack(output)
+        else:
+            return super().transform(X)
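Putting the new code paths together, here is a small sketch (the data and sizes are made up) of how fit, partial_fit, and transform now behave on sparse input:

import numpy as np
from scipy import sparse
from sklearn.decomposition import IncrementalPCA

X = sparse.csr_matrix(np.random.RandomState(0).rand(60, 8))
ipca = IncrementalPCA(n_components=3, batch_size=20)

try:
    ipca.partial_fit(X)         # rejected: partial_fit stays dense-only
except TypeError as err:
    print(err)

ipca.fit(X)                     # fine: fit densifies one batch at a time
print(ipca.transform(X).shape)  # (60, 3), also computed batch by batch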

sklearn/decomposition/tests/test_incremental_pca.py

Lines changed: 42 additions & 5 deletions
@@ -1,5 +1,6 @@
 """Tests for Incremental PCA."""
 import numpy as np
+import pytest

 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_almost_equal
@@ -10,6 +11,8 @@
 from sklearn import datasets
 from sklearn.decomposition import PCA, IncrementalPCA

+from scipy import sparse
+
 iris = datasets.load_iris()


@@ -23,17 +26,51 @@ def test_incremental_pca():

     X_transformed = ipca.fit_transform(X)

-    np.testing.assert_equal(X_transformed.shape, (X.shape[0], 2))
-    assert_almost_equal(ipca.explained_variance_ratio_.sum(),
-                        pca.explained_variance_ratio_.sum(), 1)
+    assert X_transformed.shape == (X.shape[0], 2)
+    np.testing.assert_allclose(ipca.explained_variance_ratio_.sum(),
+                               pca.explained_variance_ratio_.sum(), rtol=1e-3)

     for n_components in [1, 2, X.shape[1]]:
         ipca = IncrementalPCA(n_components, batch_size=batch_size)
         ipca.fit(X)
         cov = ipca.get_covariance()
         precision = ipca.get_precision()
-        assert_array_almost_equal(np.dot(cov, precision),
-                                  np.eye(X.shape[1]))
+        np.testing.assert_allclose(np.dot(cov, precision),
+                                   np.eye(X.shape[1]), atol=1e-13)
+
+
+@pytest.mark.parametrize(
+    "matrix_class",
+    [sparse.csc_matrix, sparse.csr_matrix, sparse.lil_matrix])
+def test_incremental_pca_sparse(matrix_class):
+    # Incremental PCA on sparse arrays.
+    X = iris.data
+    pca = PCA(n_components=2)
+    pca.fit_transform(X)
+    X_sparse = matrix_class(X)
+    batch_size = X_sparse.shape[0] // 3
+    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
+
+    X_transformed = ipca.fit_transform(X_sparse)
+
+    assert X_transformed.shape == (X_sparse.shape[0], 2)
+    np.testing.assert_allclose(ipca.explained_variance_ratio_.sum(),
+                               pca.explained_variance_ratio_.sum(), rtol=1e-3)
+
+    for n_components in [1, 2, X.shape[1]]:
+        ipca = IncrementalPCA(n_components, batch_size=batch_size)
+        ipca.fit(X_sparse)
+        cov = ipca.get_covariance()
+        precision = ipca.get_precision()
+        np.testing.assert_allclose(np.dot(cov, precision),
+                                   np.eye(X_sparse.shape[1]), atol=1e-13)
+
+    with pytest.raises(
+            TypeError,
+            match="IncrementalPCA.partial_fit does not support "
+                  "sparse input. Either convert data to dense "
+                  "or use IncrementalPCA.fit to do so in batches."):
+        ipca.partial_fit(X_sparse)


 def test_incremental_pca_check_projection():
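One detail worth noting about the new test: pytest.raises(match=...) runs a regular-expression search against the exception message, so the unescaped dots in the expected string above also match as wildcards. A tiny self-contained illustration of that semantics (unrelated to the scikit-learn code itself; the error message is invented):

import pytest

# re.search semantics: "." matches any character, ".*" any run of them.
with pytest.raises(ValueError, match="must be .* positive"):
    raise ValueError("batch_size must be strictly positive")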
