8000 Fix IncrementalPCA when final batch is smaller than dimensions requir… · scikit-learn/scikit-learn@1c88b3c · GitHub
[go: up one dir, main page]

Skip to content

Commit 1c88b3c

Browse files
minggli
authored and committed
Fix IncrementalPCA when final batch is smaller than dimensions required for SVD (#12379)
1 parent c6455fa commit 1c88b3c

File tree

4 files changed

+47
-2
lines changed

4 files changed

+47
-2
lines changed

doc/whats_new/v0.20.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,16 @@ Version 0.20.1
1212
This is a bug-fix release with some minor documentation improvements and
1313
enhancements to features released in 0.20.0.
1414

15+
Changed models
16+
--------------
17+
18+
The following estimators and functions, when fit with the same data and
19+
parameters, may produce different models from the previous version. This often
20+
occurs due to changes in the modelling logic (bug fixes or enhancements), or in
21+
random sampling procedures.
22+
23+
- :class:`decomposition.IncrementalPCA` (bug fix)
24+
1525
Changelog
1626
---------
1727

@@ -55,6 +65,14 @@ Changelog
5565
location in :func:`datasets.fetch_olivetti_faces`. :issue:`12441` by
5666
:user:`Jérémie du Boisberranger <jeremiedbb>`
5767

68+
:mod:`sklearn.decomposition`
69+
............................
70+
71+
- |Fix| Fixed a regression in :class:`decomposition.IncrementalPCA` where
72+
0.20.0 raised an error if the number of samples in the final batch for
73+
fitting IncrementalPCA was smaller than n_components.
74+
:issue:`12234` by :user:`Ming Li <minggli>`.
75+
5876
:mod:`sklearn.ensemble`
5977
.......................
6078

sklearn/decomposition/incremental_pca.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,8 @@ def fit(self, X, y=None):
196196
else:
197197
self.batch_size_ = self.batch_size
198198

199-
for batch in gen_batches(n_samples, self.batch_size_):
199+
for batch in gen_batches(n_samples, self.batch_size_,
200+
min_batch_size=self.n_components or 0):
200201
self.partial_fit(X[batch], check_input=False)
201202

202203
return self

sklearn/decomposition/tests/test_incremental_pca.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from sklearn.utils.testing import assert_array_almost_equal
66
from sklearn.utils.testing import assert_raises
77
from sklearn.utils.testing import assert_raises_regex
8+
from sklearn.utils.testing import assert_allclose_dense_sparse
89

910
from sklearn import datasets
1011
from sklearn.decomposition import PCA, IncrementalPCA
@@ -175,6 +176,23 @@ def test_incremental_pca_batch_values():
175176
assert_almost_equal(i, j, decimal=1)
176177

177178

179+
def test_incremental_pca_batch_rank():
180+
# Test sample size in each batch is always larger or equal to n_components
181+
rng = np.random.RandomState(1999)
182+
n_samples = 100
183+
n_features = 20
184+
X = rng.randn(n_samples, n_features)
185+
all_components = []
186+
batch_sizes = np.arange(20, 90, 3)
187+
for batch_size in batch_sizes:
188+
ipca = IncrementalPCA(n_components=20, batch_size=batch_size).fit(X)
189+
all_components.append(ipca.components_)
190+
191+
for components_i, components_j in zip(all_components[:-1],
192+
all_components[1:]):
193+
assert_allclose_dense_sparse(components_i, components_j)
194+
195+
178196
def test_incremental_pca_partial_fit():
179197
# Test that fit and partial_fit get equivalent results.
180198
rng = np.random.RandomState(1999)

sklearn/utils/__init__.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ def safe_sqr(X, copy=True):
401401
return X
402402

403403

404-
def gen_batches(n, batch_size):
404+
def gen_batches(n, batch_size, min_batch_size=0):
405405
"""Generator to create slices containing batch_size elements, from 0 to n.
406406
407407
The last slice may contain less than batch_size elements, when batch_size
@@ -412,6 +412,8 @@ def gen_batches(n, batch_size):
412412
n : int
413413
batch_size : int
414414
Number of elements in each batch
415+
min_batch_size : int, default=0
416+
Minimum batch size to produce.
415417
416418
Yields
417419
------
@@ -426,10 +428,16 @@ def gen_batches(n, batch_size):
426428
[slice(0, 3, None), slice(3, 6, None)]
427429
>>> list(gen_batches(2, 3))
428430
[slice(0, 2, None)]
431+
>>> list(gen_batches(7, 3, min_batch_size=0))
432+
[slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
433+
>>> list(gen_batches(7, 3, min_batch_size=2))
434+
[slice(0, 3, None), slice(3, 7, None)]
429435
"""
430436
start = 0
431437
for _ in range(int(n // batch_size)):
432438
end = start + batch_size
439+
if end + min_batch_size > n:
440+
continue
433441
yield slice(start, end)
434442
start = end
435443
if start < n:

0 commit comments

Comments (0)