FIX Ensure sorted bin_edges from kmeans strategy of KBinsDiscretizer … · xhluca/scikit-learn@7762f1f · GitHub
[go: up one dir, main page]

Skip to content

Commit 7762f1f

Browse files
SandroCasagrandeXing
authored and committed
FIX Ensure sorted bin_edges from kmeans strategy of KBinsDiscretizer (scikit-learn#13135)
1 parent d74922c commit 7762f1f

File tree

3 files changed

+17
-5
lines changed

3 files changed

+17
-5
lines changed

doc/whats_new/v0.20.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ Changelog
5555
:mod:`sklearn.preprocessing`
5656
............................
5757

58+
- |Fix| Fixed a bug in :class:`preprocessing.KBinsDiscretizer` where
59+
``strategy='kmeans'`` fails with an error during transformation due to unsorted
60+
bin edges. :issue:`13134` by :user:`Sandro Casagrande <SandroCasagrande>`.
61+
5862
- |Fix| Fixed a bug in :class:`preprocessing.OneHotEncoder` where the
5963
deprecation of ``categorical_features`` was handled incorrectly in
6064
combination with ``handle_unknown='ignore'``.

sklearn/preprocessing/_discretization.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ def fit(self, X, y=None):
172172
# 1D k-means procedure
173173
km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
174174
centers = km.fit(column[:, None]).cluster_centers_[:, 0]
175+
# Must sort, centers may be unsorted even with sorted init
176+
centers.sort()
175177
bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
176178
bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]
177179

sklearn/preprocessing/tests/test_discretization.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -185,11 +185,12 @@ def test_invalid_strategy_option():
185185

186186

187187
@pytest.mark.parametrize(
188-
'strategy, expected_2bins, expected_3bins',
189-
[('uniform', [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2]),
190-
('kmeans', [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2]),
191-
('quantile', [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])])
192-
def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins):
188+
'strategy, expected_2bins, expected_3bins, expected_5bins',
189+
[('uniform', [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
190+
('kmeans', [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
191+
('quantile', [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4])])
192+
def test_nonuniform_strategies(
193+
strategy, expected_2bins, expected_3bins, expected_5bins):
193194
X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
194195

195196
# with 2 bins
@@ -202,6 +203,11 @@ def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins):
202203
Xt = est.fit_transform(X)
203204
assert_array_equal(expected_3bins, Xt.ravel())
204205

206+
# with 5 bins
207+
est = KBinsDiscretizer(n_bins=5, strategy=strategy, encode='ordinal')
208+
Xt = est.fit_transform(X)
209+
assert_array_equal(expected_5bins, Xt.ravel())
210+
205211

206212
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
207213
@pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])

0 commit comments

Comments
 (0)
0