FIX Ensure sorted bin_edges from kmeans strategy of KBinsDiscretizer … · jnothman/scikit-learn@cdb7f34 · GitHub
[go: up one dir, main page]

Skip to content

Commit cdb7f34

Browse files
SandroCasagrande authored and jnothman committed
FIX Ensure sorted bin_edges from kmeans strategy of KBinsDiscretizer (scikit-learn#13135)
1 parent cad7428 commit cdb7f34

File tree

3 files changed

+17
-5
lines changed

3 files changed

+17
-5
lines changed

doc/whats_new/v0.20.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ Changelog
5555
:mod:`sklearn.preprocessing`
5656
............................
5757

58+
- |Fix| Fixed a bug in :class:`preprocessing.KBinsDiscretizer` where
59+
``strategy='kmeans'`` fails with an error during transformation due to unsorted
60+
bin edges. :issue:`13134` by :user:`Sandro Casagrande <SandroCasagrande>`.
61+
5862
- |Fix| Fixed a bug in :class:`preprocessing.OneHotEncoder` where the
5963
deprecation of ``categorical_features`` was handled incorrectly in
6064
combination with ``handle_unknown='ignore'``.

sklearn/preprocessing/_discretization.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,8 @@ def fit(self, X, y=None):
176176
# 1D k-means procedure
177177
km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
178178
centers = km.fit(column[:, None]).cluster_centers_[:, 0]
179+
# Must sort, centers may be unsorted even with sorted init
180+
centers.sort()
179181
bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
180182
bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]
181183

sklearn/preprocessing/tests/test_discretization.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -187,11 +187,12 @@ def test_invalid_strategy_option():
187187

188188

189189
@pytest.mark.parametrize(
190-
'strategy, expected_2bins, expected_3bins',
191-
[('uniform', [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2]),
192-
('kmeans', [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2]),
193-
('quantile', [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])])
194-
def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins):
190+
'strategy, expected_2bins, expected_3bins, expected_5bins',
191+
[('uniform', [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
192+
('kmeans', [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
193+
('quantile', [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4])])
194+
def test_nonuniform_strategies(
195+
strategy, expected_2bins, expected_3bins, expected_5bins):
195196
X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
196197

197198
# with 2 bins
@@ -204,6 +205,11 @@ def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins):
204205
Xt = est.fit_transform(X)
205206
assert_array_equal(expected_3bins, Xt.ravel())
206207

208+
# with 5 bins
209+
est = KBinsDiscretizer(n_bins=5, strategy=strategy, encode='ordinal')
210+
Xt = est.fit_transform(X)
211+
assert_array_equal(expected_5bins, Xt.ravel())
212+
207213

208214
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
209215
@pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])

0 commit comments

Comments
 (0)
0