From 8f804a749e55b94312279605c41bee396a60486e Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 16:16:05 +0200 Subject: [PATCH 1/5] remove normalize_sample_weight --- sklearn/cluster/_kmeans.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 225a359126ad3..842c44721da1c 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -167,21 +167,6 @@ def _tolerance(X, tol): return np.mean(variances) * tol -def _check_normalize_sample_weight(sample_weight, X): - """Set sample_weight if None, and check for correct dtype""" - - sample_weight_was_none = sample_weight is None - - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - if not sample_weight_was_none: - # normalize the weights to sum up to n_samples - # an array of 1 (i.e. samples_weight is None) is already normalized - n_samples = len(sample_weight) - scale = n_samples / sample_weight.sum() - sample_weight = sample_weight * scale - return sample_weight - - @_deprecate_positional_args def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', precompute_distances='deprecated', n_init=10, max_iter=300, @@ -399,7 +384,7 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, Number of iterations run. """ random_state = check_random_state(random_state) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) # init centers = _init_centroids(X, n_clusters, init, random_state=random_state, @@ -546,7 +531,7 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, Number of iterations run. 
""" random_state = check_random_state(random_state) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) # init centers = _init_centroids(X, n_clusters, init, random_state=random_state, @@ -639,7 +624,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads = _openmp_effective_n_threads(n_threads) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) labels = np.full(n_samples, -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) center_shift = np.zeros_like(weight_in_clusters) @@ -1620,7 +1605,7 @@ def fit(self, X, y=None, sample_weight=None): raise ValueError("n_samples=%d should be >= n_clusters=%d" % (n_samples, self.n_clusters)) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) n_init = self.n_init if hasattr(self.init, '__array__'): @@ -1769,7 +1754,7 @@ def _labels_inertia_minibatch(self, X, sample_weight): """ if self.verbose: print('Computing label assignment and total inertia') - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) x_squared_norms = row_norms(X, squared=True) slices = gen_batches(X.shape[0], self.batch_size) results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], @@ -1807,7 +1792,7 @@ def partial_fit(self, X, y=None, sample_weight=None): if n_samples == 0: return self - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) x_squared_norms = row_norms(X, squared=True) self.random_state_ = getattr(self, "random_state_", From d619f679dc5f766d14e0bf1df63545fecac4532a Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 16:23:56 +0200 
Subject: [PATCH 2/5] update tests --- sklearn/cluster/tests/test_k_means.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 97d6322d798cc..a8e8d83c222e0 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -15,7 +15,6 @@ from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_raise_message from sklearn.utils.fixes import _astype_copy_false -from sklearn.utils.validation import _num_samples from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning @@ -50,18 +49,19 @@ X_csr = sp.csr_matrix(X) -@pytest.mark.parametrize("representation", ["dense", "sparse"]) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_kmeans_results(representation, algo, dtype): - # cheks that kmeans works as intended - array_constr = {'dense': np.array, 'sparse': sp.csr_matrix}[representation] +def test_kmeans_results(array_constr, algo, dtype): + # Checks that KMeans works as intended on toy dataset by comparing with + # expected results computed by hand. 
X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) - sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] + sample_weight = [3, 1, 1, 3] init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) expected_labels = [0, 0, 1, 1] - expected_inertia = 0.1875 + expected_inertia = 0.375 expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) expected_n_iter = 2 @@ -69,8 +69,8 @@ def test_kmeans_results(representation, algo, dtype): kmeans.fit(X, sample_weight=sample_weight) assert_array_equal(kmeans.labels_, expected_labels) - assert_almost_equal(kmeans.inertia_, expected_inertia) - assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) assert kmeans.n_iter_ == expected_n_iter @@ -993,15 +993,6 @@ def test_sample_weight_length(): km.fit(X, sample_weight=np.ones(2)) -def test_check_normalize_sample_weight(): - from sklearn.cluster._kmeans import _check_normalize_sample_weight - sample_weight = None - checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) - assert _num_samples(X) == _num_samples(checked_sample_weight) - assert_almost_equal(checked_sample_weight.sum(), _num_samples(X)) - assert X.dtype == checked_sample_weight.dtype - - def test_iter_attribute(): # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off # it's right value (#11340). From 792d2d05c62bc696dbd07ff9d661f92044188622 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 16:37:46 +0200 Subject: [PATCH 3/5] what's new --- doc/whats_new/v0.24.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 9e31762d62c29..d5eb537cf838e 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -53,6 +53,14 @@ Changelog sparse matrix or dataframe at the start. :pr:`17546` by :user:`Lucy Liu `. 
+:mod:`sklearn.cluster` +......................... + +- |Fix| Fixed a bug in :class:`cluster.KMeans` and + :class:`cluster.MiniBatchKMeans` where the reported inertia was incorrectly + weighted by the sample weights. :pr:`17848` by + :user:`Jérémie du Boisberranger <jeremiedbb>`. + :mod:`sklearn.datasets` ....................... From 3af2afcf95c55b25b2af49e00d0f3eb6390bb2ca Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 8 Jul 2020 10:36:54 +0200 Subject: [PATCH 4/5] changed models --- doc/whats_new/v0.24.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index d5eb537cf838e..977c9c7fe429b 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -22,8 +22,7 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- items -- items +- |Fix| :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans`. Details are listed in the changelog below. From ae2374c6867eeedc1218c59d87964cc06e391f49 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 8 Jul 2020 10:38:44 +0200 Subject: [PATCH 5/5] changed models --- doc/whats_new/v0.24.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 977c9c7fe429b..9e95e021a20a7 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -22,7 +22,8 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- |Fix| :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans`. +- |Fix| ``inertia_`` attribute of :class:`cluster.KMeans` and + :class:`cluster.MiniBatchKMeans`. Details are listed in the changelog below.