MNT Don't normalize sample weights in KMeans by jeremiedbb · Pull Request #17848 · scikit-learn/scikit-learn
Merged · 5 commits · Jul 8, 2020
12 changes: 10 additions & 2 deletions doc/whats_new/v0.24.rst
@@ -22,8 +22,8 @@ parameters, may produce different models from the previous version. This often
 occurs due to changes in the modelling logic (bug fixes or enhancements), or in
 random sampling procedures.
 
-- items
-- items
+- |Fix| ``inertia_`` attribute of :class:`cluster.KMeans` and
+  :class:`cluster.MiniBatchKMeans`.
 
 Details are listed in the changelog below.
 
@@ -53,6 +53,14 @@ Changelog
   sparse matrix or dataframe at the start. :pr:`17546` by
   :user:`Lucy Liu <lucyleeow>`.
 
+:mod:`sklearn.cluster`
+.........................
+
+- |Fix| Fixed a bug in :class:`cluster.KMeans` and
+  :class:`cluster.MiniBatchKMeans` where the reported inertia was incorrectly
+  weighted by the sample weights. :pr:`17848` by
+  :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 :mod:`sklearn.datasets`
 .......................
 

Review comment on this entry (Member): Since the inertia will change, we should add this change in the Changed Models section.
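A note on the user-facing effect (illustrative, not part of the diff): after this change, integer sample weights give the same reported inertia as physically repeating the samples, which was not true while the weights were silently rescaled. A minimal sketch using the toy data from the updated test:

```python
import numpy as np
from sklearn.cluster import KMeans

X = np.array([[0, 0], [0.5, 0], [0.5, 1], [1, 1]])
sample_weight = [3, 1, 1, 3]
init = np.array([[0, 0], [1, 1]])

# Fit once with weights, once with the samples physically repeated.
km_w = KMeans(n_clusters=2, n_init=1, init=init).fit(X, sample_weight=sample_weight)
km_r = KMeans(n_clusters=2, n_init=1, init=init).fit(np.repeat(X, sample_weight, axis=0))

# Both now report 0.375; before this fix the weighted fit reported 0.1875
# because the weights were rescaled to sum to n_samples.
assert np.isclose(km_w.inertia_, km_r.inertia_)
```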
27 changes: 6 additions & 21 deletions sklearn/cluster/_kmeans.py
@@ -167,21 +167,6 @@ def _tolerance(X, tol):
     return np.mean(variances) * tol
 
 
-def _check_normalize_sample_weight(sample_weight, X):
-    """Set sample_weight if None, and check for correct dtype"""
-
-    sample_weight_was_none = sample_weight is None
-
-    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
-    if not sample_weight_was_none:
-        # normalize the weights to sum up to n_samples
-        # an array of 1 (i.e. samples_weight is None) is already normalized
-        n_samples = len(sample_weight)
-        scale = n_samples / sample_weight.sum()
-        sample_weight = sample_weight * scale
-    return sample_weight
-
-
 @_deprecate_positional_args
 def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
             precompute_distances='deprecated', n_init=10, max_iter=300,
@@ -399,7 +384,7 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300,
         Number of iterations run.
     """
     random_state = check_random_state(random_state)
-    sample_weight = _check_normalize_sample_weight(sample_weight, X)
+    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
     # init
     centers = _init_centroids(X, n_clusters, init, random_state=random_state,
@@ -546,7 +531,7 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300,
         Number of iterations run.
     """
     random_state = check_random_state(random_state)
-    sample_weight = _check_normalize_sample_weight(sample_weight, X)
+    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
     # init
     centers = _init_centroids(X, n_clusters, init, random_state=random_state,
@@ -639,7 +624,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers,
 
     n_threads = _openmp_effective_n_threads(n_threads)
 
-    sample_weight = _check_normalize_sample_weight(sample_weight, X)
+    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
     labels = np.full(n_samples, -1, dtype=np.int32)
     weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype)
     center_shift = np.zeros_like(weight_in_clusters)
@@ -1620,7 +1605,7 @@ def fit(self, X, y=None, sample_weight=None):
             raise ValueError("n_samples=%d should be >= n_clusters=%d"
                              % (n_samples, self.n_clusters))
 
-        sample_weight = _check_normalize_sample_weight(sample_weight, X)
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
         n_init = self.n_init
         if hasattr(self.init, '__array__'):
@@ -1769,7 +1754,7 @@ def _labels_inertia_minibatch(self, X, sample_weight):
         """
         if self.verbose:
             print('Computing label assignment and total inertia')
-        sample_weight = _check_normalize_sample_weight(sample_weight, X)
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
         x_squared_norms = row_norms(X, squared=True)
         slices = gen_batches(X.shape[0], self.batch_size)
         results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s],
@@ -1807,7 +1792,7 @@ def partial_fit(self, X, y=None, sample_weight=None):
         if n_samples == 0:
             return self
 
-        sample_weight = _check_normalize_sample_weight(sample_weight, X)
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
         x_squared_norms = row_norms(X, squared=True)
         self.random_state_ = getattr(self, "random_state_",
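For reference, the removed helper rescaled user-supplied weights so they summed to `n_samples`. A small NumPy sketch of the old behavior (illustrative only), using the weights that appear in the updated test below:

```python
import numpy as np

w = np.array([3., 1., 1., 3.])
scale = len(w) / w.sum()  # 4 / 8 = 0.5, as in the removed _check_normalize_sample_weight
print(w * scale)          # [1.5 0.5 0.5 1.5] -- the weights KMeans previously used internally
```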
27 changes: 9 additions & 18 deletions sklearn/cluster/tests/test_k_means.py
@@ -15,7 +15,6 @@
 from sklearn.utils._testing import assert_warns_message
 from sklearn.utils._testing import assert_raise_message
 from sklearn.utils.fixes import _astype_copy_false
-from sklearn.utils.validation import _num_samples
 from sklearn.base import clone
 from sklearn.exceptions import ConvergenceWarning
 
@@ -50,27 +49,28 @@
 X_csr = sp.csr_matrix(X)
 
 
-@pytest.mark.parametrize("representation", ["dense", "sparse"])
+@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix],
+                         ids=["dense", "sparse"])
 @pytest.mark.parametrize("algo", ["full", "elkan"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
-def test_kmeans_results(representation, algo, dtype):
-    # cheks that kmeans works as intended
-    array_constr = {'dense': np.array, 'sparse': sp.csr_matrix}[representation]
+def test_kmeans_results(array_constr, algo, dtype):
+    # Checks that KMeans works as intended on toy dataset by comparing with
+    # expected results computed by hand.
     X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
-    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
+    sample_weight = [3, 1, 1, 3]
     init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)
 
     expected_labels = [0, 0, 1, 1]
-    expected_inertia = 0.1875
+    expected_inertia = 0.375
     expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype)
     expected_n_iter = 2
 
     kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo)
     kmeans.fit(X, sample_weight=sample_weight)
 
     assert_array_equal(kmeans.labels_, expected_labels)
-    assert_almost_equal(kmeans.inertia_, expected_inertia)
-    assert_array_almost_equal(kmeans.cluster_centers_, expected_centers)
+    assert_allclose(kmeans.inertia_, expected_inertia)
+    assert_allclose(kmeans.cluster_centers_, expected_centers)
     assert kmeans.n_iter_ == expected_n_iter
 
 
@@ -993,15 +993,6 @@ def test_sample_weight_length():
         km.fit(X, sample_weight=np.ones(2))
 
 
-def test_check_normalize_sample_weight():
-    from sklearn.cluster._kmeans import _check_normalize_sample_weight
-    sample_weight = None
-    checked_sample_weight = _check_normalize_sample_weight(sample_weight, X)
-    assert _num_samples(X) == _num_samples(checked_sample_weight)
-    assert_almost_equal(checked_sample_weight.sum(), _num_samples(X))
-    assert X.dtype == checked_sample_weight.dtype
-
-
 def test_iter_attribute():
     # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off
     # it's right value (#11340).
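Where the updated expectations in `test_kmeans_results` come from: the inertia is the weighted sum of squared distances to the assigned centers, and each center is the weighted mean of its cluster members. A hand-check sketch (illustrative, not part of the diff):

```python
import numpy as np

X = np.array([[0, 0], [0.5, 0], [0.5, 1], [1, 1]])
w = np.array([3, 1, 1, 3])
labels = np.array([0, 0, 1, 1])               # expected_labels from the test
centers = np.array([[0.125, 0], [0.875, 1]])  # expected_centers from the test

# Weighted centroids: the weighted mean of the points in each cluster.
for k in range(2):
    mask = labels == k
    print(np.average(X[mask], axis=0, weights=w[mask]))  # [0.125 0.] then [0.875 1.]

# Weighted inertia: sum_i w_i * ||x_i - centers[labels_i]||^2.
inertia = (w * ((X - centers[labels]) ** 2).sum(axis=1)).sum()
print(inertia)  # 0.375 -- the new expected_inertia (0.1875 was the old, rescaled value)
```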