[MRG] Fix KMeans convergence when tol==0 by jeremiedbb · Pull Request #17959 · scikit-learn/scikit-learn · GitHub

[MRG] Fix KMeans convergence when tol==0 #17959


Merged: 10 commits merged on Jul 28, 2020

3 changes: 3 additions & 0 deletions doc/whats_new/v0.24.rst
@@ -91,6 +91,9 @@ Changelog
:pr:`17995` by :user:`Thomaz Santana <Wikilicious>` and
:user:`Amanda Dsouza <amy12xx>`.

- |Fix| Fixed a bug in :class:`cluster.KMeans` where rounding errors could
  prevent convergence from being declared when `tol=0`. :pr:`17959` by
  :user:`Jérémie du Boisberranger <jeremiedbb>`.
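
A quick way to observe the fixed behaviour (the dataset shape, cluster count
and seed below are illustrative, mirroring the regression test added in this
PR):

import numpy as np
from sklearn.cluster import KMeans

# With tol=0, fitting should now stop via strict convergence (the labels
# stop changing) instead of always running until max_iter.
X = np.random.RandomState(0).normal(size=(5000, 10))
km = KMeans(n_clusters=5, n_init=1, tol=0, max_iter=300,
            random_state=0).fit(X)
assert km.n_iter_ < 300  # converged before exhausting max_iter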

:mod:`sklearn.covariance`
.........................
51 changes: 36 additions & 15 deletions sklearn/cluster/_kmeans.py
@@ -226,8 +226,6 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
Relative tolerance with regards to Frobenius norm of the difference
in the cluster centers of two consecutive iterations to declare
convergence.
It's not advised to set `tol=0` since convergence might never be
declared due to rounding errors. Use a very small number instead.
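
For reference, the tolerance is applied to the total squared center shift
between two consecutive iterations; a rough sketch of the check is below. The
scaling of `tol` by the mean per-feature variance mirrors the private
`_tolerance` helper, so treat the exact formula and the name
`tol_based_converged` as assumptions rather than the actual API.

import numpy as np

def tol_based_converged(centers_old, centers_new, X, tol):
    # Squared Frobenius norm of the shift between consecutive centers.
    center_shift_tot = ((centers_new - centers_old) ** 2).sum()
    # tol is relative: it is scaled by the mean per-feature variance of X.
    scaled_tol = np.mean(np.var(X, axis=0)) * tol
    return center_shift_tot <= scaled_tol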

random_state : int, RandomState instance, default=None
Determines random number generation for centroid initialization. Use
@@ -358,6 +356,7 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300,
centers_new = np.zeros_like(centers)
weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype)
labels = np.full(n_samples, -1, dtype=np.int32)
labels_old = labels.copy()
center_half_distances = euclidean_distances(centers) / 2
distance_next_center = np.partition(np.asarray(center_half_distances),
kth=1, axis=0)[1]
@@ -377,6 +376,8 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300,
init_bounds(X, centers, center_half_distances,
labels, upper_bounds, lower_bounds)

strict_convergence = False

for i in range(max_iter):
elkan_iter(X, sample_weight, centers, centers_new,
weight_in_clusters, center_half_distances,
@@ -395,14 +396,24 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300,

centers, centers_new = centers_new, centers

center_shift_tot = (center_shift**2).sum()
if center_shift_tot <= tol:
if np.array_equal(labels, labels_old):
# First check the labels for strict convergence.
if verbose:
print(f"Converged at iteration {i}: center shift "
f"{center_shift_tot} within tolerance {tol}.")
print(f"Converged at iteration {i}: strict convergence.")
strict_convergence = True
break
else:
# No strict convergence, check for tol based convergence.
center_shift_tot = (center_shift**2).sum()
if center_shift_tot <= tol:
if verbose:
print(f"Converged at iteration {i}: center shift "
f"{center_shift_tot} within tolerance {tol}.")
break

if center_shift_tot > 0:
labels_old[:] = labels

if not strict_convergence:
# rerun E-step so that predicted labels match cluster centers
elkan_iter(X, sample_weight, centers, centers, weight_in_clusters,
center_half_distances, distance_next_center,
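
Stripped of the Elkan-specific bookkeeping, the new stopping logic can be
sketched as follows; `assign_labels` and `update_centers` are hypothetical
stand-ins for the E- and M-steps, not part of the scikit-learn API:

import numpy as np

def kmeans_loop(assign_labels, update_centers, centers, max_iter, tol):
    labels_old = None
    strict_convergence = False
    for _ in range(max_iter):
        labels = assign_labels(centers)        # E-step
        centers_new = update_centers(labels)   # M-step
        center_shift_tot = ((centers_new - centers) ** 2).sum()
        centers = centers_new
        if labels_old is not None and np.array_equal(labels, labels_old):
            # Labels did not change: strict convergence. This check is immune
            # to rounding errors, so it also works when tol == 0.
            strict_convergence = True
            break
        if center_shift_tot <= tol:
            # Otherwise fall back to the tol-based center-shift criterion.
            break
        labels_old = labels.copy()
    if not strict_convergence:
        # Rerun the E-step so the returned labels match the final centers.
        labels = assign_labels(centers)
    return labels, centers
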
@@ -473,6 +484,7 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300,
centers = centers_init
centers_new = np.zeros_like(centers)
labels = np.full(X.shape[0], -1, dtype=np.int32)
labels_old = labels.copy()
weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype)
center_shift = np.zeros(n_clusters, dtype=X.dtype)

@@ -483,6 +495,8 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300,
lloyd_iter = lloyd_iter_chunked_dense
_inertia = _inertia_dense

strict_convergence = False

# Threadpoolctl context to limit the number of threads in second level of
# nested parallelism (i.e. BLAS) to avoid oversubscription.
with threadpool_limits(limits=1, user_api="blas"):
@@ -496,15 +510,24 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300,

centers, centers_new = centers_new, centers

center_shift_tot = (center_shift**2).sum()
if center_shift_tot <= tol:
if np.array_equal(labels, labels_old):
# First check the labels for strict convergence.
if verbose:
print("Converged at iteration {0}: "
"center shift {1} within tolerance {2}"
.format(i, center_shift_tot, tol))
print(f"Converged at iteration {i}: strict convergence.")
strict_convergence = True
break
else:
# No strict convergence, check for tol based convergence.
center_shift_tot = (center_shift**2).sum()
if center_shift_tot <= tol:
if verbose:
print(f"Converged at iteration {i}: center shift "
f"{center_shift_tot} within tolerance {tol}.")
break

labels_old[:] = labels

if center_shift_tot > 0:
if not strict_convergence:
# rerun E-step so that predicted labels match cluster centers
lloyd_iter(X, sample_weight, x_squared_norms, centers, centers,
weight_in_clusters, labels, center_shift, n_threads,
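
As an aside, the `threadpool_limits` guard around the Lloyd iterations is
independent of the convergence fix; the general pattern (assuming the
`threadpoolctl` package is available) looks like this:

from threadpoolctl import threadpool_limits

# Inside a region that is already parallel (e.g. OpenMP over data chunks),
# cap BLAS at a single thread so the two levels of parallelism do not
# oversubscribe the available cores.
with threadpool_limits(limits=1, user_api="blas"):
    pass  # the chunked Lloyd/Elkan iterations would run here
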
@@ -617,8 +640,6 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
Relative tolerance with regards to Frobenius norm of the difference
in the cluster centers of two consecutive iterations to declare
convergence.
It's not advised to set `tol=0` since convergence might never be
declared due to rounding errors. Use a very small number instead.

precompute_distances : {'auto', True, False}, default='auto'
Precompute distances (faster but takes more memory).
42 changes: 29 additions & 13 deletions sklearn/cluster/tests/test_k_means.py
@@ -1,4 +1,5 @@
"""Testing for K-means"""
import re
import sys

import numpy as np
@@ -136,7 +137,7 @@ def test_relocate_empty_clusters(array_constr):
@pytest.mark.parametrize("distribution", ["normal", "blobs"])
@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix],
ids=["dense", "sparse"])
@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8])
@pytest.mark.parametrize("tol", [1e-2, 1e-8, 1e-100, 0])
def test_kmeans_elkan_results(distribution, array_constr, tol):
# Check that results are identical between lloyd and elkan algorithms
rnd = np.random.RandomState(0)
@@ -163,18 +164,14 @@ def test_kmeans_elkan_results(distribution, array_constr, tol):
@pytest.mark.parametrize("algorithm", ["full", "elkan"])
def test_kmeans_convergence(algorithm):
# Check that KMeans stops when convergence is reached when tol=0. (#16075)
# We can only ensure that if the number of threads is not too large,
# otherwise the rounding errors coming from the unpredictability of
# the order in which chunks are processed prevent the convergence
# criterion from ever being exactly 0.
rnd = np.random.RandomState(0)
X = rnd.normal(size=(5000, 10))
max_iter = 300

with threadpool_limits(limits=1, user_api="openmp"):
km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0,
n_init=1, tol=0, max_iter=300).fit(X)
km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0,
n_init=1, tol=0, max_iter=max_iter).fit(X)

assert km.n_iter_ < 300
assert km.n_iter_ < max_iter


def test_minibatch_update_consistency():
@@ -339,10 +336,9 @@ def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
assert v_measure_score(labels_1, labels_2) == 1


@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
def test_verbose(Estimator):
# Check verbose mode of KMeans and MiniBatchKMeans for better coverage.
km = Estimator(n_clusters=n_clusters, random_state=42, verbose=1)
def test_minibatch_kmeans_verbose():
# Check verbose mode of MiniBatchKMeans for better coverage.
km = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, verbose=1)
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
@@ -351,6 +347,26 @@ def test_verbose(Estimator):
sys.stdout = old_stdout


@pytest.mark.parametrize("algorithm", ["full", "elkan"])
@pytest.mark.parametrize("tol", [1e-2, 0])
def test_kmeans_verbose(algorithm, tol, capsys):
# Check verbose mode of KMeans for better coverage.
X = np.random.RandomState(0).normal(size=(5000, 10))

KMeans(algorithm=algorithm, n_clusters=n_clusters, random_state=42,
init="random", n_init=1, tol=tol, verbose=1).fit(X)

captured = capsys.readouterr()

assert re.search(r"Initialization complete", captured.out)
assert re.search(r"Iteration [0-9]+, inertia", captured.out)

if tol == 0:
assert re.search(r"strict convergence", captured.out)
else:
assert re.search(r"center shift .* within tolerance", captured.out)


def test_minibatch_kmeans_warning_init_size():
# Check that a warning is raised when init_size is smaller than n_clusters
with pytest.warns(RuntimeWarning,