scikit-learn · glemaitre · Nov 25, 2021 · Nov 21, 2021 · Nov 22, 2021 · Nov 22, 2021
diff --git a/asv_benchmarks/benchmarks/cluster.py b/asv_benchmarks/benchmarks/cluster.py
@@ -11,7 +11,7 @@ class KMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
     """
 
     param_names = ["representation", "algorithm", "init"]
-    params = (["dense", "sparse"], ["full", "elkan"], ["random", "k-means++"])
+    params = (["dense", "sparse"], ["lloyd", "elkan"], ["random", "k-means++"])
 
     def setup_cache(self):
         super().setup_cache()

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
@@ -23,6 +23,17 @@ Put the changes in their relevant module.
 Changed models
 --------------
 
+The following estimators and functions, when fit with the same data and
+parameters, may produce different models from the previous version. This often
+occurs due to changes in the modelling logic (bug fixes or enhancements), or in
+random sampling procedures.
+
+- |Efficiency| :class:`cluster.KMeans` now defaults to ``algorithm="lloyd"``
+  instead of ``algorithm="auto"``, which was equivalent to
+  ``algorithm="elkan"``. Lloyd's algorithm and Elkan's algorithm converge to the
+  same solution, up to numerical rounding errors, but in general Lloyd's
+  algorithm uses much less memory, and it is often faster.
+
 
 Changelog
 ---------
@@ -60,13 +71,24 @@ Changelog
   add this information to the plot.
   :pr:`21038` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+:mod:`sklearn.cluster`
+......................
+
 - |Enhancement| :class:`cluster.SpectralClustering` and :func:`cluster.spectral`
   now include the new `'cluster_qr'` method from :func:`cluster.cluster_qr`
   that clusters samples in the embedding space as an alternative to the existing
   `'kmeans'` and `'discrete'` methods.
   See :func:`cluster.spectral_clustering` for more details.
   :pr:`21148` by :user:`Andrew Knyazev <lobpcg>`
 
+- |Efficiency| In :class:`cluster.KMeans`, the default ``algorithm`` is now
+  ``"lloyd"`` which is the full classical EM-style algorithm. Both ``"auto"``
+  and ``"full"`` are deprecated and will be removed in version 1.3. They are
+  now aliases for ``"lloyd"``. The previous default was ``"auto"``, which relied
+  on Elkan's algorithm. Lloyd's algorithm uses less memory than Elkan's, it
+  is faster on many datasets, and its results are identical, hence the change.
+  :pr:`21735` by :user:`Aurélien Geron <ageron>`.
+
 :mod:`sklearn.cross_decomposition`
 ..................................
 

diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
@@ -266,7 +266,7 @@ def k_means(
     tol=1e-4,
     random_state=None,
     copy_x=True,
-    algorithm="auto",
+    algorithm="lloyd",
     return_n_iter=False,
 ):
     """Perform K-means clustering algorithm.
@@ -333,15 +333,22 @@ def k_means(
         `copy_x` is False. If the original data is sparse, but not in CSR format,
         a copy will be made even if `copy_x` is False.
 
-    algorithm : {"auto", "full", "elkan"}, default="auto"
-        K-means algorithm to use. The classical EM-style algorithm is `"full"`.
-        The `"elkan"` variation is more efficient on data with well-defined
-        clusters, by using the triangle inequality. However it's more memory
-        intensive due to the allocation of an extra array of shape
+    algorithm : {"lloyd", "elkan", "auto", "full"}, default="lloyd"
+        K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`.
+        The `"elkan"` variation can be more efficient on some datasets with
+        well-defined clusters, by using the triangle inequality. However it's
+        more memory intensive due to the allocation of an extra array of shape
         `(n_samples, n_clusters)`.
 
-        For now `"auto"` (kept for backward compatibility) chooses `"elkan"` but it
-        might change in the future for a better heuristic.
+        `"auto"` and `"full"` are deprecated and they will be removed in
+        Scikit-Learn 1.3. They are both aliases for `"lloyd"`.
+
+        .. versionchanged:: 0.18
+            Added Elkan algorithm
+
+        .. versionchanged:: 1.1
+            Renamed "full" to "lloyd", and deprecated "auto" and "full".
+            Changed "auto" to use "lloyd" instead of "elkan".
 
     return_n_iter : bool, default=False
         Whether or not to return the number of iterations.
@@ -821,19 +828,23 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
         copy_x is False. If the original data is sparse, but not in CSR format,
         a copy will be made even if copy_x is False.
 
-    algorithm : {"auto", "full", "elkan"}, default="auto"
-        K-means algorithm to use. The classical EM-style algorithm is "full".
-        The "elkan" variation is more efficient on data with well-defined
-        clusters, by using the triangle inequality. However it's more memory
-        intensive due to the allocation of an extra array of shape
-        (n_samples, n_clusters).
+    algorithm : {"lloyd", "elkan", "auto", "full"}, default="lloyd"
+        K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`.
+        The `"elkan"` variation can be more efficient on some datasets with
+        well-defined clusters, by using the triangle inequality. However it's
+        more memory intensive due to the allocation of an extra array of shape
+        `(n_samples, n_clusters)`.
 
-        For now "auto" (kept for backward compatibility) chooses "elkan" but it
-        might change in the future for a better heuristic.
+        `"auto"` and `"full"` are deprecated and they will be removed in
+        Scikit-Learn 1.3. They are both aliases for `"lloyd"`.
 
         .. versionchanged:: 0.18
             Added Elkan algorithm
 
+        .. versionchanged:: 1.1
+            Renamed "full" to "lloyd", and deprecated "auto" and "full".
+            Changed "auto" to use "lloyd" instead of "elkan".
+
     Attributes
     ----------
     cluster_centers_ : ndarray of shape (n_clusters, n_features)
@@ -919,7 +930,7 @@ def __init__(
         verbose=0,
         random_state=None,
         copy_x=True,
-        algorithm="auto",
+        algorithm="lloyd",
     ):
 
         self.n_clusters = n_clusters
@@ -952,22 +963,27 @@ def _check_params(self, X):
         self._tol = _tolerance(X, self.tol)
 
         # algorithm
-        if self.algorithm not in ("auto", "full", "elkan"):
+        if self.algorithm not in ("lloyd", "elkan", "auto", "full"):
             raise ValueError(
-                "Algorithm must be 'auto', 'full' or 'elkan', "
+                "Algorithm must be either 'lloyd' or 'elkan', "
                 f"got {self.algorithm} instead."
             )
 
         self._algorithm = self.algorithm
-        if self._algorithm == "auto":
-            self._algorithm = "full" if self.n_clusters == 1 else "elkan"
+        if self._algorithm in ("auto", "full"):
+            warnings.warn(
+                f"algorithm='{self._algorithm}' is deprecated, it will be "
+                "removed in 1.3. Using 'lloyd' instead.",
+                FutureWarning,
+            )
+            self._algorithm = "lloyd"
         if self._algorithm == "elkan" and self.n_clusters == 1:
             warnings.warn(
                 "algorithm='elkan' doesn't make sense for a single "
-                "cluster. Using 'full' instead.",
+                "cluster. Using 'lloyd' instead.",
                 RuntimeWarning,
             )
-            self._algorithm = "full"
+            self._algorithm = "lloyd"
 
         # init
         if not (
@@ -1166,11 +1182,11 @@ def fit(self, X, y=None, sample_weight=None):
         # precompute squared norms of data points
         x_squared_norms = row_norms(X, squared=True)
 
-        if self._algorithm == "full":
+        if self._algorithm == "elkan":
+            kmeans_single = _kmeans_single_elkan
+        else:
             kmeans_single = _kmeans_single_lloyd
             self._check_mkl_vcomp(X, X.shape[0])
-        else:
-            kmeans_single = _kmeans_single_elkan
 
         best_inertia, best_labels = None, None
 

diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
@@ -52,7 +52,7 @@
 @pytest.mark.parametrize(
     "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
 )
-@pytest.mark.parametrize("algo", ["full", "elkan"])
+@pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_kmeans_results(array_constr, algo, dtype):
     # Checks that KMeans works as intended on toy dataset by comparing with
@@ -78,7 +78,7 @@ def test_kmeans_results(array_constr, algo, dtype):
 @pytest.mark.parametrize(
     "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
 )
-@pytest.mark.parametrize("algo", ["full", "elkan"])
+@pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 def test_kmeans_relocated_clusters(array_constr, algo):
     # check that empty clusters are relocated as expected
     X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]])
@@ -159,20 +159,20 @@ def test_kmeans_elkan_results(distribution, array_constr, tol):
     X[X < 0] = 0
     X = array_constr(X)
 
-    km_full = KMeans(algorithm="full", n_clusters=5, random_state=0, n_init=1, tol=tol)
+    km_lloyd = KMeans(n_clusters=5, random_state=0, n_init=1, tol=tol)
     km_elkan = KMeans(
         algorithm="elkan", n_clusters=5, random_state=0, n_init=1, tol=tol
     )
 
-    km_full.fit(X)
+    km_lloyd.fit(X)
     km_elkan.fit(X)
-    assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_)
-    assert_array_equal(km_elkan.labels_, km_full.labels_)
-    assert km_elkan.n_iter_ == km_full.n_iter_
-    assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6)
+    assert_allclose(km_elkan.cluster_centers_, km_lloyd.cluster_centers_)
+    assert_array_equal(km_elkan.labels_, km_lloyd.labels_)
+    assert km_elkan.n_iter_ == km_lloyd.n_iter_
+    assert km_elkan.inertia_ == pytest.approx(km_lloyd.inertia_, rel=1e-6)
 
 
-@pytest.mark.parametrize("algorithm", ["full", "elkan"])
+@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"])
 def test_kmeans_convergence(algorithm):
     # Check that KMeans stops when convergence is reached when tol=0. (#16075)
     rnd = np.random.RandomState(0)
@@ -191,6 +191,21 @@ def test_kmeans_convergence(algorithm):
     assert km.n_iter_ < max_iter
 
 
+@pytest.mark.parametrize("algorithm", ["auto", "full"])
+def test_algorithm_auto_full_deprecation_warning(algorithm):
+    X = np
F438
.random.rand(100, 2)
+    kmeans = KMeans(algorithm=algorithm)
+    with pytest.warns(
+        FutureWarning,
+        match=(
+            f"algorithm='{algorithm}' is deprecated, it will "
+            "be removed in 1.3. Using 'lloyd' instead."
+        ),
+    ):
+        kmeans.fit(X)
+        assert kmeans._algorithm == "lloyd"
+
+
 def test_minibatch_update_consistency():
     # Check that dense and sparse minibatch update give the same results
     rng = np.random.RandomState(42)
@@ -326,7 +341,7 @@ def test_fortran_aligned_data(Estimator):
     assert_array_equal(km_c.labels_, km_f.labels_)
 
 
-@pytest.mark.parametrize("algo", ["full", "elkan"])
+@pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 @pytest.mark.parametrize("constructor", [np.asarray, sp.csr_matrix])
 @pytest.mark.parametrize(
@@ -367,7 +382,7 @@ def test_minibatch_kmeans_verbose():
         sys.stdout = old_stdout
 
 
-@pytest.mark.parametrize("algorithm", ["full", "elkan"])
+@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"])
 @pytest.mark.parametrize("tol", [1e-2, 0])
 def test_kmeans_verbose(algorithm, tol, capsys):
     # Check verbose mode of KMeans for better coverage.
@@ -611,7 +626,7 @@ def test_score_max_iter(Estimator):
 @pytest.mark.parametrize("init", ["random", "k-means++"])
 @pytest.mark.parametrize(
     "Estimator, algorithm",
-    [(KMeans, "full"), (KMeans, "elkan"), (MiniBatchKMeans, None)],
+    [(KMeans, "lloyd"), (KMeans, "elkan"), (MiniBatchKMeans, None)],
 )
 def test_predict(Estimator, algorithm, init, dtype, array_constr):
     # Check the predict method and the equivalence between fit.predict and
@@ -954,7 +969,7 @@ def test_warning_elkan_1_cluster():
 @pytest.mark.parametrize(
     "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
 )
-@pytest.mark.parametrize("algo", ["full", "elkan"])
+@pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 def test_k_means_1_iteration(array_constr, algo):
     # check the results after a single iteration (E-step M-step E-step) by
     # comparing against a pure python implementation.
@@ -1088,7 +1103,7 @@ def test_wrong_params(Estimator, param, match):
 
 @pytest.mark.parametrize(
     "param, match",
-    [({"algorithm": "wrong"}, r"Algorithm must be 'auto', 'full' or 'elkan'")],
+    [({"algorithm": "wrong"}, r"Algorithm must be either 'lloyd' or 'elkan'")],
 )
 def test_kmeans_wrong_params(param, match):
     # Check that error are raised with clear error message when wrong values

diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
@@ -284,9 +284,7 @@ def fit(self, X, y=None):
                 init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5
 
                 # 1D k-means procedure
-                km = KMeans(
-                    n_clusters=n_bins[jj], init=init, n_init=1, algorithm="full"
-                )
+                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                 centers = km.fit(column[:, None]).cluster_centers_[:, 0]
                 # Must sort, centers may be unsorted even with sorted init
                 centers.sort()