diff --git a/asv_benchmarks/benchmarks/cluster.py b/asv_benchmarks/benchmarks/cluster.py
index 09aa2818ad486..5973947a7d295 100644
--- a/asv_benchmarks/benchmarks/cluster.py
+++ b/asv_benchmarks/benchmarks/cluster.py
@@ -11,7 +11,7 @@ class KMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
     """
 
     param_names = ["representation", "algorithm", "init"]
-    params = (["dense", "sparse"], ["full", "elkan"], ["random", "k-means++"])
+    params = (["dense", "sparse"], ["lloyd", "elkan"], ["random", "k-means++"])
 
     def setup_cache(self):
         super().setup_cache()
diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index cc607aedb3946..d3a4430773c3b 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -23,6 +23,17 @@ Put the changes in their relevant module.
 Changed models
 --------------
 
+The following estimators and functions, when fit with the same data and
+parameters, may produce different models from the previous version. This often
+occurs due to changes in the modelling logic (bug fixes or enhancements), or in
+random sampling procedures.
+
+- |Efficiency| :class:`cluster.KMeans` now defaults to ``algorithm="lloyd"``
+  instead of ``algorithm="auto"``, which was equivalent to
+  ``algorithm="elkan"``. Lloyd's algorithm and Elkan's algorithm converge to the
+  same solution, up to numerical rounding errors, but in general Lloyd's
+  algorithm uses much less memory, and it is often faster.
+
 Changelog
 ---------
 
@@ -60,6 +71,9 @@ Changelog
   add this information to the plot.
   :pr:`21038` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+:mod:`sklearn.cluster`
+......................
+
 - |Enhancement| :class:`cluster.SpectralClustering` and :func:`cluster.spectral`
   now include the new `'cluster_qr'` method from :func:`cluster.cluster_qr`
   that clusters samples in the embedding space as an alternative to the existing
@@ -67,6 +81,14 @@ Changelog
   See :func:`cluster.spectral_clustering` for more details.
   :pr:`21148` by :user:`Andrew Knyazev <lobpcg>`
 
+- |Efficiency| In :class:`cluster.KMeans`, the default ``algorithm`` is now
+  ``"lloyd"`` which is the full classical EM-style algorithm. Both ``"auto"``
+  and ``"full"`` are deprecated and will be removed in version 1.3. They are
+  now aliases for ``"lloyd"``. The previous default was ``"auto"``, which relied
+  on Elkan's algorithm. Lloyd's algorithm uses less memory than Elkan's, it
+  is faster on many datasets, and its results are identical, hence the change.
+  :pr:`21735` by :user:`Aurélien Geron <ageron>`.
+
 :mod:`sklearn.cross_decomposition`
 ..................................
diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
index b76c78411f9db..3112343e05f5f 100644
--- a/sklearn/cluster/_kmeans.py
+++ b/sklearn/cluster/_kmeans.py
@@ -266,7 +266,7 @@ def k_means(
     tol=1e-4,
     random_state=None,
     copy_x=True,
-    algorithm="auto",
+    algorithm="lloyd",
     return_n_iter=False,
 ):
     """Perform K-means clustering algorithm.
@@ -333,15 +333,22 @@ def k_means(
         `copy_x` is False. If the original data is sparse, but not in CSR
         format, a copy will be made even if `copy_x` is False.
 
-    algorithm : {"auto", "full", "elkan"}, default="auto"
-        K-means algorithm to use. The classical EM-style algorithm is `"full"`.
-        The `"elkan"` variation is more efficient on data with well-defined
-        clusters, by using the triangle inequality. However it's more memory
-        intensive due to the allocation of an extra array of shape
+    algorithm : {"lloyd", "elkan", "auto", "full"}, default="lloyd"
+        K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`.
+        The `"elkan"` variation can be more efficient on some datasets with
+        well-defined clusters, by using the triangle inequality. However it's
+        more memory intensive due to the allocation of an extra array of shape
         `(n_samples, n_clusters)`.
 
-        For now `"auto"` (kept for backward compatibility) chooses `"elkan"` but it
-        might change in the future for a better heuristic.
+        `"auto"` and `"full"` are deprecated and they will be removed in
+        Scikit-Learn 1.3. They are both aliases for `"lloyd"`.
+
+        .. versionchanged:: 0.18
+            Added Elkan algorithm
+
+        .. versionchanged:: 1.1
+            Renamed "full" to "lloyd", and deprecated "auto" and "full".
+            Changed "auto" to use "lloyd" instead of "elkan".
 
     return_n_iter : bool, default=False
         Whether or not to return the number of iterations.
@@ -821,19 +828,23 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
         copy_x is False. If the original data is sparse, but not in CSR
         format, a copy will be made even if copy_x is False.
 
-    algorithm : {"auto", "full", "elkan"}, default="auto"
-        K-means algorithm to use. The classical EM-style algorithm is "full".
-        The "elkan" variation is more efficient on data with well-defined
-        clusters, by using the triangle inequality. However it's more memory
-        intensive due to the allocation of an extra array of shape
-        (n_samples, n_clusters).
+    algorithm : {"lloyd", "elkan", "auto", "full"}, default="lloyd"
+        K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`.
+        The `"elkan"` variation can be more efficient on some datasets with
+        well-defined clusters, by using the triangle inequality. However it's
+        more memory intensive due to the allocation of an extra array of shape
+        `(n_samples, n_clusters)`.
 
-        For now "auto" (kept for backward compatibility) chooses "elkan" but it
-        might change in the future for a better heuristic.
+        `"auto"` and `"full"` are deprecated and they will be removed in
+        Scikit-Learn 1.3. They are both aliases for `"lloyd"`.
 
         .. versionchanged:: 0.18
             Added Elkan algorithm
 
+        .. versionchanged:: 1.1
+            Renamed "full" to "lloyd", and deprecated "auto" and "full".
+            Changed "auto" to use "lloyd" instead of "elkan".
+
     Attributes
     ----------
     cluster_centers_ : ndarray of shape (n_clusters, n_features)
@@ -919,7 +930,7 @@ def __init__(
         verbose=0,
         random_state=None,
         copy_x=True,
-        algorithm="auto",
+        algorithm="lloyd",
     ):
 
         self.n_clusters = n_clusters
@@ -952,22 +963,27 @@ def _check_params(self, X):
         self._tol = _tolerance(X, self.tol)
 
         # algorithm
-        if self.algorithm not in ("auto", "full", "elkan"):
+        if self.algorithm not in ("lloyd", "elkan", "auto", "full"):
             raise ValueError(
-                "Algorithm must be 'auto', 'full' or 'elkan', "
+                "Algorithm must be either 'lloyd' or 'elkan', "
                 f"got {self.algorithm} instead."
             )
 
         self._algorithm = self.algorithm
-        if self._algorithm == "auto":
-            self._algorithm = "full" if self.n_clusters == 1 else "elkan"
+        if self._algorithm in ("auto", "full"):
+            warnings.warn(
+                f"algorithm='{self._algorithm}' is deprecated, it will be "
+                "removed in 1.3. Using 'lloyd' instead.",
+                FutureWarning,
+            )
+            self._algorithm = "lloyd"
         if self._algorithm == "elkan" and self.n_clusters == 1:
             warnings.warn(
                 "algorithm='elkan' doesn't make sense for a single "
-                "cluster. Using 'full' instead.",
+                "cluster. Using 'lloyd' instead.",
                 RuntimeWarning,
             )
-            self._algorithm = "full"
+            self._algorithm = "lloyd"
 
         # init
         if not (
@@ -1166,11 +1182,11 @@ def fit(self, X, y=None, sample_weight=None):
         # precompute squared norms of data points
        x_squared_norms = row_norms(X, squared=True)
 
-        if self._algorithm == "full":
+        if self._algorithm == "elkan":
+            kmeans_single = _kmeans_single_elkan
+        else:
             kmeans_single = _kmeans_single_lloyd
             self._check_mkl_vcomp(X, X.shape[0])
-        else:
-            kmeans_single = _kmeans_single_elkan
 
         best_inertia, best_labels = None, None
 
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 94050edb54431..6e395778418c8 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -52,7 +52,7 @@
 @pytest.mark.parametrize(
     "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
 )
-@pytest.mark.parametrize("algo", ["full", "elkan"])
+@pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_kmeans_results(array_constr, algo, dtype):
     # Checks that KMeans works as intended on toy dataset by comparing with
@@ -78,7 +78,7 @@ def test_kmeans_results(array_constr, algo, dtype):
 @pytest.mark.parametrize(
     "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
 )
-@pytest.mark.parametrize("algo", ["full", "elkan"])
+@pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 def test_kmeans_relocated_clusters(array_constr, algo):
     # check that empty clusters are relocated as expected
     X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]])
@@ -159,20 +159,20 @@ def test_kmeans_elkan_results(distribution, array_constr, tol):
     X[X < 0] = 0
     X = array_constr(X)
 
-    km_full = KMeans(algorithm="full", n_clusters=5, random_state=0, n_init=1, tol=tol)
+    km_lloyd = KMeans(n_clusters=5, random_state=0, n_init=1, tol=tol)
     km_elkan = KMeans(
         algorithm="elkan", n_clusters=5, random_state=0, n_init=1, tol=tol
     )
 
-    km_full.fit(X)
+    km_lloyd.fit(X)
     km_elkan.fit(X)
-    assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_)
-    assert_array_equal(km_elkan.labels_, km_full.labels_)
-    assert km_elkan.n_iter_ == km_full.n_iter_
-    assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6)
+    assert_allclose(km_elkan.cluster_centers_, km_lloyd.cluster_centers_)
+    assert_array_equal(km_elkan.labels_, km_lloyd.labels_)
+    assert km_elkan.n_iter_ == km_lloyd.n_iter_
+    assert km_elkan.inertia_ == pytest.approx(km_lloyd.inertia_, rel=1e-6)
 
 
-@pytest.mark.parametrize("algorithm", ["full", "elkan"])
+@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"])
 def test_kmeans_convergence(algorithm):
     # Check that KMeans stops when convergence is reached when tol=0. (#16075)
     rnd = np.random.RandomState(0)
@@ -191,6 +191,21 @@ def test_kmeans_convergence(algorithm):
     assert km.n_iter_ < max_iter
 
 
+@pytest.mark.parametrize("algorithm", ["auto", "full"])
+def test_algorithm_auto_full_deprecation_warning(algorithm):
+    X = np.random.rand(100, 2)
+    kmeans = KMeans(algorithm=algorithm)
+    with pytest.warns(
+        FutureWarning,
+        match=(
+            f"algorithm='{algorithm}' is deprecated, it will "
+            "be removed in 1.3. Using 'lloyd' instead."
+        ),
+    ):
+        kmeans.fit(X)
+        assert kmeans._algorithm == "lloyd"
+
+
 def test_minibatch_update_consistency():
     # Check that dense and sparse minibatch update give the same results
     rng = np.random.RandomState(42)
@@ -326,7 +341,7 @@ def test_fortran_aligned_data(Estimator):
     assert_array_equal(km_c.labels_, km_f.labels_)
 
 
-@pytest.mark.parametrize("algo", ["full", "elkan"])
+@pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 @pytest.mark.parametrize("constructor", [np.asarray, sp.csr_matrix])
 @pytest.mark.parametrize(
@@ -367,7 +382,7 @@ def test_minibatch_kmeans_verbose():
         sys.stdout = old_stdout
 
 
-@pytest.mark.parametrize("algorithm", ["full", "elkan"])
+@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"])
 @pytest.mark.parametrize("tol", [1e-2, 0])
 def test_kmeans_verbose(algorithm, tol, capsys):
     # Check verbose mode of KMeans for better coverage.
@@ -611,7 +626,7 @@ def test_score_max_iter(Estimator):
 @pytest.mark.parametrize("init", ["random", "k-means++"])
 @pytest.mark.parametrize(
     "Estimator, algorithm",
-    [(KMeans, "full"), (KMeans, "elkan"), (MiniBatchKMeans, None)],
+    [(KMeans, "lloyd"), (KMeans, "elkan"), (MiniBatchKMeans, None)],
 )
 def test_predict(Estimator, algorithm, init, dtype, array_constr):
     # Check the predict method and the equivalence between fit.predict and
@@ -954,7 +969,7 @@ def test_warning_elkan_1_cluster():
 @pytest.mark.parametrize(
     "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
 )
-@pytest.mark.parametrize("algo", ["full", "elkan"])
+@pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 def test_k_means_1_iteration(array_constr, algo):
     # check the results after a single iteration (E-step M-step E-step) by
     # comparing against a pure python implementation.
@@ -1088,7 +1103,7 @@ def test_wrong_params(Estimator, param, match):
 
 @pytest.mark.parametrize(
     "param, match",
-    [({"algorithm": "wrong"}, r"Algorithm must be 'auto', 'full' or 'elkan'")],
+    [({"algorithm": "wrong"}, r"Algorithm must be either 'lloyd' or 'elkan'")],
 )
 def test_kmeans_wrong_params(param, match):
     # Check that error are raised with clear error message when wrong values
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
index 9fd9ff9409092..7ced77f99b5af 100644
--- a/sklearn/preprocessing/_discretization.py
+++ b/sklearn/preprocessing/_discretization.py
@@ -284,9 +284,7 @@ def fit(self, X, y=None):
                 init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5
 
                 # 1D k-means procedure
-                km = KMeans(
-                    n_clusters=n_bins[jj], init=init, n_init=1, algorithm="full"
-                )
+                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                 centers = km.fit(column[:, None]).cluster_centers_[:, 0]
                 # Must sort, centers may be unsorted even with sorted init
                 centers.sort()