diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 211f1e4049d65..2f53a9d7fee69 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -44,7 +44,13 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. +:mod:`sklearn.cluster` +...................... +- |Efficiency| The "k-means++" initialization of :class:`cluster.KMeans` and + :class:`cluster.MiniBatchKMeans` is now faster, especially in multicore + settings. :pr:`19002` by :user:`Jon Crall ` and + :user:`Jérémie du Boisberranger `. Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index f23df27dc8ad5..26effbcdaabac 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -20,6 +20,7 @@ from ..base import BaseEstimator, ClusterMixin, TransformerMixin from ..metrics.pairwise import euclidean_distances +from ..metrics.pairwise import _euclidean_distances from ..utils.extmath import row_norms, stable_cumsum from ..utils.sparsefuncs_fast import assign_rows_csr from ..utils.sparsefuncs import mean_variance_axis @@ -103,7 +104,7 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, indices[0] = center_id # Initialize list of closest distances and calculate current potential - closest_dist_sq = euclidean_distances( + closest_dist_sq = _euclidean_distances( centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True) current_pot = closest_dist_sq.sum() @@ -120,7 +121,7 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, out=candidate_ids) # Compute distances to center candidates - distance_to_candidates = euclidean_distances( + distance_to_candidates = _euclidean_distances( X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True) # update closest distances squared and potential for each candidate diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 5263b593e9594..a5a879af34678 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -230,7 +230,8 @@ def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \ default=None - Y_norm_squared : array-like of shape (n_samples_Y,), default=None + Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) \ + or (1, n_samples_Y), default=None Pre-computed dot-products of vectors in Y (e.g., ``(Y**2).sum(axis=1)``) May be ignored in some cases, see the note below. @@ -238,7 +239,8 @@ def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, squared : bool, default=False Return squared Euclidean distances. - X_norm_squared : array-like of shape (n_samples,), default=None + X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) \ + or (1, n_samples_X), default=None Pre-computed dot-products of vectors in X (e.g., ``(X**2).sum(axis=1)``) May be ignored in some cases, see the note below. @@ -271,38 +273,65 @@ def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, """ X, Y = check_pairwise_arrays(X, Y) - # If norms are passed as float32, they are unused. If arrays are passed as - # float32, norms needs to be recomputed on upcast chunks. - # TODO: use a float64 accumulator in row_norms to avoid the latter. if X_norm_squared is not None: - XX = check_array(X_norm_squared) - if XX.shape == (1, X.shape[0]): - XX = XX.T - elif XX.shape != (X.shape[0], 1): + X_norm_squared = check_array(X_norm_squared, ensure_2d=False) + original_shape = X_norm_squared.shape + if X_norm_squared.shape == (X.shape[0],): + X_norm_squared = X_norm_squared.reshape(-1, 1) + if X_norm_squared.shape == (1, X.shape[0]): + X_norm_squared = X_norm_squared.T + if X_norm_squared.shape != (X.shape[0], 1): raise ValueError( - "Incompatible dimensions for X and X_norm_squared") - if XX.dtype == np.float32: + f"Incompatible dimensions for X of shape {X.shape} and " + f"X_norm_squared of shape {original_shape}.") + + if Y_norm_squared is not None: + Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False) + original_shape = Y_norm_squared.shape + if Y_norm_squared.shape == (Y.shape[0],): + Y_norm_squared = Y_norm_squared.reshape(1, -1) + if Y_norm_squared.shape == (Y.shape[0], 1): + Y_norm_squared = Y_norm_squared.T + if Y_norm_squared.shape != (1, Y.shape[0]): + raise ValueError( + f"Incompatible dimensions for Y of shape {Y.shape} and " + f"Y_norm_squared of shape {original_shape}.") + + return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared) + + +def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, + squared=False): + """Computational part of euclidean_distances + + Assumes inputs are already checked. + + If norms are passed as float32, they are unused. If arrays are passed as + float32, norms needs to be recomputed on upcast chunks. + TODO: use a float64 accumulator in row_norms to avoid the latter. + """ + if X_norm_squared is not None: + if X_norm_squared.dtype == np.float32: XX = None + else: + XX = X_norm_squared.reshape(-1, 1) elif X.dtype == np.float32: XX = None else: XX = row_norms(X, squared=True)[:, np.newaxis] - if X is Y and XX is not None: - # shortcut in the common case euclidean_distances(X, X) - YY = XX.T - elif Y_norm_squared is not None: - YY = np.atleast_2d(Y_norm_squared) - - if YY.shape != (1, Y.shape[0]): - raise ValueError( - "Incompatible dimensions for Y and Y_norm_squared") - if YY.dtype == np.float32: - YY = None - elif Y.dtype == np.float32: - YY = None + if Y is X: + YY = None if XX is None else XX.T else: - YY = row_norms(Y, squared=True)[np.newaxis, :] + if Y_norm_squared is not None: + if Y_norm_squared.dtype == np.float32: + YY = None + else: + YY = Y_norm_squared.reshape(1, -1) + elif Y.dtype == np.float32: + YY = None + else: + YY = row_norms(Y, squared=True)[np.newaxis, :] if X.dtype == np.float32: # To minimize precision issues with float32, we compute the distance diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 92dea4e791dfe..58bb64c5e8fa7 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -660,6 +660,34 @@ def test_euclidean_distances_with_norms(dtype, y_array_constr): assert_allclose(wrong_D, D1) +def test_euclidean_distances_norm_shapes(): + # Check all accepted shapes for the norms or appropriate error messages. + rng = np.random.RandomState(0) + X = rng.random_sample((10, 10)) + Y = rng.random_sample((20, 10)) + + X_norm_squared = (X ** 2).sum(axis=1) + Y_norm_squared = (Y ** 2).sum(axis=1) + + D1 = euclidean_distances(X, Y, + X_norm_squared=X_norm_squared, + Y_norm_squared=Y_norm_squared) + D2 = euclidean_distances(X, Y, + X_norm_squared=X_norm_squared.reshape(-1, 1), + Y_norm_squared=Y_norm_squared.reshape(-1, 1)) + D3 = euclidean_distances(X, Y, + X_norm_squared=X_norm_squared.reshape(1, -1), + Y_norm_squared=Y_norm_squared.reshape(1, -1)) + + assert_allclose(D2, D1) + assert_allclose(D3, D1) + + with pytest.raises(ValueError, match="Incompatible dimensions for X"): + euclidean_distances(X, Y, X_norm_squared=X_norm_squared[:5]) + with pytest.raises(ValueError, match="Incompatible dimensions for Y"): + euclidean_distances(X, Y, Y_norm_squared=Y_norm_squared[:5]) + + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"])