10000 ENH Avoid repeated input checks in kmeans++ by jeremiedbb · Pull Request #19002 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH Avoid repeated input checks in kmeans++ #19002

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits on Dec 20, 2020
6 changes: 6 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,13 @@ Changelog
:pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
where 123456 is the *pull request* number, not the issue number.

:mod:`sklearn.cluster`
......................

- |Efficiency| The "k-means++" initialization of :class:`cluster.KMeans` and
:class:`cluster.MiniBatchKMeans` is now faster, especially in multicore
settings. :pr:`19002` by :user:`Jon Crall <Erotemic>` and
:user:`Jérémie du Boisberranger <jeremiedbb>`.

Code and Documentation Contributors
-----------------------------------
Expand Down
5 changes: 3 additions & 2 deletions sklearn/cluster/_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from ..base import BaseEstimator, ClusterMixin, TransformerMixin
from ..metrics.pairwise import euclidean_distances
from ..metrics.pairwise import _euclidean_distances
from ..utils.extmath import row_norms, stable_cumsum
from ..utils.sparsefuncs_fast import assign_rows_csr
from ..utils.sparsefuncs import mean_variance_axis
Expand Down Expand Up @@ -103,7 +104,7 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms,
indices[0] = center_id

# Initialize list of closest distances and calculate current potential
closest_dist_sq = euclidean_distances(
closest_dist_sq = _euclidean_distances(
centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms,
squared=True)
current_pot = closest_dist_sq.sum()
Expand All @@ -120,7 +121,7 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms,
out=candidate_ids)

# Compute distances to center candidates
distance_to_candidates = euclidean_distances(
distance_to_candidates = _euclidean_distances(
X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)

# update closest distances squared and potential for each candidate
Expand Down
79 changes: 54 additions & 25 deletions sklearn/metrics/pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,15 +230,17 @@ def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False,
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
default=None

Y_norm_squared : array-like of shape (n_samples_Y,), default=None
Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) \
or (1, n_samples_Y), default=None
Pre-computed dot-products of vectors in Y (e.g.,
``(Y**2).sum(axis=1)``)
May be ignored in some cases, see the note below.

squared : bool, default=False
Return squared Euclidean distances.

X_norm_squared : array-like of shape (n_samples,), default=None
X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) \
or (1, n_samples_X), default=None
Pre-computed dot-products of vectors in X (e.g.,
``(X**2).sum(axis=1)``)
May be ignored in some cases, see the note below.
Expand Down Expand Up @@ -271,38 +273,65 @@ def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False,
"""
X, Y = check_pairwise_arrays(X, Y)

# If norms are passed as float32, they are unused. If arrays are passed as
# float32, norms needs to be recomputed on upcast chunks.
# TODO: use a float64 accumulator in row_norms to avoid the latter.
if X_norm_squared is not None:
XX = check_array(X_norm_squared)
if XX.shape == (1, X.shape[0]):
XX = XX.T
elif XX.shape != (X.shape[0], 1):
X_norm_squared = check_array(X_norm_squared, ensure_2d=False)
original_shape = X_norm_squared.shape
if X_norm_squared.shape == (X.shape[0],):
X_norm_squared = X_norm_squared.reshape(-1, 1)
if X_norm_squared.shape == (1, X.shape[0]):
X_norm_squared = X_norm_squared.T
if X_norm_squared.shape != (X.shape[0], 1):
raise ValueError(
"Incompatible dimensions for X and X_norm_squared")
if XX.dtype == np.float32:
f"Incompatible dimensions for X of shape {X.shape} and "
f"X_norm_squared of shape {original_shape}.")

if Y_norm_squared is not None:
Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False)
original_shape = Y_norm_squared.shape
if Y_norm_squared.shape == (Y.shape[0],):
Y_norm_squared = Y_norm_squared.reshape(1, -1)
if Y_norm_squared.shape == (Y.shape[0], 1):
Y_norm_squared = Y_norm_squared.T
if Y_norm_squared.shape != (1, Y.shape[0]):
raise ValueError(
f"Incompatible dimensions for Y of shape {Y.shape} and "
f"Y_norm_squared of shape {original_shape}.")

return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared)


def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None,
squared=False):
"""Computational part of euclidean_distances

Assumes inputs are already checked.

If norms are passed as float32, they are unused. If arrays are passed as
float32, norms needs to be recomputed on upcast chunks.
TODO: use a float64 accumulator in row_norms to avoid the latter.
"""
if X_norm_squared is not None:
if X_norm_squared.dtype == np.float32:
XX = None
else:
XX = X_norm_squared.reshape(-1, 1)
elif X.dtype == np.float32:
XX = None
else:
XX = row_norms(X, squared=True)[:, np.newaxis]

if X is Y and XX is not None:
# shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is not None:
YY = np.atleast_2d(Y_norm_squared)

if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
if YY.dtype == np.float32:
YY = None
elif Y.dtype == np.float32:
YY = None
if Y is X:
YY = None if XX is None else XX.T
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]
if Y_norm_squared is not None:
if Y_norm_squared.dtype == np.float32:
YY = None
else:
YY = Y_norm_squared.reshape(1, -1)
elif Y.dtype == np.float32:
YY = None
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]

if X.dtype == np.float32:
# To minimize precision issues with float32, we compute the distance
Expand Down
28 changes: 28 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,34 @@ def test_euclidean_distances_with_norms(dtype, y_array_constr):
assert_allclose(wrong_D, D1)


def test_euclidean_distances_norm_shapes():
    """All accepted norm shapes give identical distances; bad shapes raise.

    ``euclidean_distances`` accepts precomputed squared norms as 1d arrays,
    column vectors, or row vectors; any other shape must raise a ValueError
    naming the offending argument.
    """
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 10))
    Y = rng.random_sample((20, 10))

    X_norm_squared = (X ** 2).sum(axis=1)
    Y_norm_squared = (Y ** 2).sum(axis=1)

    # Reference result computed with the plain 1d norm shapes.
    reference = euclidean_distances(X, Y,
                                    X_norm_squared=X_norm_squared,
                                    Y_norm_squared=Y_norm_squared)

    # Column-vector and row-vector variants must match the reference.
    for new_shape in ((-1, 1), (1, -1)):
        distances = euclidean_distances(
            X, Y,
            X_norm_squared=X_norm_squared.reshape(new_shape),
            Y_norm_squared=Y_norm_squared.reshape(new_shape))
        assert_allclose(distances, reference)

    # Norms whose length does not match the data must be rejected.
    with pytest.raises(ValueError, match="Incompatible dimensions for X"):
        euclidean_distances(X, Y, X_norm_squared=X_norm_squared[:5])
    with pytest.raises(ValueError, match="Incompatible dimensions for Y"):
        euclidean_distances(X, Y, Y_norm_squared=Y_norm_squared[:5])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("x_array_constr", [np.array, csr_matrix],
ids=["dense", "sparse"])
Expand Down
0