10000 ENH Avoid repeated input checks in kmeans++ by jeremiedbb · Pull Request #19002 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH Avoid repeated input checks in kmeans++ #19002

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits on Dec 20, 2020
6 changes: 6 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,13 @@ Changelog
:pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
where 123456 is the *pull request* number, not the issue number.

:mod:`sklearn.cluster`
......................

- |Efficiency| The "k-means++" initialization of :class:`cluster.KMeans` and
:class:`cluster.MiniBatchKMeans` is now faster, especially in multicore
settings. :pr:`19002` by :user:`Jon Crall <Erotemic>` and
:user:`Jérémie du Boisberranger <jeremiedbb>`.

Code and Documentation Contributors
-----------------------------------
Expand Down
5 changes: 3 additions & 2 deletions sklearn/cluster/_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from ..base import BaseEstimator, ClusterMixin, TransformerMixin
from ..metrics.pairwise import euclidean_distances
from ..metrics.pairwise import _euclidean_distances
from ..utils.extmath import row_norms, stable_cumsum
from ..utils.sparsefuncs_fast import assign_rows_csr
from ..utils.sparsefuncs import mean_variance_axis
Expand Down Expand Up @@ -103,7 +104,7 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms,
indices[0] = center_id

# Initialize list of closest distances and calculate current potential
closest_dist_sq = euclidean_distances(
closest_dist_sq = _euclidean_distances(
centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms,
squared=True)
current_pot = closest_dist_sq.sum()
Expand All @@ -120,7 +121,7 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms,
out=candidate_ids)

# Compute distances to center candidates
distance_to_candidates = euclidean_distances(
distance_to_candidates = _euclidean_distances(
X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)

# update closest distances squared and potential for each candidate
Expand Down
79 changes: 54 additions & 25 deletions sklearn/metrics/pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,15 +230,17 @@ def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False,
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
default=None

Y_norm_squared : array-like of shape (n_samples_Y,), default=None
Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) \
or (1, n_samples_Y), default=None
Pre-computed dot-products of vectors in Y (e.g.,
``(Y**2).sum(axis=1)``)
May be ignored in some cases, see the note below.

squared : bool, default=False
Return squared Euclidean distances.

X_norm_squared : array-like of shape (n_samples,), default=None
X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) \
or (1, n_samples_X), default=None
Pre-computed dot-products of vectors in X (e.g.,
``(X**2).sum(axis=1)``)
May be ignored in some cases, see the note below.
Expand Down Expand Up @@ -271,38 +273,65 @@ def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False,
"""
X, Y = check_pairwise_arrays(X, Y)

# If norms are passed as float32, they are unused. If arrays are passed as
# float32, norms needs to be recomputed on upcast chunks.
# TODO: use a float64 accumulator in row_norms to avoid the latter.
if X_norm_squared is not None:
XX = check_array(X_norm_squared)
if XX.shape == (1, X.shape[0]):
XX = XX.T
elif XX.shape != (X.shape[0], 1):
X_norm_squared = check_array(X_norm_squared, ensure_2d=False)
original_shape = X_norm_squared.shape
if X_norm_squared.shape == (X.shape[0],):
X_norm_squared = X_norm_squared.reshape(-1, 1)
if X_norm_squared.shape == (1, X.shape[0]):
X_norm_squared = X_norm_squared.T
if X_norm_squared.shape != (X.shape[0], 1):
raise ValueError(
"Incompatible dimensions for X and X_norm_squared")
if XX.dtype == np.float32:
f"Incompatible dimensions for X of shape {X.shape} and "
f"X_norm_squared of shape {original_shape}.")

if Y_norm_squared is not None:
Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False)
original_shape = Y_norm_squared.shape
if Y_norm_squared.shape == (Y.shape[0],):
Y_norm_squared = Y_norm_squared.reshape(1, -1)
if Y_norm_squared.shape == (Y.shape[0], 1):
Y_norm_squared = Y_norm_squared.T
if Y_norm_squared.shape != (1, Y.shape[0]):
raise ValueError(
f"Incompatible dimensions for Y of shape {Y.shape} and "
f"Y_norm_squared of shape {original_shape}.")

return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared)


def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None,
squared=False):
"""Computational part of euclidean_distances

Assumes inputs are already checked.

If norms are passed as float32, they are unused. If arrays are passed as
float32, norms needs to be recomputed on upcast chunks.
TODO: use a float64 accumulator in row_norms to avoid the latter.
"""
if X_norm_squared is not None:
if X_norm_squared.dtype == np.float32:
XX = None
else:
XX = X_norm_squared.reshape(-1, 1)
elif X.dtype == np.float32:
XX = None
else:
XX = row_norms(X, squared=True)[:, np.newaxis]

if X is Y and XX is not None:
# shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is not None:
YY = np.atleast_2d(Y_norm_squared)

if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
if YY.dtype == np.float32:
YY = None
elif Y.dtype == np.float32:
YY = None
if Y is X:
YY = None if XX is None else XX.T
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]
if Y_norm_squared is not None:
if Y_norm_squared.dtype == np.float32:
YY = None
else:
YY = Y_norm_squared.reshape(1, -1)
elif Y.dtype == np.float32:
YY = None
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]

if X.dtype == np.float32:
# To minimize precision issues with float32, we compute the distance
Expand Down
28 changes: 28 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,34 @@ def test_euclidean_distances_with_norms(dtype, y_array_constr):
assert_allclose(wrong_D, D1)


def test_euclidean_distances_norm_shapes():
    """All accepted norm shapes give identical distances; bad shapes raise.

    ``euclidean_distances`` accepts precomputed squared norms as 1d arrays,
    column vectors, or row vectors; any other shape must raise a ValueError
    naming the offending argument.
    """
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 10))
    Y = rng.random_sample((20, 10))

    X_norm_squared = (X ** 2).sum(axis=1)
    Y_norm_squared = (Y ** 2).sum(axis=1)

    # Reference result computed with the plain 1d norm shapes.
    reference = euclidean_distances(X, Y,
                                    X_norm_squared=X_norm_squared,
                                    Y_norm_squared=Y_norm_squared)

    # Column-vector and row-vector variants must match the reference.
    for new_shape in ((-1, 1), (1, -1)):
        distances = euclidean_distances(
            X, Y,
            X_norm_squared=X_norm_squared.reshape(new_shape),
            Y_norm_squared=Y_norm_squared.reshape(new_shape))
        assert_allclose(distances, reference)

    # Norms whose length does not match the data must be rejected.
    with pytest.raises(ValueError, match="Incompatible dimensions for X"):
        euclidean_distances(X, Y, X_norm_squared=X_norm_squared[:5])
    with pytest.raises(ValueError, match="Incompatible dimensions for Y"):
        euclidean_distances(X, Y, Y_norm_squared=Y_norm_squared[:5])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("x_array_constr", [np.array, csr_matrix],
ids=["dense", "sparse"])
Expand Down
0