From e19921450fe0fe0c77dcc9baf9b8e9fc8923e9e6 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 24 Feb 2020 18:15:16 +0100 Subject: [PATCH 01/72] refactoring --- sklearn/cluster/_kmeans.py | 595 ++++++++++++-------------- sklearn/cluster/tests/test_k_means.py | 6 +- 2 files changed, 287 insertions(+), 314 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7e4df5908137b..c36acf122445e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -22,7 +22,6 @@ from ..utils.extmath import row_norms, stable_cumsum from ..utils.sparsefuncs_fast import assign_rows_csr from ..utils.sparsefuncs import mean_variance_axis -from ..utils.validation import _num_samples from ..utils import check_array from ..utils import gen_batches from ..utils import check_random_state @@ -43,8 +42,8 @@ ############################################################################### # Initialization heuristic - -def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): +def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, + n_local_trials=None): """Init n_clusters seeds according to k-means++ Parameters @@ -83,8 +82,6 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): centers = np.empty((n_clusters, n_features), dtype=X.dtype) - assert x_squared_norms is not None, 'x_squared_norms None in _k_init' - # Set the number of local seeding trials if none is given if n_local_trials is None: # This is what Arthur/Vassilvitskii tried, but did not report @@ -143,30 +140,6 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): ############################################################################### # K-means batch estimation by EM (expectation maximization) -def _validate_center_shape(X, n_centers, centers): - """Check if centers is compatible with X and n_centers""" - if len(centers) != n_centers: - raise ValueError('The shape of the initial centers (%s) ' - 'does not match the number of clusters %i' - % (centers.shape, n_centers)) - if centers.shape[1] != X.shape[1]: - raise ValueError( - "The number of features of the initial centers %s " - "does not match the number of features of the data %s." - % (centers.shape[1], X.shape[1])) - - -def _tolerance(X, tol): - """Return a tolerance which is independent of the dataset""" - if tol == 0: - return 0 - if sp.issparse(X): - variances = mean_variance_axis(X, axis=0)[1] - else: - variances = np.var(X, axis=0) - return np.mean(variances) * tol - - def _check_normalize_sample_weight(sample_weight, X): """Set sample_weight if None, and check for correct dtype""" @@ -322,8 +295,8 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++', return est.cluster_centers_, est.labels_, est.inertia_ -def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, - init='k-means++', verbose=False, x_squared_norms=None, +def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, + verbose=False, x_squared_norms=None, random_state=None, tol=1e-4, n_threads=1): """A single run of k-means lloyd, assumes preparation completed prior. @@ -335,29 +308,12 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, sample_weight : array-like of shape (n_samples,) The weights for each observation in X. - n_clusters : int - The number of clusters to form as well as the number of - centroids to generate. + centers_init : ndarray of shape (n_clusters, n_features) + The initial centers. 
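For readers skimming the diff, the D^2 sampling that `_kmeans_plusplus` (renamed from `_k_init` above) implements can be summarised with a short NumPy sketch. This is only an illustrative reference, not the patched implementation: the function name is invented for the example, and it omits the `n_local_trials` greedy candidates and the precomputed `x_squared_norms` that the real helper uses.

import numpy as np

def kmeans_plusplus_sketch(X, n_clusters, rng):
    # First center: uniform draw from the data.
    n_samples, n_features = X.shape
    centers = np.empty((n_clusters, n_features), dtype=X.dtype)
    centers[0] = X[rng.randint(n_samples)]
    # Squared distance of every sample to its closest chosen center so far.
    closest_dist_sq = ((X - centers[0]) ** 2).sum(axis=1)
    for c in range(1, n_clusters):
        # Draw the next center with probability proportional to D(x)^2.
        probs = closest_dist_sq / closest_dist_sq.sum()
        candidate = rng.choice(n_samples, p=probs)
        centers[c] = X[candidate]
        closest_dist_sq = np.minimum(
            closest_dist_sq, ((X - centers[c]) ** 2).sum(axis=1))
    return centers

centers = kmeans_plusplus_sketch(np.random.RandomState(0).rand(100, 2),
                                 n_clusters=3,
                                 rng=np.random.RandomState(42))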
max_iter : int, default=300 Maximum number of iterations of the k-means algorithm to run. - init : {'k-means++', 'random', ndarray, callable}, default='k-means++' - Method for initialization: - - 'k-means++' : selects initial cluster centers for k-mean - clustering in a smart way to speed up convergence. See section - Notes in k_init for more details. - - 'random': choose `n_clusters` observations (rows) at random from data - for the initial centroids. - - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - - If a callable is passed, it should take arguments X, n_clusters and a - random state and return an initialization. - verbose : bool, default=False Verbosity mode @@ -398,17 +354,12 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, Number of iterations run. """ random_state = check_random_state(random_state) - sample_weight = _check_normalize_sample_weight(sample_weight, X) - - # init - centers = _init_centroids(X, n_clusters, init, random_state=random_state, - x_squared_norms=x_squared_norms) - - if verbose: - print('Initialization complete') n_samples = X.shape[0] + n_clusters = centers_init.shape[0] + # Buffers to avoid new allocations at each iteration. + centers = centers_init centers_new = np.zeros_like(centers) weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) labels = np.full(n_samples, -1, dtype=np.int32) @@ -444,18 +395,17 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, if verbose: inertia = _inertia(X, sample_weight, centers, labels) - print("Iteration {0}, inertia {1}" .format(i, inertia)) + print(f"Iteration {i}, inertia {inertia}") + + centers, centers_new = centers_new, centers center_shift_tot = (center_shift**2).sum() if center_shift_tot <= tol: if verbose: - print("Converged at iteration {0}: " - "center shift {1} within tolerance {2}" - .format(i, center_shift_tot, tol)) + print(f"Converged at iteration {i}: center shift " + f"{center_shift_tot} within tolerance {tol}.") break - centers, centers_new = centers_new, centers - if center_shift_tot > 0: # rerun E-step so that predicted labels match cluster centers elkan_iter(X, sample_weight, centers, centers, weight_in_clusters, @@ -468,8 +418,8 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, return labels, inertia, centers, i + 1 -def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, - init='k-means++', verbose=False, x_squared_norms=None, +def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, + verbose=False, x_squared_norms=None, random_state=None, tol=1e-4, n_threads=1): """A single run of k-means lloyd, assumes preparation completed prior. @@ -481,29 +431,12 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. - n_clusters : int - The number of clusters to form as well as the number of - centroids to generate. + centers_init : ndarray of shape (n_clusters, n_features) + The initial centers. max_iter : int, default=300 Maximum number of iterations of the k-means algorithm to run. - init : {'k-means++', 'random', ndarray, callable}, default='k-means++' - Method for initialization: - - 'k-means++' : selects initial cluster centers for k-mean - clustering in a smart way to speed up convergence. See section - Notes in k_init for more details. - - 'random': choose `n_clusters` observations (rows) at random from data - for the initial centroids. 
- - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - - If a callable is passed, it should take arguments X, n_clusters and a - random state and return an initialization. - verbose : bool, default=False Verbosity mode @@ -544,15 +477,11 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, Number of iterations run. """ random_state = check_random_state(random_state) - sample_weight = _check_normalize_sample_weight(sample_weight, X) - # init - centers = _init_centroids(X, n_clusters, init, random_state=random_state, - x_squared_norms=x_squared_norms) - - if verbose: - print("Initialization complete") + n_clusters = centers_init.shape[0] + # Buffers to avoid new allocations at each iteration. + centers = centers_init centers_new = np.zeros_like(centers) labels = np.full(X.shape[0], -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) @@ -571,18 +500,17 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, if verbose: inertia = _inertia(X, sample_weight, centers, labels) - print("Iteration {0}, inertia {1}" .format(i, inertia)) + print(f"Iteration {i}, inertia {inertia}.") + + centers, centers_new = centers_new, centers center_shift_tot = (center_shift**2).sum() if center_shift_tot <= tol: if verbose: - print("Converged at iteration {0}: " - "center shift {1} within tolerance {2}" - .format(i, center_shift_tot, tol)) + print(f"Converged at iteration {i}: center shift " + f"{center_shift_tot} within tolerance {tol}.") break - centers, centers_new = centers_new, centers - if center_shift_tot > 0: # rerun E-step so that predicted labels match cluster centers lloyd_iter(X, sample_weight, x_squared_norms, centers, centers, @@ -594,28 +522,29 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, return labels, inertia, centers, i + 1 -def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): +def _labels_inertia(X, sample_weight, x_squared_norms, centers, + n_threads=None): """E step of the K-means EM algorithm. Compute the labels and the inertia of the given samples and centers. Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples to assign to the labels. If sparse matrix, must be in - CSR format. + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples to assign to the labels. If sparse matrix, must + be in CSR format. - sample_weight : array-like of shape (n_samples,) + sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. x_squared_norms : ndarray of shape (n_samples,) Precomputed squared euclidean norm of each data point, to speed up computations. - centers : ndarray, shape (n_clusters, n_features) + centers : ndarray of shape (n_clusters, n_features) The cluster centers. - n_threads : int, default=1 + n_threads : int, default=None The number of OpenMP threads to use for the computation. Parallelism is sample-wise on the main cython loop which assigns each sample to its closest center. @@ -626,12 +555,13 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): The resulting assignment inertia : float - Sum of squared distances of samples to their closest cluster center. 
+ Sum of squared distances of samples to their closest cluster center """ n_samples = X.shape[0] n_clusters = centers.shape[0] - sample_weight = _check_normalize_sample_weight(sample_weight, X) + n_threads = _openmp_effective_n_threads(n_threads) + labels = np.full(n_samples, -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) center_shift = np.zeros_like(weight_in_clusters) @@ -652,88 +582,6 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): return labels, inertia -def _init_centroids(X, n_clusters=8, init="k-means++", random_state=None, - x_squared_norms=None, init_size=None): - """Compute the initial centroids - - Parameters - ---------- - - X : {ndarray, spare matrix} of shape (n_samples, n_features) - The input samples. - - n_clusters : int, default=8 - number of centroids. - - init : {'k-means++', 'random', ndarray, callable}, default="k-means++" - Method for initialization. - - random_state : int, RandomState instance, default=None - Determines random number generation for centroid initialization. Use - an int to make the randomness deterministic. - See :term:`Glossary `. - - x_squared_norms : ndarray of shape (n_samples,), default=None - Squared euclidean norm of each data point. Pass it if you have it at - hands already to avoid it being recomputed here. Default: None - - init_size : int, default=None - Number of samples to randomly sample for speeding up the - initialization (sometimes at the expense of accuracy): the - only algorithm is initialized by running a batch KMeans on a - random subset of the data. This needs to be larger than k. - - Returns - ------- - centers : array of shape(k, n_features) - """ - random_state = check_random_state(random_state) - n_samples = X.shape[0] - - if x_squared_norms is None: - x_squared_norms = row_norms(X, squared=True) - - if init_size is not None and init_size < n_samples: - if init_size < n_clusters: - warnings.warn( - "init_size=%d should be larger than k=%d. " - "Setting it to 3*k" % (init_size, n_clusters), - RuntimeWarning, stacklevel=2) - init_size = 3 * n_clusters - init_indices = random_state.randint(0, n_samples, init_size) - X = X[init_indices] - x_squared_norms = x_squared_norms[init_indices] - n_samples = X.shape[0] - elif n_samples < n_clusters: - raise ValueError( - "n_samples={} should be larger than n_clusters={}" - .format(n_samples, n_clusters)) - - if isinstance(init, str) and init == 'k-means++': - centers = _k_init(X, n_clusters, random_state=random_state, - x_squared_norms=x_squared_norms) - elif isinstance(init, str) and init == 'random': - seeds = random_state.permutation(n_samples)[:n_clusters] - centers = X[seeds] - elif hasattr(init, '__array__'): - # ensure that the centers have the same dtype as X - # this is a requirement of fused types of cython - centers = np.array(init, dtype=X.dtype) - elif callable(init): - centers = init(X, n_clusters, random_state=random_state) - centers = np.asarray(centers, dtype=X.dtype) - else: - raise ValueError("the init parameter for the k-means should " - "be 'k-means++' or 'random' or an ndarray, " - "'%s' (type '%s') was passed." % (init, type(init))) - - if sp.issparse(centers): - centers = centers.toarray() - - _validate_center_shape(X, n_clusters, centers) - return centers - - class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): """K-Means clustering. 
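The refactored `_labels_inertia` above dispatches to the Cython iteration helpers with `update_centers=False`. As a plain-NumPy reference of the quantity it computes (dense input only, function name invented for the example), assuming `sample_weight` has already been rescaled to sum to `n_samples` as `_check_normalize_sample_weight` does:

import numpy as np

def labels_inertia_sketch(X, sample_weight, centers):
    # Squared Euclidean distances from every sample to every center.
    d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    labels = d2.argmin(axis=1).astype(np.int32)
    # Inertia: weighted sum of squared distances to the closest center.
    inertia = (sample_weight * d2[np.arange(X.shape[0]), labels]).sum()
    return labels, inertia

# Same toy data as the rewritten tests further down in the diff.
X = np.array([[0., 0.], [0.5, 0.], [0.5, 1.], [1., 1.]])
weights = np.array([3., 1., 1., 3.])
weights *= len(weights) / weights.sum()      # -> [1.5, 0.5, 0.5, 1.5]
print(labels_inertia_sketch(X, weights, np.array([[0., 0.], [1., 1.]])))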
@@ -913,18 +761,162 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10, self.n_jobs = n_jobs self.algorithm = algorithm + def _check_params(self, X): + if self.precompute_distances != 'deprecated': + warnings.warn("'precompute_distances' was deprecated in version " + "0.23 and will be removed in 0.25. It has no " + "effect", FutureWarning) + + if self.n_jobs != 'deprecated': + warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" + " removed in 0.25.", FutureWarning) + self._n_threads = self.n_jobs + else: + self._n_threads = None + self._n_threads = _openmp_effective_n_threads(self._n_threads) + + if self.n_init <= 0: + raise ValueError(f"Invalid number of initializations. n_init=" + f"{self.n_init} must be bigger than zero.") + self._n_init = self.n_init + + if self.max_iter <= 0: + raise ValueError(f"Number of iterations should be a positive " + f"number, got {self.max_iter} instead.") + + if X.shape[0] < self.n_clusters: + raise ValueError(f"n_samples={X.shape[0]} should be >= " + f"n_clusters={self.n_clusters}.") + + if self.tol < 0: + raise ValueError(f"tol={self.tol} should be >= 0.") + self._tol = self._normalize_tolerance(X, self.tol) + + if self.algorithm not in ("auto", "full", "elkan"): + raise ValueError(f"Algorithm must be 'auto', 'full' or 'elkan', " + f"got {self.algorithm}.") + + self._algorithm = self.algorithm + if self._algorithm == "elkan" and self.n_clusters == 1: + warnings.warn("algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'full' instead.", RuntimeWarning) + self._algorithm = "full" + if self._algorithm == "auto": + self._algorithm = "full" if self.n_clusters == 1 else "elkan" + + if hasattr(self.init, '__array__'): + self._validate_center_shape(X, self.init) + if self._n_init != 1: + warnings.warn( + f"Explicit initial center position passed: performing only" + f"one init in {self.__class__.__name__} instead of " + f"n_init={self._n_init}.", RuntimeWarning, stacklevel=2) + self._n_init = 1 + + def _validate_center_shape(self, X, centers): + """Check if centers is compatible with X and n_clusters""" + if centers.shape[0] != self.n_clusters: + raise ValueError( + f"The shape of the initial centers {centers.shape} does not " + f"match the number of clusters {self.n_clusters}.") + if centers.shape[1] != X.shape[1]: + raise ValueError( + f"The shape of the initial centers {centers.shape} does not " + f"match the number of features of the data {X.shape[1]}.") + def _check_test_data(self, X): X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', accept_large_sparse=False) n_samples, n_features = X.shape expected_n_features = self.cluster_centers_.shape[1] if not n_features == expected_n_features: - raise ValueError("Incorrect number of features. " - "Got %d features, expected %d" % ( - n_features, expected_n_features)) + raise ValueError( + f"Incorrect number of features. Got {n_features} features, " + f"expected {expected_n_features}.") return X + def _normalize_tolerance(self, X, tol): + """Return a tolerance which is independent of the dataset""" + if tol == 0: + return 0 + if sp.issparse(X): + variances = mean_variance_axis(X, axis=0)[1] + else: + variances = np.var(X, axis=0) + return np.mean(variances) * tol + + def _init_centroids(self, X, x_squared_norms, init, random_state, + init_size=None): + """Compute the initial centroids + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples. 
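The `_normalize_tolerance` method above (the old module-level `_tolerance`) turns the user-facing relative `tol` into an absolute threshold on the squared center shift by scaling it with the mean per-feature variance, so convergence does not depend on the scale of the data. A tiny worked example with purely illustrative numbers:

import numpy as np

X = np.array([[0., 0.], [2., 0.], [0., 4.], [2., 4.]])
tol = 1e-4
variances = np.var(X, axis=0)              # [1., 4.]
effective_tol = np.mean(variances) * tol   # 2.5 * 1e-4 = 2.5e-4

Multiplying X by 10 multiplies both the squared center shifts and `effective_tol` by 100, so the stopping decision is unchanged.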
+ + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. Pass it if you have it + at hands already to avoid it being recomputed here. + + init : {'k-means++', 'random', ndarray, callable} + Method for initialization. + + random_state : RandomState instance + Determines random number generation for centroid initialization. + See :term:`Glossary `. + + init_size : int, default=None + Number of samples to randomly sample for speeding up the + initialization (sometimes at the expense of accuracy): the only + algorithm is initialized by running a batch KMeans on a random + subset of the data. This needs to be larger than k. + TODO: Reword because does not mean anything + + Returns + ------- + centers : ndarray of shape(n_clusters, n_features) + """ + n_samples = X.shape[0] + n_clusters = self.n_clusters + + if init_size is not None and init_size < n_samples: + if init_size < n_clusters: + warnings.warn( + f"init_size={init_size} should be larger than " + f"n_clusters={n_clusters}. Setting it to 3*n_clusters", + RuntimeWarning, stacklevel=2) + init_size = 3 * n_clusters + init_indices = random_state.randint(0, n_samples, init_size) + X = X[init_indices] + x_squared_norms = x_squared_norms[init_indices] + n_samples = X.shape[0] + + if isinstance(init, str) and init == 'k-means++': + centers = _kmeans_plusplus(X, n_clusters, + random_state=random_state, + x_squared_norms=x_squared_norms) + elif isinstance(init, str) and init == 'random': + seeds = random_state.permutation(n_samples)[:n_clusters] + centers = X[seeds] + elif hasattr(init, '__array__'): + centers = init + elif callable(init): + centers = init(X, n_clusters, random_state=random_state) + centers = check_array( + centers, dtype=X.dtype, copy=False, order='C') + self._validate_center_shape(X, centers) + else: + raise ValueError( + f"the init parameter for {self.__class__.__name__} should be " + f"'k-means++', 'random', a ndarray or a callable. '{init}'" + f" (type '{type(self.init)}') was passed.") + + if sp.issparse(centers): + centers = centers.toarray() + + return centers + def fit(self, X, y=None, sample_weight=None): """Compute k-means clustering. @@ -949,96 +941,57 @@ def fit(self, X, y=None, sample_weight=None): self Fitted estimator. """ - random_state = check_random_state(self.random_state) - - if self.precompute_distances != 'deprecated': - warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 0.25. It has no " - "effect", FutureWarning) - - if self.n_jobs != 'deprecated': - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 0.25.", FutureWarning) - self._n_threads = self.n_jobs - else: - self._n_threads = None - self._n_threads = _openmp_effective_n_threads(self._n_threads) - - n_init = self.n_init - if n_init <= 0: - raise ValueError("Invalid number of initializations." - " n_init=%d must be bigger than zero." 
% n_init) - - if self.max_iter <= 0: - raise ValueError( - 'Number of iterations should be a positive number,' - ' got %d instead' % self.max_iter - ) - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', copy=self.copy_x, accept_large_sparse=False) - # verify that the number of samples given is larger than k - if _num_samples(X) < self.n_clusters: - raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( - _num_samples(X), self.n_clusters)) - tol = _tolerance(X, self.tol) + sample_weight = _check_normalize_sample_weight(sample_weight, X) + + random_state = check_random_state(self.random_state) # Validate init array init = self.init if hasattr(init, '__array__'): - init = check_array(init, dtype=X.dtype.type, copy=True, order='C') - _validate_center_shape(X, self.n_clusters, init) + init = check_array(init, dtype=X.dtype, copy=True, order='C') - if n_init != 1: - warnings.warn( - 'Explicit initial center position passed: ' - 'performing only one init in k-means instead of n_init=%d' - % n_init, RuntimeWarning, stacklevel=2) - n_init = 1 + self._check_params(X) - # subtract of mean of x for more accurate distance computations + # subtract mean of X for more accurate distance computations if not sp.issparse(X): X_mean = X.mean(axis=0) # The copy was already done above X -= X_mean - if hasattr(init, '__array__'): + if hasattr(self.init, '__array__'): init -= X_mean # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) - best_labels, best_inertia, best_centers = None, None, None - - algorithm = self.algorithm - if algorithm == "elkan" and self.n_clusters == 1: - warnings.warn("algorithm='elkan' doesn't make sense for a single " - "cluster. Using 'full' instead.", RuntimeWarning) - algorithm = "full" - - if algorithm == "auto": - algorithm = "full" if self.n_clusters == 1 else "elkan" - - if algorithm == "full": + if self._algorithm == "full": kmeans_single = _kmeans_single_lloyd - elif algorithm == "elkan": - kmeans_single = _kmeans_single_elkan else: - raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got" - " {}".format(str(algorithm))) + kmeans_single = _kmeans_single_elkan # seeds for the initializations of the kmeans runs. - seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + seeds = random_state.randint(np.iinfo(np.int32).max, size=self._n_init) + + best_labels, best_inertia, best_centers = None, None, None # limit number of threads in second level of nested parallelism # (i.e. BLAS) to avoid oversubsciption. with threadpool_limits(limits=1, user_api="blas"): for seed in seeds: + # Initialize centers + centers_init = self._init_centroids( + X, x_squared_norms=x_squared_norms, init=init, + random_state=random_state) + if self.verbose: + print("Initialization complete") + # run a k-means once labels, inertia, centers, n_iter_ = kmeans_single( - X, sample_weight, self.n_clusters, max_iter=self.max_iter, - init=init, verbose=self.verbose, tol=tol, + X, sample_weight, centers_init, max_iter=self.max_iter, + verbose=self.verbose, tol=self._tol, x_squared_norms=x_squared_norms, random_state=seed, n_threads=self._n_threads) # determine if these results are the best so far @@ -1114,10 +1067,6 @@ def fit_transform(self, X, y=None, sample_weight=None): X_new : array of shape (n_samples, n_clusters) X transformed in the new space. """ - # Currently, this just skips a copy of the data if it is not in - # np.array or CSR format already. 
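As a public-API illustration of the `n_init` loop just above (each seed gives one full run and only the lowest-inertia solution is kept), the following hypothetical snippet compares one init against ten inits on the same data; with a fixed `random_state` the ten-init inertia can never be larger.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(loc=c, scale=0.5, size=(30, 2)) for c in (0, 4, 8)])
km_1 = KMeans(n_clusters=3, n_init=1, random_state=0).fit(X)
km_10 = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
print(km_1.inertia_, km_10.inertia_)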
- # XXX This skips _check_test_data, which may change the dtype; - # we should refactor the input validation. return self.fit(X, sample_weight=sample_weight)._transform(X) def transform(self, X): @@ -1171,6 +1120,7 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) + sample_weight = _check_normalize_sample_weight(sample_weight, X) return _labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_, self._n_threads)[0] @@ -1199,9 +1149,10 @@ def score(self, X, y=None, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) + sample_weight = _check_normalize_sample_weight(sample_weight, X) return -_labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_)[1] + self.cluster_centers_, self._n_threads)[1] def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, @@ -1569,6 +1520,31 @@ def __init__(self, n_clusters=8, init='k-means++', max_iter=100, self.init_size = init_size self.reassignment_ratio = reassignment_ratio + def _check_params(self, X): + super()._check_params(X) + + if self.max_no_improvement is not None and self.max_no_improvement < 0: + raise ValueError( + f"max_no_improvement should be >= 0, got " + f"{self.max_no_improvement} instead.") + + if self.batch_size <= 0: + raise ValueError( + f"batch_size should be > 0, got {self.batch_size} instead.") + + if self.init_size is not None and self.init_size <= 0: + raise ValueError( + f"init_size should be > 0, got {self.init_size} instead.") + self._init_size = self.init_size + if self._init_size is None: + self._init_size = 3 * self.batch_size + self._init_size = min(self._init_size, X.shape[0]) + + if self.reassignment_ratio < 0: + raise ValueError( + f"reassignment_ratio should be >= 0, got " + f"{self.reassignment_ratio} instead.") + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. @@ -1590,38 +1566,31 @@ def fit(self, X, y=None, sample_weight=None): ------- self """ - random_state = check_random_state(self.random_state) - X = check_array(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + # TODO accept_large_sparse ??? 
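Since the following hunks rework `MiniBatchKMeans`, it may help to recall what `_mini_batch_step` fundamentally does for dense input. The sketch below is only a plain-NumPy approximation with an invented name; it ignores sparse input, sample-weight normalisation details, and the random reassignment of low-count centers. Each center remains the running weighted mean of every sample ever assigned to it, tracked through per-center weight counts.

import numpy as np

def minibatch_update_sketch(X_batch, w_batch, centers, counts):
    # Assign the batch to the current centers.
    d2 = ((X_batch[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    labels = d2.argmin(axis=1)
    for j in range(centers.shape[0]):
        mask = labels == j
        if not np.any(mask):
            continue
        # Turn the stored mean back into a weighted sum, add the new samples,
        # then renormalise by the updated total weight of the center.
        centers[j] *= counts[j]
        centers[j] += (w_batch[mask, None] * X_batch[mask]).sum(axis=0)
        counts[j] += w_batch[mask].sum()
        centers[j] /= counts[j]
    return labels

centers = np.array([[0., 0.], [1., 1.]])
counts = np.zeros(2)
batch = np.array([[0., 0.], [0.2, 0.], [1., 1.], [0.8, 1.]])
minibatch_update_sketch(batch, np.ones(4), centers, counts)
print(centers)   # [[0.1, 0.], [0.9, 1.]]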
+ X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], + order='C') n_samples, n_features = X.shape - if n_samples < self.n_clusters: - raise ValueError("n_samples=%d should be >= n_clusters=%d" - % (n_samples, self.n_clusters)) sample_weight = _check_normalize_sample_weight(sample_weight, X) - n_init = self.n_init - if hasattr(self.init, '__array__'): - self.init = np.ascontiguousarray(self.init, dtype=X.dtype) - if n_init != 1: - warnings.warn( - 'Explicit initial center position passed: ' - 'performing only one init in MiniBatchKMeans instead of ' - 'n_init=%d' - % self.n_init, RuntimeWarning, stacklevel=2) - n_init = 1 + random_state = check_random_state(self.random_state) - x_squared_norms = row_norms(X, squared=True) + # Validate init array + init = self.init + if hasattr(init, '__array__'): + init = check_array(init, dtype=X.dtype, copy=True, order='C') + + self._check_params(X) - if self.tol > 0.0: - tol = _tolerance(X, self.tol) + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + if self._tol > 0.0: # using tol-based early stopping needs the allocation of a # dedicated before which can be expensive for high dim data: # hence we allocate it outside of the main loop old_center_buffer = np.zeros(n_features, dtype=X.dtype) else: - tol = 0.0 # no need for the center buffer if tol-based early stopping is # disabled old_center_buffer = np.zeros(0, dtype=X.dtype) @@ -1630,24 +1599,18 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - init_size = self.init_size - if init_size is None: - init_size = 3 * self.batch_size - if init_size > n_samples: - init_size = n_samples - self.init_size_ = init_size - - validation_indices = random_state.randint(0, n_samples, init_size) + validation_indices = random_state.randint(0, n_samples, + self._init_size) X_valid = X[validation_indices] sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] # perform several inits with random sub-sets best_inertia = None - for init_idx in range(n_init): + for init_idx in range(self._n_init): if self.verbose: print("Init %d/%d with method: %s" - % (init_idx + 1, n_init, self.init)) + % (init_idx + 1, self._n_init, self.init)) weight_sums = np.zeros(self.n_clusters, dtype=sample_weight.dtype) # TODO: once the `k_means` function works with sparse input we @@ -1655,11 +1618,9 @@ def fit(self, X, y=None, sample_weight=None): # Initialize the centers using only a fraction of the data as we # expect n_samples to be very large when using MiniBatchKMeans - cluster_centers = _init_centroids( - X, self.n_clusters, self.init, - random_state=random_state, - x_squared_norms=x_squared_norms, - init_size=init_size) + cluster_centers = self._init_centroids( + X, x_squared_norms=x_squared_norms, init=self.init, + random_state=random_state, init_size=self._init_size) # Compute the label assignment on the init dataset _mini_batch_step( @@ -1675,7 +1636,7 @@ def fit(self, X, y=None, sample_weight=None): cluster_centers) if self.verbose: print("Inertia for init %d/%d: %f" - % (init_idx + 1, n_init, inertia)) + % (init_idx + 1, self._n_init, inertia)) if best_inertia is None or inertia < best_inertia: self.cluster_centers_ = cluster_centers self.counts_ = weight_sums @@ -1696,7 +1657,7 @@ def fit(self, X, y=None, sample_weight=None): X[minibatch_indices], sample_weight[minibatch_indices], 
x_squared_norms[minibatch_indices], self.cluster_centers_, self.counts_, - old_center_buffer, tol > 0.0, distances=distances, + old_center_buffer, self._tol > 0.0, distances=distances, # Here we randomly choose whether to perform # random reassignment: the choice is done as a function # of the iteration index, and the minimum number of @@ -1710,7 +1671,7 @@ def fit(self, X, y=None, sample_weight=None): # Monitor convergence and do early stopping if necessary if _mini_batch_convergence( - self, iteration_idx, n_iter, tol, n_samples, + self, iteration_idx, n_iter, self._tol, n_samples, centers_squared_diff, batch_inertia, convergence_context, verbose=self.verbose): break @@ -1719,11 +1680,13 @@ def fit(self, X, y=None, sample_weight=None): if self.compute_labels: self.labels_, self.inertia_ = \ - self._labels_inertia_minibatch(X, sample_weight) + self._labels_inertia_minibatch( + X, sample_weight, x_squared_norms, self.cluster_centers_) return self - def _labels_inertia_minibatch(self, X, sample_weight): + def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, + centers): """Compute labels and inertia using mini batches. This is slightly slower than doing everything at once but preventes @@ -1731,15 +1694,22 @@ def _labels_inertia_minibatch(self, X, sample_weight): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Input data. - sample_weight : array-like, shape (n_samples,) + sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. + x_squared_norms : ndarray of shape (n_samples,) + Precomputed squared euclidean norm of each data point, to speed up + computations. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers. + Returns ------- - labels : array, shape (n_samples,) + labels : ndarray, shape (n_samples,) Cluster labels for each point. 
inertia : float @@ -1747,11 +1717,9 @@ def _labels_inertia_minibatch(self, X, sample_weight): """ if self.verbose: print('Computing label assignment and total inertia') - sample_weight = _check_normalize_sample_weight(sample_weight, X) - x_squared_norms = row_norms(X, squared=True) slices = gen_batches(X.shape[0], self.batch_size) results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - self.cluster_centers_) for s in slices] + centers) for s in slices] labels, inertia = zip(*results) return np.hstack(labels), np.sum(inertia) @@ -1788,16 +1756,17 @@ def partial_fit(self, X, y=None, sample_weight=None): sample_weight = _check_normalize_sample_weight(sample_weight, X) x_squared_norms = row_norms(X, squared=True) - self.random_state_ = getattr(self, "random_state_", + self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) if (not hasattr(self, 'counts_') or not hasattr(self, 'cluster_centers_')): # this is the first call partial_fit on this object: # initialize the cluster centers - self.cluster_centers_ = _init_centroids( - X, self.n_clusters, self.init, - random_state=self.random_state_, - x_squared_norms=x_squared_norms, init_size=self.init_size) + self.cluster_centers_ = self._init_centroids( + X, x_squared_norms=x_squared_norms, init=self.init, + random_state=self._random_state, init_size=self.init_size) + # TODO: should be self._init_size + # Should check params before self.counts_ = np.zeros(self.n_clusters, dtype=sample_weight.dtype) @@ -1807,7 +1776,7 @@ def partial_fit(self, X, y=None, sample_weight=None): # The lower the minimum count is, the more we do random # reassignment, however, we don't want to do random # reassignment too often, to allow for building up counts - random_reassign = self.random_state_.randint( + random_reassign = self._random_state.randint( 10 * (1 + self.counts_.min())) == 0 distances = np.zeros(X.shape[0], dtype=X.dtype) @@ -1822,7 +1791,7 @@ def partial_fit(self, X, y=None, sample_weight=None): self.cluster_centers_, self.counts_, np.zeros(0, dtype=X.dtype), 0, random_reassign=random_reassign, distances=distances, - random_state=self.random_state_, + random_state=self._random_state, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) @@ -1856,4 +1825,8 @@ def predict(self, X, sample_weight=None): check_is_fitted(self) X = self._check_test_data(X) - return self._labels_inertia_minibatch(X, sample_weight)[0] + x_squared_norms = row_norms(X, squared=True) + sample_weight = _check_normalize_sample_weight(sample_weight, X) + + return self._labels_inertia_minibatch( + X, sample_weight, x_squared_norms, self.cluster_centers_)[0] diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 2bcbc3faa517f..bf23d669da654 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -209,7 +209,7 @@ def test_labels_assignment_and_inertia(): assert (mindist >= 0.0).all() assert (labels_gold != -1).all() - sample_weight = None + sample_weight = np.ones(X.shape[0], dtype=X.dtype) # perform label assignment using the dense array input x_squared_norms = (X ** 2).sum(axis=1) @@ -599,7 +599,7 @@ def test_minibatch_default_init_size(): mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, batch_size=10, random_state=42, n_init=1).fit(X) - assert mb_k_means.init_size_ == 3 * mb_k_means.batch_size + assert mb_k_means._init_size == 3 * mb_k_means.batch_size _check_fitted_model(mb_k_means) @@ -614,7 +614,7 @@ def 
test_minibatch_set_init_size(): init_size=666, random_state=42, n_init=1).fit(X) assert mb_k_means.init_size == 666 - assert mb_k_means.init_size_ == n_samples + assert mb_k_means._init_size == n_samples _check_fitted_model(mb_k_means) From 7f85bcaab20e8883256cca998c6b1afb544333c9 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 27 Feb 2020 10:25:29 +0100 Subject: [PATCH 02/72] wip --- sklearn/cluster/_kmeans.py | 93 +- sklearn/cluster/tests/test_k_means.py | 1470 +++++++++--------------- sklearn/cluster/tests/test_k_means2.py | 190 +++ 3 files changed, 787 insertions(+), 966 deletions(-) create mode 100644 sklearn/cluster/tests/test_k_means2.py diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index c36acf122445e..ad9e7eab1ea2c 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -776,25 +776,25 @@ def _check_params(self, X): self._n_threads = _openmp_effective_n_threads(self._n_threads) if self.n_init <= 0: - raise ValueError(f"Invalid number of initializations. n_init=" - f"{self.n_init} must be bigger than zero.") + raise ValueError( + f"n_init should be > 0, got {self.n_init} instead.") self._n_init = self.n_init if self.max_iter <= 0: - raise ValueError(f"Number of iterations should be a positive " - f"number, got {self.max_iter} instead.") + raise ValueError( + f"max_iter should be > 0, got {self.max_iter} instead.") if X.shape[0] < self.n_clusters: raise ValueError(f"n_samples={X.shape[0]} should be >= " f"n_clusters={self.n_clusters}.") if self.tol < 0: - raise ValueError(f"tol={self.tol} should be >= 0.") + raise ValueError(f"tol should be >= 0, got {self.tol} instead.") self._tol = self._normalize_tolerance(X, self.tol) if self.algorithm not in ("auto", "full", "elkan"): raise ValueError(f"Algorithm must be 'auto', 'full' or 'elkan', " - f"got {self.algorithm}.") + f"got {self.algorithm} instead.") self._algorithm = self.algorithm if self._algorithm == "elkan" and self.n_clusters == 1: @@ -804,12 +804,19 @@ def _check_params(self, X): if self._algorithm == "auto": self._algorithm = "full" if self.n_clusters == 1 else "elkan" + if not (hasattr(self.init, '__array__') or callable(self.init) + or (isinstance(self.init, str) + and self.init in ["k-means++", "random"])): + raise ValueError( + f"init should be either 'k-means++', 'random', a ndarray or a " + f"callable, got '{self.init}' instead.") + if hasattr(self.init, '__array__'): self._validate_center_shape(X, self.init) if self._n_init != 1: warnings.warn( f"Explicit initial center position passed: performing only" - f"one init in {self.__class__.__name__} instead of " + f" one init in {self.__class__.__name__} instead of " f"n_init={self._n_init}.", RuntimeWarning, stacklevel=2) self._n_init = 1 @@ -881,12 +888,6 @@ def _init_centroids(self, X, x_squared_norms, init, random_state, n_clusters = self.n_clusters if init_size is not None and init_size < n_samples: - if init_size < n_clusters: - warnings.warn( - f"init_size={init_size} should be larger than " - f"n_clusters={n_clusters}. 
Setting it to 3*n_clusters", - RuntimeWarning, stacklevel=2) - init_size = 3 * n_clusters init_indices = random_state.randint(0, n_samples, init_size) X = X[init_indices] x_squared_norms = x_squared_norms[init_indices] @@ -906,11 +907,6 @@ def _init_centroids(self, X, x_squared_norms, init, random_state, centers = check_array( centers, dtype=X.dtype, copy=False, order='C') self._validate_center_shape(X, centers) - else: - raise ValueError( - f"the init parameter for {self.__class__.__name__} should be " - f"'k-means++', 'random', a ndarray or a callable. '{init}'" - f" (type '{type(self.init)}') was passed.") if sp.issparse(centers): centers = centers.toarray() @@ -1377,20 +1373,22 @@ class MiniBatchKMeans(KMeans): The number of clusters to form as well as the number of centroids to generate. - init : {'k-means++', 'random'} or ndarray of shape \ - (n_clusters, n_features), default='k-means++' - Method for initialization + init : {'k-means++', 'random', ndarray, callable}, default='k-means++' + Method for initialization: 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section Notes in k_init for more details. - 'random': choose k observations (rows) at random from data for - the initial centroids. + 'random': choose `n_clusters` observations (rows) at random from data + for the initial centroids. If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. + If a callable is passed, it should take arguments X, n_clusters and a + random state and return an initialization. + max_iter : int, default=100 Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. @@ -1454,7 +1452,7 @@ class MiniBatchKMeans(KMeans): cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers - labels_ : int + labels_ : ndarray of shape (n_samples) Labels of each point (if compute_labels is set to True). inertia_ : float @@ -1538,6 +1536,14 @@ def _check_params(self, X): self._init_size = self.init_size if self._init_size is None: self._init_size = 3 * self.batch_size + if self._init_size < self.n_clusters: + self._init_size = 3 * self.n_clusters + elif self._init_size < self.n_clusters: + warnings.warn( + f"init_size={self._init_size} should be larger than " + f"n_clusters={self.n_clusters}. Setting it to 3*n_clusters", + RuntimeWarning, stacklevel=2) + self._init_size = 3 * self.n_clusters self._init_size = min(self._init_size, X.shape[0]) if self.reassignment_ratio < 0: @@ -1550,7 +1556,7 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. @@ -1558,9 +1564,9 @@ def fit(self, X, y=None, sample_weight=None): y : Ignored Not used, present here for API consistency by convention. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. 
Returns ------- @@ -1609,8 +1615,8 @@ def fit(self, X, y=None, sample_weight=None): best_inertia = None for init_idx in range(self._n_init): if self.verbose: - print("Init %d/%d with method: %s" - % (init_idx + 1, self._n_init, self.init)) + print(f"Init {init_idx + 1}/{self._n_init} with method {init}") + weight_sums = np.zeros(self.n_clusters, dtype=sample_weight.dtype) # TODO: once the `k_means` function works with sparse input we @@ -1619,15 +1625,14 @@ def fit(self, X, y=None, sample_weight=None): # Initialize the centers using only a fraction of the data as we # expect n_samples to be very large when using MiniBatchKMeans cluster_centers = self._init_centroids( - X, x_squared_norms=x_squared_norms, init=self.init, + X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) # Compute the label assignment on the init dataset _mini_batch_step( - X_valid, sample_weight_valid, - x_squared_norms[validation_indices], cluster_centers, - weight_sums, old_center_buffer, False, distances=None, - verbose=self.verbose) + X_valid, sample_weight_valid, x_squared_norms_valid, + cluster_centers, weight_sums, old_center_buffer, False, + distances=None, verbose=self.verbose) # Keep only the best cluster centers across independent inits on # the common validation set @@ -1635,8 +1640,8 @@ def fit(self, X, y=None, sample_weight=None): x_squared_norms_valid, cluster_centers) if self.verbose: - print("Inertia for init %d/%d: %f" - % (init_idx + 1, self._n_init, inertia)) + print(f"Inertia for init {init_idx + 1}/{self._n_init}: " + f"{inertia}") if best_inertia is None or inertia < best_inertia: self.cluster_centers_ = cluster_centers self.counts_ = weight_sums @@ -1709,7 +1714,7 @@ def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, Returns ------- - labels : ndarray, shape (n_samples,) + labels : ndarray of shape (n_samples,) Cluster labels for each point. inertia : float @@ -1735,9 +1740,9 @@ def partial_fit(self, X, y=None, sample_weight=None): y : Ignored Not used, present here for API consistency by convention. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. Returns ------- @@ -1784,8 +1789,8 @@ def partial_fit(self, X, y=None, sample_weight=None): # of features. if X.shape[1] != self.cluster_centers_.shape[1]: raise ValueError( - "Number of features %d does not match previous " - "data %d." % (X.shape[1], self.cluster_centers_.shape[1])) + f"Number of features {X.shape[1]} does not match previous " + f"data {self.cluster_centers_.shape[1]}.") _mini_batch_step(X, sample_weight, x_squared_norms, self.cluster_centers_, self.counts_, @@ -1813,13 +1818,13 @@ def predict(self, X, sample_weight=None): X : {array-like, sparse matrix} of shape (n_samples, n_features) New data to predict. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. Returns ------- - labels : array, shape [n_samples,] + labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. 
""" check_is_fitted(self) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index bf23d669da654..fd48c7b73842c 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -8,23 +8,21 @@ import pytest from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_raise_message from sklearn.utils.validation import _num_samples from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning from sklearn.utils.extmath import row_norms +from sklearn.metrics import pairwise_distances from sklearn.metrics import pairwise_distances_argmin from sklearn.metrics.cluster import v_measure_score from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _labels_inertia from sklearn.cluster._kmeans import _mini_batch_step +from sklearn.cluster._kmeans import _check_normalize_sample_weight from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense from sklearn.cluster._k_means_fast import _relocate_empty_clusters_sparse from sklearn.cluster._k_means_fast import _euclidean_dense_dense_wrapper @@ -33,7 +31,6 @@ from sklearn.cluster._k_means_fast import _inertia_sparse from sklearn.datasets import make_blobs from io import StringIO -from sklearn.metrics.cluster import homogeneity_score # non centered, sparse centers to check the @@ -49,12 +46,27 @@ X_csr = sp.csr_matrix(X) -@pytest.mark.parametrize("representation", ["dense", "sparse"]) +def _check_fitted_model(km): + # check that the number of clusters centers and distinct labels match + # the expectation + centers = km.cluster_centers_ + assert centers.shape == (n_clusters, n_features) + + labels = km.labels_ + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + assert km.inertia_ > 0.0 + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_kmeans_results(representation, algo, dtype): - # cheks that kmeans works as intended - array_constr = {'dense': np.array, 'sparse': sp.csr_matrix}[representation] +def test_kmeans_results(array_constr, algo, dtype): + # Checks that KMeans works as intended on toy dataset by comparing with + # expected results computed by hand. 
X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) @@ -68,954 +80,445 @@ def test_kmeans_results(representation, algo, dtype): kmeans.fit(X, sample_weight=sample_weight) assert_array_equal(kmeans.labels_, expected_labels) - assert_almost_equal(kmeans.inertia_, expected_inertia) - assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - -@pytest.mark.parametrize("array_constr", - [np.array, sp.csr_matrix], - ids=['dense', 'sparse']) -@pytest.mark.parametrize("algo", ['full', 'elkan']) -def test_relocated_clusters(array_constr, algo): - # check that empty clusters are relocated as expected - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) - - # second center too far from others points will be empty at first iter - init_centers = np.array([[0.5, 0.5], [3, 3]]) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.25 - expected_centers = [[0.25, 0], [0.75, 1]] - expected_n_iter = 3 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_almost_equal(kmeans.inertia_, expected_inertia) - assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) assert kmeans.n_iter_ == expected_n_iter -@pytest.mark.parametrize("representation", ["dense", "sparse"]) -def test_relocate_empty_clusters(representation): - # test for the _relocate_empty_clusters_(dense/sparse) helpers - - # Synthetic dataset with 3 obvious clusters of different sizes - X = np.array( - [-10., -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) - if representation == "sparse": - X = sp.csr_matrix(X) - sample_weight = np.full(shape=10, fill_value=1.) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_k_means_1_iteration(array_constr, algo): + # check the results after a single iteration (E-step M-step E-step) by + # comparing against a pure python implementation. + X = np.random.RandomState(0).uniform(size=(100, 5)) + init_centers = X[:5] + X = array_constr(X) - # centers all initialized to the first point of X - centers_old = np.array([-10., -10, -10]).reshape(-1, 1) + def py_kmeans(X, init): + new_centers = init.copy() + labels = pairwise_distances_argmin(X, init) + for label in range(init.shape[0]): + new_centers[label] = X[labels == label].mean(axis=0) + labels = pairwise_distances_argmin(X, new_centers) + return labels, new_centers - # With this initialization, all points will be assigned to the first center - # At this point a center in centers_new is the weighted sum of the points - # it contains if it's not empty, otherwise it is the same as before. 
- centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) - weight_in_clusters = np.array([10., 0, 0]) - labels = np.zeros(10, dtype=np.int32) + py_labels, py_centers = py_kmeans(X, init_centers) - if representation == "dense": - _relocate_empty_clusters_dense(X, sample_weight, centers_old, - centers_new, weight_in_clusters, labels) - else: - _relocate_empty_clusters_sparse(X.data, X.indices, X.indptr, - sample_weight, centers_old, - centers_new, weight_in_clusters, - labels) + cy_kmeans = KMeans(n_clusters=5, n_init=1, init=init_centers, + algorithm=algo, max_iter=1).fit(X) + cy_labels = cy_kmeans.labels_ + cy_centers = cy_kmeans.cluster_centers_ - # The relocation scheme will take the 2 points farthest from the center and - # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The - # first center will be updated to contain the other 8 points. - assert_array_equal(weight_in_clusters, [8, 1, 1]) - assert_allclose(centers_new, [[-36], [10], [9.5]]) + assert_array_equal(py_labels, cy_labels) + assert_allclose(py_centers, cy_centers) -@pytest.mark.parametrize('distribution', ['normal', 'blobs']) -@pytest.mark.parametrize('tol', [1e-2, 1e-4, 1e-8]) -def test_elkan_results(distribution, tol): - # check that results are identical between lloyd and elkan algorithms +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) +def test_elkan_results(distribution, array_constr, tol): + # Check that results are identical between lloyd and elkan algorithms rnd = np.random.RandomState(0) - if distribution == 'normal': + if distribution == "normal": X = rnd.normal(size=(5000, 10)) else: X, _ = make_blobs(random_state=rnd) + X[X < 0] = 0 + X = array_constr(X) - km_full = KMeans(algorithm='full', n_clusters=5, + km_full = KMeans(algorithm="full", n_clusters=5, random_state=0, n_init=1, tol=tol) - km_elkan = KMeans(algorithm='elkan', n_clusters=5, + km_elkan = KMeans(algorithm="elkan", n_clusters=5, random_state=0, n_init=1, tol=tol) km_full.fit(X) km_elkan.fit(X) assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) assert_array_equal(km_elkan.labels_, km_full.labels_) - assert km_elkan.n_iter_ == km_full.n_iter_ assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) -@pytest.mark.parametrize('algorithm', ['full', 'elkan']) +@pytest.mark.parametrize("algorithm", ["full", "elkan"]) def test_kmeans_convergence(algorithm): # Check that KMeans stops when convergence is reached when tol=0. (#16075) + # We can only ensure that if the number of threads is not to large, + # otherwise the roundings errors coming from the unpredictability of + # the order in which chunks are processed make the convergence criterion + # to never be exactly 0. 
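The comment above about rounding errors comes down to floating point addition not being associative: when the reduction order depends on thread scheduling, the accumulated center shift can differ slightly from run to run and never reach exactly zero. A one-line reminder:

print((0.1 + 0.2) + 0.3 == 0.1 + (0.2 + 0.3))   # False: 0.6000000000000001 vs 0.6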
rnd = np.random.RandomState(0) X = rnd.normal(size=(5000, 10)) - km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, n_init=1, - tol=0, max_iter=300).fit(X) + with threadpool_limits(limits=1, user_api="openmp"): + km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, + n_init=1, tol=0, max_iter=300).fit(X) assert km.n_iter_ < 300 -@pytest.mark.parametrize('distribution', ['normal', 'blobs']) -def test_elkan_results_sparse(distribution): - # check that results are identical between lloyd and elkan algorithms - # with sparse input - rnd = np.random.RandomState(0) - if distribution == 'normal': - X = sp.random(100, 100, density=0.1, format='csr', random_state=rnd) - X.data = rnd.randn(len(X.data)) - else: - X, _ = make_blobs(n_samples=100, n_features=100, random_state=rnd) - X = sp.csr_matrix(X) - - km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1) - km_elkan = KMeans(algorithm='elkan', n_clusters=5, - random_state=0, n_init=1) - - km_full.fit(X) - km_elkan.fit(X) - assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) - assert_allclose(km_elkan.labels_, km_full.labels_) - - -def test_labels_assignment_and_inertia(): - # pure numpy implementation as easily auditable reference gold - # implementation - rng = np.random.RandomState(42) - noisy_centers = centers + rng.normal(size=centers.shape) - labels_gold = np.full(n_samples, -1, dtype=np.int) - mindist = np.empty(n_samples) - mindist.fill(np.infty) - for center_id in range(n_clusters): - dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1) - labels_gold[dist < mindist] = center_id - mindist = np.minimum(dist, mindist) - inertia_gold = mindist.sum() - assert (mindist >= 0.0).all() - assert (labels_gold != -1).all() - - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - - # perform label assignment using the dense array input - x_squared_norms = (X ** 2).sum(axis=1) - labels_array, inertia_array = _labels_inertia( - X, sample_weight, x_squared_norms, noisy_centers) - assert_array_almost_equal(inertia_array, inertia_gold) - assert_array_equal(labels_array, labels_gold) - - # perform label assignment using the sparse CSR input - x_squared_norms_from_csr = row_norms(X_csr, squared=True) - labels_csr, inertia_csr = _labels_inertia( - X_csr, sample_weight, x_squared_norms_from_csr, noisy_centers) - assert_array_almost_equal(inertia_csr, inertia_gold) - assert_array_equal(labels_csr, labels_gold) - - -def test_minibatch_update_consistency(): - # Check that dense and sparse minibatch update give the same results - rng = np.random.RandomState(42) - old_centers = centers + rng.normal(size=centers.shape) - - new_centers = old_centers.copy() - new_centers_csr = old_centers.copy() - - weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) - weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double) - - x_squared_norms = (X ** 2).sum(axis=1) - x_squared_norms_csr = row_norms(X_csr, squared=True) - - buffer = np.zeros(centers.shape[1], dtype=np.double) - buffer_csr = np.zeros(centers.shape[1], dtype=np.double) - - # extract a small minibatch - X_mb = X[:10] - X_mb_csr = X_csr[:10] - x_mb_squared_norms = x_squared_norms[:10] - x_mb_squared_norms_csr = x_squared_norms_csr[:10] - - sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) - - # step 1: compute the dense minibatch update - old_inertia, incremental_diff = _mini_batch_step( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, - buffer, 1, None, random_reassign=False) - assert old_inertia > 0.0 - - 
# compute the new inertia on the same batch to check that it decreased - labels, new_inertia = _labels_inertia( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) - assert new_inertia > 0.0 - assert new_inertia < old_inertia - - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers - old_centers) ** 2) - assert_almost_equal(incremental_diff, effective_diff) - - # step 2: compute the sparse minibatch update - old_inertia_csr, incremental_diff_csr = _mini_batch_step( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, - weight_sums_csr, buffer_csr, 1, None, random_reassign=False) - assert old_inertia_csr > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels_csr, new_inertia_csr = _labels_inertia( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) - assert new_inertia_csr > 0.0 - assert new_inertia_csr < old_inertia_csr - - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers_csr - old_centers) ** 2) - assert_almost_equal(incremental_diff_csr, effective_diff) - - # step 3: check that sparse and dense updates lead to the same results - assert_array_equal(labels, labels_csr) - assert_array_almost_equal(new_centers, new_centers_csr) - assert_almost_equal(incremental_diff, incremental_diff_csr) - assert_almost_equal(old_inertia, old_inertia_csr) - assert_almost_equal(new_inertia, new_inertia_csr) - - -def _check_fitted_model(km): - # check that the number of clusters centers and distinct labels match - # the expectation - centers = km.cluster_centers_ - assert centers.shape == (n_clusters, n_features) - - labels = km.labels_ - assert np.unique(labels).shape[0] == n_clusters - - # check that the labels assignment are perfect (up to a permutation) - assert v_measure_score(true_labels, labels) == 1.0 - assert km.inertia_ > 0.0 - - # check error on dataset being too small - assert_raise_message(ValueError, "n_samples=1 should be >= n_clusters=%d" - % km.n_clusters, km.fit, [[0., 1.]]) - - -def test_k_means_new_centers(): - # Explore the part of the code where a new center is reassigned - X = np.array([[0, 0, 1, 1], - [0, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 0], - [0, 1, 0, 0]]) - labels = [0, 1, 2, 1, 1, 2] - bad_centers = np.array([[+0, 1, 0, 0], - [.2, 0, .2, .2], - [+0, 0, 0, 0]]) - - km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10, - random_state=1) - for this_X in (X, sp.coo_matrix(X)): - km.fit(this_X) - this_labels = km.labels_ - # Reorder the labels so that the first instance is in cluster 0, - # the second in cluster 1, ... - this_labels = np.unique(this_labels, return_index=True)[1][this_labels] - np.testing.assert_array_equal(this_labels, labels) - - -@pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse']) -@pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()]) -def test_k_means_init(data, init): - km = KMeans(init=init, n_clusters=n_clusters, random_state=42, n_init=1) - km.fit(data) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +@pytest.mark.parametrize("init", ["random", "k-means++", centers, + lambda X, k, random_state: centers], + ids=["random", "k-means++", "ndarray", "callable"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_all_init(estimator, data, init): + # Check KMeans and MiniBatchKMeans with all possible init. 
+ km = estimator(init=init, n_clusters=n_clusters, random_state=42, + n_init=10).fit(data) _check_fitted_model(km) -def test_k_means_n_init(): - rnd = np.random.RandomState(0) - X = rnd.normal(size=(40, 2)) - - # two regression tests on bad n_init argument - # previous bug: n_init <= 0 threw non-informative TypeError (#3858) - with pytest.raises(ValueError, match="n_init"): - KMeans(n_init=0).fit(X) - with pytest.raises(ValueError, match="n_init"): - KMeans(n_init=-1).fit(X) - - -@pytest.mark.parametrize('Class', [KMeans, MiniBatchKMeans]) -def test_k_means_explicit_init_shape(Class): - # test for sensible errors when giving explicit init - # with wrong number of features or clusters +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_result_of_kmeans_equal_in_diff_n_threads(estimator): + # Check that KMeans gives the same results in parallel mode than in + # sequential mode. rnd = np.random.RandomState(0) - X = rnd.normal(size=(40, 3)) + X = rnd.normal(size=(50, 10)) - # mismatch of number of features - km = Class(n_init=1, init=X[:, :2], n_clusters=len(X)) - msg = "does not match the number of features of the data" - with pytest.raises(ValueError, match=msg): - km.fit(X) - # for callable init - km = Class(n_init=1, - init=lambda X_, k, random_state: X_[:, :2], - n_clusters=len(X)) - with pytest.raises(ValueError, match=msg): - km.fit(X) - # mismatch of number of clusters - msg = "does not match the number of clusters" - km = Class(n_init=1, init=X[:2, :], n_clusters=3) - with pytest.raises(ValueError, match=msg): - km.fit(X) - # for callable init - km = Class(n_init=1, - init=lambda X_, k, random_state: X_[:2, :], - n_clusters=3) - with pytest.raises(ValueError, match=msg): - km.fit(X) + with threadpool_limits(limits=1, user_api="openmp"): + result_1 = estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ + with threadpool_limits(limits=2, user_api="openmp"): + result_2 = estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ + assert_array_equal(result_1, result_2) -def test_k_means_fortran_aligned_data(): - # Check the KMeans will work well, even if X is a fortran-aligned data. - X = np.asfortranarray([[0, 0], [0, 1], [0, 1]]) - centers = np.array([[0, 0], [0, 1]]) - labels = np.array([0, 1, 1]) - km = KMeans(n_init=1, init=centers, random_state=42, n_clusters=2) - km.fit(X) - assert_array_almost_equal(km.cluster_centers_, centers) - assert_array_equal(km.labels_, labels) - - -@pytest.mark.parametrize('algo', ['full', 'elkan']) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) -@pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix]) -@pytest.mark.parametrize('seed, max_iter, tol', [ - (0, 2, 1e-7), # strict non-convergence - (1, 2, 1e-1), # loose non-convergence - (3, 300, 1e-7), # strict convergence - (4, 300, 1e-1), # loose convergence -]) -def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol): - # check that fit.predict gives same result as fit_predict - # There's a very small chance of failure with elkan on unstructured dataset - # because predict method uses fast euclidean distances computation which - # may cause small numerical instabilities. - # NB: This test is largely redundant with respect to test_predict and - # test_predict_equal_labels. This test has the added effect of - # testing idempotence of the fittng procesdure which appears to - # be where it fails on some MacOS setups. 
- if sys.platform == "darwin": - pytest.xfail( - "Known failures on MacOS, See " - "https://github.com/scikit-learn/scikit-learn/issues/12644") +def test_check_normalize_sample_weight(): + # Check the check sample weight helper. sample weights should sum to + # n_samples + sample_weight = None + checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) + assert _num_samples(X) == _num_samples(checked_sample_weight) + assert_almost_equal(checked_sample_weight.sum(), _num_samples(X)) + assert X.dtype == checked_sample_weight.dtype - rng = np.random.RandomState(seed) - X = make_blobs(n_samples=1000, n_features=10, centers=10, - random_state=rng)[0].astype(dtype, copy=False) - X = constructor(X) +def _sort_centers(centers): + return np.sort(centers, axis=0) - kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed, - tol=tol, max_iter=max_iter) - labels_1 = kmeans.fit(X).predict(X) - labels_2 = kmeans.fit_predict(X) +@pytest.mark.parametrize("init", ["k-means++", centers], + ids=["k-means++", "ndarray"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_weighted_vs_repeated(estimator, init): + # Check that a sample weight of N should yield the same result as an N-fold + # repetition of the sample + sample_weight = np.random.RandomState(0).randint(1, 5, size=n_samples) + X_repeat = np.repeat(X, sample_weight, axis=0) - # Due to randomness in the order in which chunks of data are processed when - # using more than one thread, the absolute values of the labels can be - # different between the 2 strategies but they should correspond to the same - # clustering. - assert v_measure_score(labels_1, labels_2) == 1 + km = estimator(init=init, n_clusters=n_clusters, random_state=0) + if estimator is MiniBatchKMeans: + km.set_params(batch_size=10) + km_weighted = clone(km).fit(X, sample_weight=sample_weight) + repeated_labels = np.repeat(km_weighted.labels_, sample_weight) + km_repeated = clone(km).fit(X_repeat) -def test_mb_kmeans_verbose(): - mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters, - random_state=42, verbose=1) - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - mb_k_means.fit(X) - finally: - sys.stdout = old_stdout + # We can't expect labels to be equal because k-means++ will lead to + # a different initialization on duplicated X. 
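+    # v_measure_score is invariant to label permutations, so it can compare
+    # the two clusterings even when the raw label values differ.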
+ assert_allclose(v_measure_score(km_repeated.labels_, repeated_labels), 1) + # TODO: FIXME + if estimator is not MiniBatchKMeans: + assert_allclose(_sort_centers(km_weighted.cluster_centers_), + _sort_centers(km_repeated.cluster_centers_)) -def test_minibatch_init_with_large_k(): - mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20) - # Check that a warning is raised, as the number clusters is larger - # than the init_size - assert_warns(RuntimeWarning, mb_k_means.fit, X) - - -def test_minibatch_k_means_init_multiple_runs_with_explicit_centers(): - mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, - random_state=42, n_init=10) - assert_warns(RuntimeWarning, mb_k_means.fit, X) - - -@pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse']) -@pytest.mark.parametrize('init', ["random", 'k-means++', centers.copy()]) -def test_minibatch_k_means_init(data, init): - mb_k_means = MiniBatchKMeans(init=init, n_clusters=n_clusters, - random_state=42, n_init=10) - mb_k_means.fit(data) - _check_fitted_model(mb_k_means) - - -def test_minibatch_sensible_reassign_fit(): - # check if identical initial clusters are reassigned - # also a regression test for when there are more desired reassignments than - # samples. - zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, - cluster_std=1., random_state=42) - zeroed_X[::2, :] = 0 - mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, - init="random") - mb_k_means.fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - # do the same with batch-size > X.shape[0] (regression test) - mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201, - random_state=42, init="random") - mb_k_means.fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - -def test_minibatch_sensible_reassign_partial_fit(): - zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5, - cluster_std=1., random_state=42) - zeroed_X[::2, :] = 0 - mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") - for i in range(100): - mb_k_means.partial_fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - -def test_minibatch_reassign(): - # Give a perfect initialization, but a large reassignment_ratio, - # as a result all the centers should be reassigned and the model - # should no longer be good - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - random_state=42) - mb_k_means.fit(this_X) - - score_before = mb_k_means.score(this_X) - try: - old_stdout = sys.stdout - sys.stdout = StringIO() - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means.counts_, - np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, - reassignment_ratio=1, verbose=True) - finally: - sys.stdout = old_stdout - assert score_before > mb_k_means.score(this_X) - - # Give a perfect initialization, with a small reassignment_ratio, - # no center should be reassigned - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - init=centers.copy(), - random_state=42, n_init=1) - 
mb_k_means.fit(this_X) - clusters_before = mb_k_means.cluster_centers_ - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means.counts_, - np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, - reassignment_ratio=1e-15) - assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) - - -def test_minibatch_with_many_reassignments(): - # Test for the case that the number of clusters to reassign is bigger - # than the batch_size - n_samples = 550 - rnd = np.random.RandomState(42) - X = rnd.uniform(size=(n_samples, 10)) - # Check that the fit works if n_clusters is bigger than the batch_size. - # Run the test with 550 clusters and 550 samples, because it turned out - # that this values ensure that the number of clusters to reassign - # is always bigger than the batch_size - n_clusters = 550 - MiniBatchKMeans(n_clusters=n_clusters, - batch_size=100, - init_size=n_samples, - random_state=42).fit(X) - - -def test_sparse_mb_k_means_callable_init(): - - def test_init(X, k, random_state): - return centers - - # Small test to check that giving the wrong number of centers - # raises a meaningful error - msg = "does not match the number of clusters" - with pytest.raises(ValueError, match=msg): - MiniBatchKMeans(init=test_init, random_state=42).fit(X_csr) - - # Now check that the fit actually works - mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init, - random_state=42).fit(X_csr) - _check_fitted_model(mb_k_means) - - -def test_mini_batch_k_means_random_init_partial_fit(): - km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42) - # use the partial_fit API for online learning - for X_minibatch in np.array_split(X, 10): - km.partial_fit(X_minibatch) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_unit_weights_vs_no_weights(estimator): + # Check that not passing sample weights should be equivalent to passing + # sample weights all equal to one. 
+ sample_weight = np.ones(n_samples) - # compute the labeling on the complete dataset - labels = km.predict(X) - assert v_measure_score(true_labels, labels) == 1.0 + km = estimator(n_clusters=n_clusters, random_state=42) + km_none = clone(km).fit(X, sample_weight=None) + km_ones = clone(km).fit(X, sample_weight=sample_weight) + assert_array_equal(km_none.labels_, km_ones.labels_) + assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) -def test_minibatch_default_init_size(): - mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, - batch_size=10, random_state=42, - n_init=1).fit(X) - assert mb_k_means._init_size == 3 * mb_k_means.batch_size - _check_fitted_model(mb_k_means) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(estimator): + # Check that scaling all sample weights by a common factor + # shouldn't change the result + sample_weight = np.random.uniform(n_samples) -def test_minibatch_tol(): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10, - random_state=42, tol=.01).fit(X) - _check_fitted_model(mb_k_means) + km = estimator(n_clusters=n_clusters, random_state=42) + km_orig = clone(km).fit(X, sample_weight=sample_weight) + km_scaled = clone(km).fit(X, sample_weight=0.5 * sample_weight) + assert_array_equal(km_orig.labels_, km_scaled.labels_) + assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) -def test_minibatch_set_init_size(): - mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, - init_size=666, random_state=42, - n_init=1).fit(X) - assert mb_k_means.init_size == 666 - assert mb_k_means._init_size == n_samples - _check_fitted_model(mb_k_means) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_fortran_aligned_data(estimator): + # Check that KMeans works with fortran-aligned data. + X_fortran = np.asfortranarray(X) + centers_fortran = np.asfortranarray(centers) -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_k_means_invalid_init(Estimator): - km = Estimator(init="invalid", n_init=1, n_clusters=n_clusters) - with pytest.raises(ValueError): - km.fit(X) + km_c = estimator(n_clusters=n_clusters, init=centers, n_init=1, + random_state=42).fit(X) + km_f = estimator(n_clusters=n_clusters, init=centers_fortran, n_init=1, + random_state=42).fit(X_fortran) + assert_allclose(km_c.cluster_centers_, km_f.cluster_centers_) + assert_array_equal(km_c.labels_, km_f.labels_) def test_k_means_copyx(): - # Check if copy_x=False returns nearly equal X after de-centering. + # Check that copy_x=False returns nearly equal X after de-centering. my_X = X.copy() km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) km.fit(my_X) _check_fitted_model(km) - # check if my_X is centered - assert_array_almost_equal(my_X, X) + # check that my_X is de-centered + assert_allclose(my_X, X) -def test_k_means_non_collapsed(): - # Check k_means with a bad initialization does not yield a singleton - # Starting with bad centers that are quickly ignored should not - # result in a repositioning of the centers to the center of mass that - # would lead to collapsed centers which in turns make the clustering - # dependent of the numerical unstabilities. 
-    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
-    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
-    km = KMeans(init=array_init, n_clusters=3, random_state=42, n_init=1)
-    km.fit(my_X)
-
-    # centers must not been collapsed
-    assert len(np.unique(km.labels_)) == 3
-
-    centers = km.cluster_centers_
-    assert np.linalg.norm(centers[0] - centers[1]) >= 0.1
-    assert np.linalg.norm(centers[0] - centers[2]) >= 0.1
-    assert np.linalg.norm(centers[1] - centers[2]) >= 0.1
-
-
-@pytest.mark.parametrize('algo', ['full', 'elkan'])
-def test_score(algo):
-    # Check that fitting k-means with multiple inits gives better score
-    km1 = KMeans(n_clusters=n_clusters, max_iter=1, random_state=42, n_init=1,
-                 algorithm=algo)
-    s1 = km1.fit(X).score(X)
-    km2 = KMeans(n_clusters=n_clusters, max_iter=10, random_state=42, n_init=1,
-                 algorithm=algo)
-    s2 = km2.fit(X).score(X)
-    assert s2 > s1
+@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_centers_not_mutated(estimator, dtype):
+    # Check that KMeans and MiniBatchKMeans won't mutate the user provided
+    # init centers silently even if input data and init centers have the same
+    # type.
+    X_new_type = X.astype(dtype, copy=True)
+    centers_new_type = centers.astype(dtype, copy=True)
+
+    km = estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1)
+    km.fit(X_new_type)
 
-@pytest.mark.parametrize('Estimator', [KMeans, MiniBatchKMeans])
-@pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse'])
-@pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()])
-def test_predict(Estimator, data, init):
-    k_means = Estimator(n_clusters=n_clusters, init=init,
-                        n_init=10, random_state=0).fit(data)
+    assert not np.may_share_memory(km.cluster_centers_, centers)
 
-    # sanity check: re-predict labeling for training set samples
-    assert_array_equal(k_means.predict(data), k_means.labels_)
-    # sanity check: predict centroid labels
-    pred = k_means.predict(k_means.cluster_centers_)
-    assert_array_equal(pred, np.arange(n_clusters))
-
-    # re-predict labels for training set using fit_predict
-    pred = k_means.fit_predict(data)
-    assert_array_equal(pred, k_means.labels_)
 
+@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"])
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_float_precision(estimator, data):
+    km = estimator(n_init=1, random_state=0)
+    inertia = {}
+    Xt = {}
+    centers = {}
+    labels = {}
 
-@pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()])
-def test_predict_minibatch_dense_sparse(init):
-    # check that models trained on sparse input also works for dense input at
-    # predict time
-    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init=init,
-                                 n_init=10, random_state=0).fit(X_csr)
-
-    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)
-
-
-def test_int_input():
-    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
-    for dtype in [np.int32, np.int64]:
-        X_int = np.array(X_list, dtype=dtype)
-        X_int_csr = sp.csr_matrix(X_int)
-        init_int = X_int[:2]
-
-        fitted_models = [
-            KMeans(n_clusters=2).fit(X_int),
-            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
-            # mini batch kmeans is very unstable on such a small dataset hence
-            # we use many inits
-            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int),
-            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(
-                X_int_csr),
-            MiniBatchKMeans(n_clusters=2,
batch_size=2, - init=init_int, n_init=1).fit(X_int), - MiniBatchKMeans(n_clusters=2, batch_size=2, - init=init_int, n_init=1).fit(X_int_csr), - ] - - for km in fitted_models: - assert km.cluster_centers_.dtype == np.float64 - - expected_labels = [0, 1, 1, 0, 0, 1] - scores = np.array([v_measure_score(expected_labels, km.labels_) - for km in fitted_models]) - assert_array_almost_equal(scores, np.ones(scores.shape[0])) - - -def test_transform(): - km = KMeans(n_clusters=n_clusters) - km.fit(X) - X_new = km.transform(km.cluster_centers_) - - for c in range(n_clusters): - assert X_new[c, c] == 0 - for c2 in range(n_clusters): - if c != c2: - assert X_new[c, c2] > 0 - - -def test_fit_transform(): - X1 = KMeans(n_clusters=3, random_state=51).fit(X).transform(X) - X2 = KMeans(n_clusters=3, random_state=51).fit_transform(X) - assert_array_almost_equal(X1, X2) - - -@pytest.mark.parametrize('algo', ['full', 'elkan']) -def test_predict_equal_labels(algo): - km = KMeans(random_state=13, n_init=1, max_iter=1, - algorithm=algo) - km.fit(X) - assert_array_equal(km.predict(X), km.labels_) - + for dtype in [np.float64, np.float32]: + X = data.astype(dtype) + km.fit(X) -def test_full_vs_elkan(): - km1 = KMeans(algorithm='full', random_state=13).fit(X) - km2 = KMeans(algorithm='elkan', random_state=13).fit(X) + inertia[dtype] = km.inertia_ + Xt[dtype] = km.transform(X) + centers[dtype] = km.cluster_centers_ + labels[dtype] = km.labels_ - assert homogeneity_score(km1.predict(X), km2.predict(X)) == 1.0 + # dtype of cluster centers has to be the dtype of the input data + assert km.cluster_centers_.dtype == dtype + # same with partial_fit + if estimator is MiniBatchKMeans: + km.partial_fit(X[0:3]) + assert km.cluster_centers_.dtype == dtype -def test_n_init(): - # Check that increasing the number of init increases the quality - n_runs = 5 - n_init_range = [1, 5, 10] - inertia = np.zeros((len(n_init_range), n_runs)) - for i, n_init in enumerate(n_init_range): - for j in range(n_runs): - km = KMeans(n_clusters=n_clusters, init="random", n_init=n_init, - random_state=j).fit(X) - inertia[i, j] = km.inertia_ + # compare arrays with low precision since the difference between + # 32 and 64 bit sometimes makes a difference up to the 4th decimal + # place + assert_allclose(inertia[np.float32], inertia[np.float64], rtol=1e-5) + assert_allclose(Xt[np.float32], Xt[np.float64], rtol=1e-5) + assert_allclose(centers[np.float32], centers[np.float64], rtol=1e-5) + assert_array_equal(labels[np.float32], labels[np.float64]) - inertia = inertia.mean(axis=1) - failure_msg = ("Inertia %r should be decreasing" - " when n_init is increasing.") % list(inertia) - for i in range(len(n_init_range) - 1): - assert inertia[i] >= inertia[i + 1], failure_msg +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_score_multiple_inits(estimator): + # Check that fitting KMeans or MiniBatchKMeans with multiple inits gives + # better score + X = np.random.RandomState(0).randn(100, 10) -def test_k_means_function(): - # test calling the k_means function directly - # catch output - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, - sample_weight=None, - verbose=True) - finally: - sys.stdout = old_stdout - centers = cluster_centers - assert centers.shape == (n_clusters, n_features) + km1 = estimator(max_iter=10, random_state=42, n_init=1) + s1 = km1.fit(X).score(X) + km2 = estimator(max_iter=10, random_state=42, n_init=10) + s2 = km2.fit(X).score(X) + 
assert s2 > s1 - labels = labels - assert np.unique(labels).shape[0] == n_clusters - # check that the labels assignment are perfect (up to a permutation) - assert v_measure_score(true_labels, labels) == 1.0 - assert inertia > 0.0 +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_score_max_iter(estimator): + # Check that fitting KMeans or MiniBatchKMeans with more iterations gives + # better score + X = np.random.RandomState(0).randn(100, 10) - # check warning when centers are passed - assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters, - sample_weight=None, init=centers) + km1 = estimator(n_init=1, random_state=42, max_iter=1) + s1 = km1.fit(X).score(X) + km2 = estimator(n_init=1, random_state=42, max_iter=10) + s2 = km2.fit(X).score(X) + assert s2 > s1 - # to many clusters desired - with pytest.raises(ValueError): - k_means(X, n_clusters=X.shape[0] + 1, sample_weight=None) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("init", ["k-means++", "ndarray"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_integer_input(estimator, array_constr, dtype, init): + # Check that KMeans and MiniBatchKMeans work with integer input. + X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]) + X = array_constr(X_dense, dtype=dtype) -def test_x_squared_norms_init_centroids(): - # Test that x_squared_norms can be None in _init_centroids - from sklearn.cluster._kmeans import _init_centroids + n_init = 1 if init == "ndarray" else 10 + init = X_dense[:2] if init == "ndarray" else init - X_norms = np.sum(X**2, axis=1) - precompute = _init_centroids( - X, 3, "k-means++", random_state=0, x_squared_norms=X_norms) - assert_array_almost_equal( - precompute, - _init_centroids(X, 3, "k-means++", random_state=0)) + km = estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) + if estimator is MiniBatchKMeans: + km.set_params(batch_size=2) + km.fit(X) -def test_max_iter_error(): - km = KMeans(max_iter=-1) - assert_raise_message(ValueError, 'Number of iterations should be', - km.fit, X) + # Internally integer input should be converted to float64 + assert km.cluster_centers_.dtype == np.float64 + expected_labels = [0, 1, 1, 0, 0, 1] + assert_allclose(v_measure_score(km.labels_, expected_labels), 1) -@pytest.mark.parametrize('Estimator', [KMeans, MiniBatchKMeans]) -@pytest.mark.parametrize('is_sparse', [False, True]) -def test_float_precision(Estimator, is_sparse): + # Same with partial_fit (#14314) + if estimator is MiniBatchKMeans: + km = clone(km).partial_fit(X) + assert km.cluster_centers_.dtype == np.float64 - estimator = Estimator(n_init=1, random_state=30) - inertia = {} - X_new = {} - centers = {} +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("init", ["random", "k-means++", "ndarray"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_predict(estimator, init, dtype, array_constr): + # Check the predict method and the equivalence between fit.predict and + # fit_predict. 
+ if sys.platform == "darwin": + pytest.xfail( + "Known failures on MacOS, See " + "https://github.com/scikit-learn/scikit-learn/issues/12644") - for dtype in [np.float64, np.float32]: - if is_sparse: - X_test = sp.csr_matrix(X_csr, dtype=dtype) - else: - X_test = X.astype(dtype) - estimator.fit(X_test) - # dtype of cluster centers has to be the dtype of the input - # data - assert estimator.cluster_centers_.dtype == dtype - inertia[dtype] = estimator.inertia_ - X_new[dtype] = estimator.transform(X_test) - centers[dtype] = estimator.cluster_centers_ - # ensure the extracted row is a 2d array - assert estimator.predict(X_test[:1]) == estimator.labels_[0] - if hasattr(estimator, 'partial_fit'): - estimator.partial_fit(X_test[0:3]) - # dtype of cluster centers has to stay the same after - # partial_fit - assert estimator.cluster_centers_.dtype == dtype + X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0) - # compare arrays with low precision since the difference between - # 32 and 64 bit sometimes makes a difference up to the 4th decimal - # place - assert_array_almost_equal(inertia[np.float32], inertia[np.float64], - decimal=4) - assert_array_almost_equal(X_new[np.float32], X_new[np.float64], - decimal=4) - assert_array_almost_equal(centers[np.float32], centers[np.float64], - decimal=4) - - -def test_k_means_init_centers(): - # This test is used to check KMeans won't mutate the user provided input - # array silently even if input data and init centers have the same type - X_small = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]]) - init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]]) - for dtype in [np.int32, np.int64, np.float32, np.float64]: - X_test = dtype(X_small) - init_centers_test = dtype(init_centers) - assert_array_equal(init_centers, init_centers_test) - km = KMeans(init=init_centers_test, n_clusters=3, n_init=1) - km.fit(X_test) - assert np.may_share_memory(km.cluster_centers_, - init_centers) is False + n_init = 1 if init == "ndarray" else 10 + init = X[:10] if init == "ndarray" else init + X = array_constr(X) + km = estimator(n_clusters=10, init=init, n_init=n_init, + random_state=0).fit(X) + labels = km.labels_ -@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -def test_k_means_init_fitted_centers(data): - # Get a local optimum - centers = KMeans(n_clusters=3).fit(X).cluster_centers_ + # Due to randomness in the order in which chunks of data are processed when + # using more than one thread, there might be different rounding errors for + # the computation of the inertia for each init between 2 runs. 
This might
+    # result in a different ranking of the inits, hence a different labeling,
+    # which should still correspond to the same clustering
 
-    # Fit starting from a local optimum shouldn't change the solution
-    new_centers = KMeans(n_clusters=3, init=centers,
-                         n_init=1).fit(X).cluster_centers_
-    assert_array_almost_equal(centers, new_centers)
+    # re-predict labels for training set using predict
+    pred = km.predict(X)
+    assert_allclose(v_measure_score(pred, labels), 1)
 
+    # re-predict labels for training set using fit_predict
+    pred = km.fit_predict(X)
+    assert_allclose(v_measure_score(pred, labels), 1)
 
-def test_sparse_validate_centers():
-    from sklearn.datasets import load_iris
+    # predict centroid labels
+    pred = km.predict(km.cluster_centers_)
+    assert_allclose(v_measure_score(pred, np.arange(10)), 1)
 
-    iris = load_iris()
-    X = iris.data
-    # Get a local optimum
-    centers = KMeans(n_clusters=4).fit(X).cluster_centers_
 
+@pytest.mark.parametrize("init", ["random", "k-means++", centers],
+                         ids=["random", "k-means++", "ndarray"])
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_predict_dense_sparse(estimator, init):
+    # check that models trained on sparse input also work for dense input at
+    # predict time and vice versa.
+    km = estimator(n_clusters=n_clusters, init=init, n_init=10, random_state=0)
 
-    # Test that a ValueError is raised for validate_center_shape
-    classifier = KMeans(n_clusters=3, init=centers, n_init=1)
+    km.fit(X_csr)
+    assert_array_equal(km.predict(X), km.labels_)
 
-    msg = r"The shape of the initial centers \(\(4L?, 4L?\)\) " \
-          "does not match the number of clusters 3"
-    with pytest.raises(ValueError, match=msg):
-        classifier.fit(X)
+    km.fit(X)
+    assert_array_equal(km.predict(X_csr), km.labels_)
 
 
-def test_less_centers_than_unique_points():
-    X = np.asarray([[0, 0],
-                    [0, 1],
-                    [1, 0],
-                    [1, 0]])  # last point is duplicated
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_transform(estimator):
+    # Check the transform method
+    km = estimator(n_clusters=n_clusters).fit(X)
 
-    km = KMeans(n_clusters=4).fit(X)
+    # Transforming cluster_centers_ should return the pairwise distances
+    # between centers
+    Xt = km.transform(km.cluster_centers_)
+    assert_allclose(Xt, pairwise_distances(km.cluster_centers_))
+    # In particular, diagonal must be 0
+    assert_array_equal(Xt.diagonal(), np.zeros(n_clusters))
 
-    # only three distinct points, so only three clusters
-    # can have points assigned to them
-    assert set(km.labels_) == set(range(3))
+    # Transforming X should return the pairwise distances between X and the
+    # centers
+    Xt = km.transform(X)
+    assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_))
 
-    # k_means should warn that fewer labels than cluster
-    # centers have been used
-    msg = ("Number of distinct clusters (3) found smaller than "
-           "n_clusters (4). 
Possibly due to duplicate points in X.") - assert_warns_message(ConvergenceWarning, msg, k_means, X, - sample_weight=None, n_clusters=4) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_fit_transform(estimator): + # Check equivalence between fit.transform and fit_transform + X1 = estimator(n_clusters=n_clusters, random_state=0).fit(X).transform(X) + X2 = estimator(n_clusters=n_clusters, random_state=0).fit_transform(X) + assert_allclose(X1, X2) -def _sort_centers(centers): - return np.sort(centers, axis=0) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_k_means_init_fitted_centers(data): + # Check that starting fitting from a local optimum shouldn't change the + # solution + km1 = KMeans(n_clusters=n_clusters).fit(data) + km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, + n_init=1).fit(data) -def test_weighted_vs_repeated(): - # a sample weight of N should yield the same result as an N-fold - # repetition of the sample - rng = np.random.RandomState(0) - sample_weight = rng.randint(1, 5, size=n_samples) - X_repeat = np.repeat(X, sample_weight, axis=0) - estimators = [KMeans(init="k-means++", n_clusters=n_clusters, - random_state=42), - KMeans(init="random", n_clusters=n_clusters, - random_state=42), - KMeans(init=centers.copy(), n_clusters=n_clusters, - random_state=42), - MiniBatchKMeans(n_clusters=n_clusters, batch_size=10, - random_state=42)] - for estimator in estimators: - est_weighted = clone(estimator).fit(X, sample_weight=sample_weight) - est_repeated = clone(estimator).fit(X_repeat) - repeated_labels = np.repeat(est_weighted.labels_, sample_weight) - assert_almost_equal(v_measure_score(est_repeated.labels_, - repeated_labels), 1.0) - if not isinstance(estimator, MiniBatchKMeans): - assert_almost_equal(_sort_centers(est_weighted.cluster_centers_), - _sort_centers(est_repeated.cluster_centers_)) - - -def test_unit_weights_vs_no_weights(): - # not passing any sample weights should be equivalent - # to all weights equal to one - sample_weight = np.ones(n_samples) - for estimator in [KMeans(n_clusters=n_clusters, random_state=42), - MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]: - est_1 = clone(estimator).fit(X) - est_2 = clone(estimator).fit(X, sample_weight=sample_weight) - assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0) - assert_almost_equal(_sort_centers(est_1.cluster_centers_), - _sort_centers(est_2.cluster_centers_)) + assert_allclose(km1.cluster_centers_, km2.cluster_centers_) -def test_scaled_weights(): - # scaling all sample weights by a common factor - # shouldn't change the result - sample_weight = np.ones(n_samples) - for estimator in [KMeans(n_clusters=n_clusters, random_state=42), - MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]: - est_1 = clone(estimator).fit(X) - est_2 = clone(estimator).fit(X, sample_weight=0.5*sample_weight) - assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0) - assert_almost_equal(_sort_centers(est_1.cluster_centers_), - _sort_centers(est_2.cluster_centers_)) +def test_kmeans_elkan_iter_attribute(): + # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off + # it's right value (#11340). 
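+    # With max_iter=1 the estimator must therefore report exactly one
+    # iteration.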
+ km = KMeans(algorithm="elkan", max_iter=1).fit(X) + assert km.n_iter_ == 1 -def test_sample_weight_length(): - # check that an error is raised when passing sample weights - # with an incompatible shape - km = KMeans(n_clusters=n_clusters, random_state=42) - msg = r'sample_weight.shape == \(2,\), expected \(100,\)' - with pytest.raises(ValueError, match=msg): - km.fit(X, sample_weight=np.ones(2)) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_kmeans_relocated_clusters(array_constr, algo): + # check that empty clusters are relocated as expected + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) -def test_check_normalize_sample_weight(): - from sklearn.cluster._kmeans import _check_normalize_sample_weight - sample_weight = None - checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) - assert _num_samples(X) == _num_samples(checked_sample_weight) - assert_almost_equal(checked_sample_weight.sum(), _num_samples(X)) - assert X.dtype == checked_sample_weight.dtype + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.25 + expected_centers = [[0.25, 0], [0.75, 1]] + expected_n_iter = 3 + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) -def test_iter_attribute(): - # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off - # it's right value (#11340). - estimator = KMeans(algorithm="elkan", max_iter=1) - estimator.fit(np.random.rand(10, 10)) - assert estimator.n_iter_ == 1 + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter -def test_k_means_empty_cluster_relocated(): +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +def test_k_means_empty_cluster_relocated(array_constr): # check that empty clusters are correctly relocated when using sample # weights (#13486) - X = np.array([[-1], [1]]) + X = array_constr([[-1], [1]]) sample_weight = [1.9, 0.1] init = np.array([[-1], [10]]) @@ -1026,109 +529,48 @@ def test_k_means_empty_cluster_relocated(): assert_allclose(km.cluster_centers_, [[-1], [1]]) -def test_minibatch_kmeans_partial_fit_int_data(): - # Issue GH #14314 - X = np.array([[-1], [1]], dtype=np.int) - km = MiniBatchKMeans(n_clusters=2) - km.partial_fit(X) - assert km.cluster_centers_.dtype.kind == "f" - - -def test_result_of_kmeans_equal_in_diff_n_threads(): - # Check that KMeans gives the same results in parallel mode than in - # sequential mode. 
- rnd = np.random.RandomState(0) - X = rnd.normal(size=(50, 10)) - - with threadpool_limits(limits=1, user_api="openmp"): - result_1 = KMeans( - n_clusters=3, random_state=0).fit(X).labels_ - with threadpool_limits(limits=2, user_api="openmp"): - result_2 = KMeans( - n_clusters=3, random_state=0).fit(X).labels_ - assert_array_equal(result_1, result_2) - - -@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) -def test_precompute_distance_deprecated(precompute_distances): - # FIXME: remove in 0.25 - depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " - "will be removed in 0.25.") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - precompute_distances=precompute_distances) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - -@pytest.mark.parametrize("n_jobs", [None, 1]) -def test_n_jobs_deprecated(n_jobs): - # FIXME: remove in 0.25 - depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 0.25.") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - n_jobs=n_jobs) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - -def test_warning_elkan_1_cluster(): - X, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=0) - kmeans = KMeans(n_clusters=1, n_init=1, init='random', random_state=0, - algorithm='elkan') - - with pytest.warns(RuntimeWarning, - match="algorithm='elkan' doesn't make sense for a single" - " cluster"): - kmeans.fit(X) - - -def test_error_wrong_algorithm(): - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - algorithm='wrong') - - with pytest.raises(ValueError, - match="Algorithm must be 'auto', 'full' or 'elkan'"): - kmeans.fit(X) - +@pytest.mark.parametrize("representation", ["dense", "sparse"]) +def test_relocate_empty_clusters(representation): + # test for the _relocate_empty_clusters_(dense/sparse) helpers -@pytest.mark.parametrize("array_constr", - [np.array, sp.csr_matrix], - ids=['dense', 'sparse']) -@pytest.mark.parametrize("algo", ['full', 'elkan']) -def test_k_means_1_iteration(array_constr, algo): - # check the results after a single iteration (E-step M-step E-step) by - # comparing against a pure python implementation. - X = np.random.RandomState(0).uniform(size=(100, 5)) - init_centers = X[:5] - X = array_constr(X) + # Synthetic dataset with 3 obvious clusters of different sizes + X = np.array( + [-10., -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) + if representation == "sparse": + X = sp.csr_matrix(X) + sample_weight = np.ones(10) - def py_kmeans(X, init): - new_centers = init.copy() - labels = pairwise_distances_argmin(X, init) - for label in range(init.shape[0]): - new_centers[label] = X[labels == label].mean(axis=0) - labels = pairwise_distances_argmin(X, new_centers) - return labels, new_centers + # centers all initialized to the first point of X + centers_old = np.array([-10., -10, -10]).reshape(-1, 1) - py_labels, py_centers = py_kmeans(X, init_centers) + # With this initialization, all points will be assigned to the first center + # At this point a center in centers_new is the weighted sum of the points + # it contains if it's not empty, otherwise it is the same as before. 
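+    # The weighted sum of the 10 points is -16.5, hence the first entry of
+    # centers_new below; the two other centers are empty and keep their old
+    # position of -10.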
+    centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1)
+    weight_in_clusters = np.array([10., 0, 0])
+    labels = np.zeros(10, dtype=np.int32)
-
+    if representation == "dense":
+        _relocate_empty_clusters_dense(X, sample_weight, centers_old,
+                                       centers_new, weight_in_clusters, labels)
+    else:
+        _relocate_empty_clusters_sparse(X.data, X.indices, X.indptr,
+                                        sample_weight, centers_old,
+                                        centers_new, weight_in_clusters,
+                                        labels)
-
+    # The relocation scheme will take the 2 points farthest from the center and
+    # assign them to the 2 empty clusters, i.e. the points at 10 and at 9.5.
+    # The first center will be updated to contain the other 8 points.
+    assert_array_equal(weight_in_clusters, [8, 1, 1])
+    assert_allclose(centers_new, [[-36], [10], [9.5]])
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 @pytest.mark.parametrize("squared", [True, False])
 def test_euclidean_distance(dtype, squared):
+    # Check that the _euclidean_(dense/sparse)_dense helpers produce correct
+    # results
     rng = np.random.RandomState(0)
     a_sparse = sp.random(1, 100, density=0.5, format="csr", random_state=rng,
                          dtype=dtype)
@@ -1150,6 +592,7 @@ def test_euclidean_distance(dtype, squared):
 
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_inertia(dtype):
+    # Check that the _inertia_(dense/sparse) helpers produce correct results.
     rng = np.random.RandomState(0)
     X_sparse = sp.random(100, 10, density=0.5, format="csr", random_state=rng,
                          dtype=dtype)
@@ -1167,3 +610,186 @@ def test_inertia(dtype):
     assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6)
     assert_allclose(inertia_dense, expected, rtol=1e-6)
     assert_allclose(inertia_sparse, expected, rtol=1e-6)
+
+
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_verbose(estimator):
+    # Check verbose mode of KMeans and MiniBatchKMeans for better coverage.
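+    # The fit is run with stdout redirected to a buffer so the verbose output
+    # is exercised without polluting the test logs.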
+ km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + km.fit(X) + finally: + sys.stdout = old_stdout + + +def test_k_means_function(): + # test calling the k_means function directly + cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, + sample_weight=None) + + assert cluster_centers.shape == (n_clusters, n_features) + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + assert inertia > 0.0 + + +def test_minibatch_kmeans_init_size(): + # Check the internal _init_size attribute of MiniBatchKMeans + + # default init size should be 3 * batch_size + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1).fit(X) + assert km._init_size == 15 + + # if 3 * batch size < n_clusters, it should then be 3 * n_clusters + km = MiniBatchKMeans(n_clusters=10, batch_size=1, n_init=1).fit(X) + assert km._init_size == 30 + + # it should not be larger than n_samples + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1, + init_size=n_samples + 1).fit(X) + assert km._init_size == n_samples + + +def test_minibatch_kmeans_partial_fit(): + # Check fitting using the partial_fit API + km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42) + + for X_minibatch in np.array_split(X, 10): + km.partial_fit(X_minibatch) + + # compute the labeling on the complete dataset + labels = km.predict(X) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + + +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_wrong_params(estimator): + # Check that error are raised with clear error message when wrong values + # are passed for the parameters + with pytest.raises(ValueError, match="n_init should be > 0"): + estimator(n_init=0).fit(X) + + with pytest.raises(ValueError, match="max_iter should be > 0"): + estimator(max_iter=0).fit(X) + + with pytest.raises(ValueError, + match=r"n_samples.* should be >= n_clusters"): + estimator(n_clusters=n_samples + 1).fit(X) + + with pytest.raises(ValueError, match="tol should be >= 0"): + estimator(tol=-1).fit(X) + + match = (r"The shape of the initial centers .* does not match " + r"the number of clusters") + with pytest.raises(ValueError, match=match): + estimator(init=X[:2]).fit(X) + with pytest.raises(ValueError, match=match): + estimator(init=lambda X_, k, random_state: X_[:2]).fit(X) + + match = (r"The shape of the initial centers .* does not match " + r"the number of features of the data") + with pytest.raises(ValueError, match=match): + estimator(init=X[:8, :2]).fit(X) + with pytest.raises(ValueError, match=match): + estimator(init=lambda X_, k, random_state: X_[:8, :2]).fit(X) + + with pytest.raises(ValueError, + match=r"init should be either 'k-means\+\+', 'random', " + r"a ndarray or a callable"): + estimator(init="wrong").fit(X) + + +def test_kmeans_wrong_params(): + # Check that error are raised with clear error message when wrong values + # are passed for the parameters specific to KMeans + with pytest.raises(ValueError, + match="Algorithm must be 'auto', 'full' or 'elkan'"): + KMeans(algorithm="wrong").fit(X) + + +def test_minibatch_kmeans_wrong_params(): + # Check that error are raised with clear error message when wrong values + # are passed for the parameters specific to MiniBatchKMeans + with pytest.raises(ValueError, match="max_no_improvement should be >= 0"): + 
MiniBatchKMeans(max_no_improvement=-1).fit(X) + + with pytest.raises(ValueError, match="batch_size should be > 0"): + MiniBatchKMeans(batch_size=-1).fit(X) + + with pytest.raises(ValueError, match="init_size should be > 0"): + MiniBatchKMeans(init_size=-1).fit(X) + + with pytest.raises(ValueError, match="reassignment_ratio should be >= 0"): + MiniBatchKMeans(reassignment_ratio=-1).fit(X) + + +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_warnings(estimator): + # Check warning messages common to KMeans and MiniBatchKMeans + with pytest.warns(RuntimeWarning, + match="Explicit initial center position passed: " + "performing only one init"): + estimator(init=centers, n_clusters=n_clusters).fit(X) + + +def test_kmeans_warnings(): + # Check warning messages specific to KMeans + with pytest.warns(RuntimeWarning, + match="algorithm='elkan' doesn't make sense for a single" + " cluster"): + KMeans(n_clusters=1, algorithm="elkan").fit(X) + + +def test_kmeans_warns_less_centers_than_unique_points(): + # Check KMeans when the number of found clusters is smaller than expected + X = np.asarray([[0, 0], + [0, 1], + [1, 0], + [1, 0]]) # last point is duplicated + km = KMeans(n_clusters=4) + + # KMeans should warn that fewer labels than cluster centers have been used + msg = (r"Number of distinct clusters \(3\) found smaller than " + r"n_clusters \(4\). Possibly due to duplicate points in X.") + with pytest.warns(ConvergenceWarning, match=msg): + km.fit(X) + # only three distinct points, so only three clusters + # can have points assigned to them + assert set(km.labels_) == set(range(3)) + + +def test_minibatch_kmeans_warnings(): + # Check warning messages specific to MiniBatchKMeans + with pytest.warns(RuntimeWarning, + match=r"init_size.* should be larger than n_clusters"): + MiniBatchKMeans(init_size=10, n_clusters=20).fit(X) + + +@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) +def test_precompute_distance_deprecated(precompute_distances): + # FIXME: remove in 0.25 + depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " + "will be removed in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + precompute_distances=precompute_distances) + + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) + + +@pytest.mark.parametrize("n_jobs", [None, 1]) +def test_n_jobs_deprecated(n_jobs): + # FIXME: remove in 0.25 + depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " + "in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + n_jobs=n_jobs) + + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) diff --git a/sklearn/cluster/tests/test_k_means2.py b/sklearn/cluster/tests/test_k_means2.py new file mode 100644 index 0000000000000..4d14c41e42e0d --- /dev/null +++ b/sklearn/cluster/tests/test_k_means2.py @@ -0,0 +1,190 @@ +"""Testing for K-means""" +import sys + +import numpy as np +from scipy import sparse as sp + +from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_almost_equal + +from sklearn.utils.extmath import row_norms +from sklearn.cluster import MiniBatchKMeans +from sklearn.cluster._kmeans import _labels_inertia +from sklearn.cluster._kmeans import _mini_batch_step +from 
sklearn.datasets import make_blobs +from io import StringIO + + +# non centered, sparse centers to check the +centers = np.array([ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], +]) +n_samples = 100 +n_clusters, n_features = centers.shape +X, true_labels = make_blobs(n_samples=n_samples, centers=centers, + cluster_std=1., random_state=42) +X_csr = sp.csr_matrix(X) + + +def test_minibatch_update_consistency(): + # Check that dense and sparse minibatch update give the same results + rng = np.random.RandomState(42) + old_centers = centers + rng.normal(size=centers.shape) + + new_centers = old_centers.copy() + new_centers_csr = old_centers.copy() + + weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) + weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double) + + x_squared_norms = (X ** 2).sum(axis=1) + x_squared_norms_csr = row_norms(X_csr, squared=True) + + buffer = np.zeros(centers.shape[1], dtype=np.double) + buffer_csr = np.zeros(centers.shape[1], dtype=np.double) + + # extract a small minibatch + X_mb = X[:10] + X_mb_csr = X_csr[:10] + x_mb_squared_norms = x_squared_norms[:10] + x_mb_squared_norms_csr = x_squared_norms_csr[:10] + + sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) + + # step 1: compute the dense minibatch update + old_inertia, incremental_diff = _mini_batch_step( + X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, + buffer, 1, None, random_reassign=False) + assert old_inertia > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels, new_inertia = _labels_inertia( + X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) + assert new_inertia > 0.0 + assert new_inertia < old_inertia + + # check that the incremental difference computation is matching the + # final observed value + effective_diff = np.sum((new_centers - old_centers) ** 2) + assert_almost_equal(incremental_diff, effective_diff) + + # step 2: compute the sparse minibatch update + old_inertia_csr, incremental_diff_csr = _mini_batch_step( + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, + weight_sums_csr, buffer_csr, 1, None, random_reassign=False) + assert old_inertia_csr > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels_csr, new_inertia_csr = _labels_inertia( + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) + assert new_inertia_csr > 0.0 + assert new_inertia_csr < old_inertia_csr + + # check that the incremental difference computation is matching the + # final observed value + effective_diff = np.sum((new_centers_csr - old_centers) ** 2) + assert_almost_equal(incremental_diff_csr, effective_diff) + + # step 3: check that sparse and dense updates lead to the same results + assert_array_equal(labels, labels_csr) + assert_array_almost_equal(new_centers, new_centers_csr) + assert_almost_equal(incremental_diff, incremental_diff_csr) + assert_almost_equal(old_inertia, old_inertia_csr) + assert_almost_equal(new_inertia, new_inertia_csr) + + +def test_minibatch_sensible_reassign_fit(): + # check if identical initial clusters are reassigned + # also a regression test for when there are more desired reassignments than + # samples. 
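+    # Zeroing out every other sample creates many identical points, so several
+    # randomly chosen initial centers are likely to coincide and must be
+    # reassigned.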
+ zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, + cluster_std=1., random_state=42) + zeroed_X[::2, :] = 0 + mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, + init="random") + mb_k_means.fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 + + # do the same with batch-size > X.shape[0] (regression test) + mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201, + random_state=42, init="random") + mb_k_means.fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 + + +def test_minibatch_sensible_reassign_partial_fit(): + zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5, + cluster_std=1., random_state=42) + zeroed_X[::2, :] = 0 + mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") + for i in range(100): + mb_k_means.partial_fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 + + +def test_minibatch_reassign(): + # Give a perfect initialization, but a large reassignment_ratio, + # as a result all the centers should be reassigned and the model + # should no longer be good + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + for this_X in (X, X_csr): + mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, + random_state=42) + mb_k_means.fit(this_X) + + score_before = mb_k_means.score(this_X) + try: + old_stdout = sys.stdout + sys.stdout = StringIO() + # Turn on verbosity to smoke test the display code + _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), + mb_k_means.cluster_centers_, + mb_k_means.counts_, + np.zeros(X.shape[1], np.double), + False, distances=np.zeros(X.shape[0]), + random_reassign=True, random_state=42, + reassignment_ratio=1, verbose=True) + finally: + sys.stdout = old_stdout + assert score_before > mb_k_means.score(this_X) + + # Give a perfect initialization, with a small reassignment_ratio, + # no center should be reassigned + for this_X in (X, X_csr): + mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, + init=centers.copy(), + random_state=42, n_init=1) + mb_k_means.fit(this_X) + clusters_before = mb_k_means.cluster_centers_ + # Turn on verbosity to smoke test the display code + _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), + mb_k_means.cluster_centers_, + mb_k_means.counts_, + np.zeros(X.shape[1], np.double), + False, distances=np.zeros(X.shape[0]), + random_reassign=True, random_state=42, + reassignment_ratio=1e-15) + assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) + + +def test_minibatch_with_many_reassignments(): + # Test for the case that the number of clusters to reassign is bigger + # than the batch_size + n_samples = 550 + rnd = np.random.RandomState(42) + X = rnd.uniform(size=(n_samples, 10)) + # Check that the fit works if n_clusters is bigger than the batch_size. 
+ # Run the test with 550 clusters and 550 samples, because it turned out + # that this values ensure that the number of clusters to reassign + # is always bigger than the batch_size + n_clusters = 550 + MiniBatchKMeans(n_clusters=n_clusters, + batch_size=100, + init_size=n_samples, + random_state=42).fit(X) From ca728d5f66daa956f1ea1b4bafb56ebe30c79496 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 28 Feb 2020 10:56:25 +0100 Subject: [PATCH 03/72] wip --- sklearn/cluster/_kmeans.py | 254 ++++++++++----------- sklearn/cluster/tests/test_k_means.py | 291 ++++++++++++------------- sklearn/cluster/tests/test_k_means2.py | 17 +- 3 files changed, 273 insertions(+), 289 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index ad9e7eab1ea2c..f9bc7b8875223 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -968,15 +968,12 @@ def fit(self, X, y=None, sample_weight=None): else: kmeans_single = _kmeans_single_elkan - # seeds for the initializations of the kmeans runs. - seeds = random_state.randint(np.iinfo(np.int32).max, size=self._n_init) - - best_labels, best_inertia, best_centers = None, None, None + best_inertia = None # limit number of threads in second level of nested parallelism # (i.e. BLAS) to avoid oversubsciption. with threadpool_limits(limits=1, user_api="blas"): - for seed in seeds: + for i in range(self._n_init): # Initialize centers centers_init = self._init_centroids( X, x_squared_norms=x_squared_norms, init=init, @@ -988,12 +985,12 @@ def fit(self, X, y=None, sample_weight=None): labels, inertia, centers, n_iter_ = kmeans_single( X, sample_weight, centers_init, max_iter=self.max_iter, verbose=self.verbose, tol=self._tol, - x_squared_norms=x_squared_norms, random_state=seed, + x_squared_norms=x_squared_norms, random_state=random_state, n_threads=self._n_threads) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() + best_labels = labels + best_centers = centers best_inertia = inertia best_n_iter = n_iter_ @@ -1152,9 +1149,8 @@ def score(self, X, y=None, sample_weight=None): def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, - old_center_buffer, compute_squared_diff, - distances, random_reassign=False, - random_state=None, reassignment_ratio=.01, + old_center_buffer, compute_squared_diff, random_state, + random_reassign=False, reassignment_ratio=.01, verbose=False): """Incremental update of the centers for the Minibatch K-Means algorithm. @@ -1177,15 +1173,8 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, The vector in which we keep track of the numbers of elements in a cluster. This array is MODIFIED IN PLACE - distances : array, dtype float, shape (n_samples), optional - If not None, should be a pre-allocated array that will be used to store - the distances of each sample to its closest center. - May not be None when random_reassign is True. - - random_state : int, RandomState instance, default=None - Determines random number generation for centroid initialization and to - pick new clusters amongst observations with uniform probability. Use - an int to make the randomness deterministic. + random_state : RandomState instance + Determines random number generation for low count centers reassignment. See :term:`Glossary `. 
random_reassign : boolean, optional @@ -1218,11 +1207,10 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, """ # Perform label assignment to nearest centers - nearest_center, inertia = _labels_inertia(X, sample_weight, - x_squared_norms, centers) + labels, inertia = _labels_inertia(X, sample_weight, + x_squared_norms, centers) if random_reassign and reassignment_ratio > 0: - random_state = check_random_state(random_state) # Reassign clusters that have very low weight to_reassign = weight_sums < reassignment_ratio * weight_sums.max() # pick at most .5 * batch_size samples as new centers @@ -1256,14 +1244,14 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, if sp.issparse(X): return inertia, _mini_batch_update_csr( X, sample_weight, x_squared_norms, centers, weight_sums, - nearest_center, old_center_buffer, compute_squared_diff) + labels, old_center_buffer, compute_squared_diff) # dense variant in mostly numpy (not as memory efficient though) k = centers.shape[0] squared_diff = 0.0 for center_idx in range(k): # find points from minibatch that are assigned to this center - center_mask = nearest_center == center_idx + center_mask = labels == center_idx wsum = sample_weight[center_mask].sum() if wsum > 0: @@ -1313,26 +1301,24 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol, ewa_diff = centers_squared_diff ewa_inertia = batch_inertia else: - alpha = float(model.batch_size) * 2.0 / (n_samples + 1) - alpha = 1.0 if alpha > 1.0 else alpha + alpha = model.batch_size * 2.0 / (n_samples + 1) + alpha = min(alpha, 1.0) ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha # Log progress to be able to monitor convergence if verbose: - progress_msg = ( - 'Minibatch iteration %d/%d:' - ' mean batch inertia: %f, ewa inertia: %f ' % ( - iteration_idx + 1, n_iter, batch_inertia, - ewa_inertia)) + progress_msg = (f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " + f"mean batch inertia: {batch_inertia}, ewa inertia: " + f"{ewa_inertia}") print(progress_msg) # Early stopping based on absolute tolerance on squared change of # centers position (using EWA smoothing) if tol > 0.0 and ewa_diff <= tol: if verbose: - print('Converged (small centers change) at iteration %d/%d' - % (iteration_idx + 1, n_iter)) + print(f"Converged (small centers change) at iteration " + f"{iteration_idx + 1}/{n_iter}") return True # Early stopping heuristic due to lack of improvement on smoothed inertia @@ -1347,9 +1333,8 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol, if (model.max_no_improvement is not None and no_improvement >= model.max_no_improvement): if verbose: - print('Converged (lack of improvement in inertia)' - ' at iteration %d/%d' - % (iteration_idx + 1, n_iter)) + print(f"Converged (lack of improvement in inertia) at iteration " + f"{iteration_idx}/{n_iter}") return True # update the convergence context to maintain state across successive calls: @@ -1432,7 +1417,8 @@ class MiniBatchKMeans(KMeans): only algorithm is initialized by running a batch KMeans on a random subset of the data. This needs to be larger than n_clusters. - If `None`, `init_size= 3 * batch_size`. + If `None`, the heuristic is `init_size = 3 * batch_size` if + `3 * batch_size < n_clusters`, else `init_size = 3 * n_clusters`. n_init : int, default=3 Number of random initializations that are tried. 
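The convergence test applied after each minibatch (reworked in the hunk above) smooths both the squared center change and the batch inertia with an exponentially weighted average using alpha = min(2 * batch_size / (n_samples + 1), 1), and stops either when the smoothed center change drops below `tol` or when the smoothed inertia has not improved for `max_no_improvement` consecutive batches. Below is a simplified, self-contained sketch of that logic; it replays recorded per-batch statistics instead of keeping state in a context dict across calls, so it is not the actual `_mini_batch_convergence` helper.

def ewa_converged(batch_size, n_samples, tol, max_no_improvement,
                  diffs, inertias):
    """Replay the minibatch early-stopping heuristic on per-batch stats."""
    alpha = min(batch_size * 2.0 / (n_samples + 1), 1.0)
    ewa_diff = ewa_inertia = None
    ewa_inertia_min = None
    no_improvement = 0
    for diff, inertia in zip(diffs, inertias):
        if ewa_diff is None:
            # the first batch seeds the moving averages
            ewa_diff, ewa_inertia = diff, inertia
        else:
            ewa_diff = ewa_diff * (1 - alpha) + diff * alpha
            ewa_inertia = ewa_inertia * (1 - alpha) + inertia * alpha
        if tol > 0.0 and ewa_diff <= tol:
            return True  # centers barely move anymore
        if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min:
            ewa_inertia_min = ewa_inertia
            no_improvement = 0
        else:
            no_improvement += 1
        if (max_no_improvement is not None
                and no_improvement >= max_no_improvement):
            return True  # smoothed inertia stopped improving
    return False

The smoothing is what makes the stopping decision robust to the high variance of single-batch inertia estimates.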
@@ -1458,8 +1444,8 @@ class MiniBatchKMeans(KMeans): inertia_ : float The value of the inertia criterion associated with the chosen partition (if compute_labels is set to True). The inertia is - defined as the sum of square distances of samples to their nearest - neighbor. + defined as the sum of square distances of samples to their cluster + center. See Also -------- @@ -1551,6 +1537,44 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") + def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, + centers): + """Compute labels and inertia using mini batches. + + This is slightly slower than doing everything at once but preventes + memory errors / segfaults. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + x_squared_norms : ndarray of shape (n_samples,) + Precomputed squared euclidean norm of each data point, to speed up + computations. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels for each point. + + inertia : float + Sum of squared distances of points to nearest cluster. + """ + if self.verbose: + print('Computing label assignment and total inertia') + slices = gen_batches(X.shape[0], self.batch_size) + results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], + centers) for s in slices] + labels, inertia = zip(*results) + return np.hstack(labels), np.sum(inertia) + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. @@ -1601,10 +1625,6 @@ def fit(self, X, y=None, sample_weight=None): # disabled old_center_buffer = np.zeros(0, dtype=X.dtype) - distances = np.zeros(self.batch_size, dtype=X.dtype) - n_batches = int(np.ceil(float(n_samples) / self.batch_size)) - n_iter = int(self.max_iter * n_batches) - validation_indices = random_state.randint(0, n_samples, self._init_size) X_valid = X[validation_indices] @@ -1617,25 +1637,14 @@ def fit(self, X, y=None, sample_weight=None): if self.verbose: print(f"Init {init_idx + 1}/{self._n_init} with method {init}") - weight_sums = np.zeros(self.n_clusters, dtype=sample_weight.dtype) - - # TODO: once the `k_means` function works with sparse input we - # should refactor the following init to use it instead. - # Initialize the centers using only a fraction of the data as we - # expect n_samples to be very large when using MiniBatchKMeans + # expect n_samples to be very large when using MiniBatchKMeans. cluster_centers = self._init_centroids( X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # Compute the label assignment on the init dataset - _mini_batch_step( - X_valid, sample_weight_valid, x_squared_norms_valid, - cluster_centers, weight_sums, old_center_buffer, False, - distances=None, verbose=self.verbose) - - # Keep only the best cluster centers across independent inits on - # the common validation set + # Keep the best cluster centers across independent inits based on + # inertia computed on a common validation set. 
_, inertia = _labels_inertia(X_valid, sample_weight_valid, x_squared_norms_valid, cluster_centers) @@ -1644,33 +1653,40 @@ def fit(self, X, y=None, sample_weight=None): f"{inertia}") if best_inertia is None or inertia < best_inertia: self.cluster_centers_ = cluster_centers - self.counts_ = weight_sums best_inertia = inertia - # Empty context to be used inplace by the convergence check routine + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Empty conext to be used inplace by the convergence check routine convergence_context = {} - # Perform the iterative optimization until the final convergence - # criterion + n_batches = int(np.ceil(float(n_samples) / self.batch_size)) + n_iter = int(self.max_iter * n_batches) + + # Perform the iterative optimization until convergence for iteration_idx in range(n_iter): # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint( - 0, n_samples, self.batch_size) + minibatch_indices = random_state.randint(0, n_samples, + self.batch_size) # Perform the actual update step on the minibatch data batch_inertia, centers_squared_diff = _mini_batch_step( - X[minibatch_indices], sample_weight[minibatch_indices], - x_squared_norms[minibatch_indices], - self.cluster_centers_, self.counts_, - old_center_buffer, self._tol > 0.0, distances=distances, + X=X[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + x_squared_norms=x_squared_norms[minibatch_indices], + centers=self.cluster_centers_, + weight_sums=self._counts, + old_center_buffer=old_center_buffer, + compute_squared_diff=self._tol > 0.0, + random_state=random_state, # Here we randomly choose whether to perform # random reassignment: the choice is done as a function # of the iteration index, and the minimum number of # counts, in order to force this reassignment to happen # every once in a while random_reassign=((iteration_idx + 1) - % (10 + int(self.counts_.min())) == 0), - random_state=random_state, + % (10 + int(self._counts.min())) == 0), reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) @@ -1684,56 +1700,17 @@ def fit(self, X, y=None, sample_weight=None): self.n_iter_ = iteration_idx + 1 if self.compute_labels: - self.labels_, self.inertia_ = \ - self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_) + self.labels_, self.inertia_ = self._labels_inertia_minibatch( + X, sample_weight, x_squared_norms, self.cluster_centers_) return self - def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, - centers): - """Compute labels and inertia using mini batches. - - This is slightly slower than doing everything at once but preventes - memory errors / segfaults. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data. - - sample_weight : ndarray of shape (n_samples,) - The weights for each observation in X. - - x_squared_norms : ndarray of shape (n_samples,) - Precomputed squared euclidean norm of each data point, to speed up - computations. - - centers : ndarray of shape (n_clusters, n_features) - The cluster centers. - - Returns - ------- - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - inertia : float - Sum of squared distances of points to nearest cluster. 
- """ - if self.verbose: - print('Computing label assignment and total inertia') - slices = gen_batches(X.shape[0], self.batch_size) - results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - centers) for s in slices] - labels, inertia = zip(*results) - return np.hstack(labels), np.sum(inertia) - def partial_fit(self, X, y=None, sample_weight=None): """Update k means estimate on a single mini-batch X. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Coordinates of the data points to cluster. It must be noted that X will be copied if it is not C-contiguous. @@ -1748,42 +1725,47 @@ def partial_fit(self, X, y=None, sample_weight=None): ------- self """ - - X = check_array(X, accept_sparse="csr", order="C", - dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], + order='C') n_samples, n_features = X.shape - if hasattr(self.init, '__array__'): - self.init = np.ascontiguousarray(self.init, dtype=X.dtype) if n_samples == 0: return self sample_weight = _check_normalize_sample_weight(sample_weight, X) - x_squared_norms = row_norms(X, squared=True) self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) - if (not hasattr(self, 'counts_') - or not hasattr(self, 'cluster_centers_')): - # this is the first call partial_fit on this object: + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if not hasattr(self, 'cluster_centers_'): + # this is the first call partial_fit on this object + + # TODO: check batch size and co may be wrong here + self._check_params(X) + + # Validate init array + init = self.init + if hasattr(init, '__array__'): + init = check_array(init, dtype=X.dtype, copy=True, order='C') + # initialize the cluster centers self.cluster_centers_ = self._init_centroids( - X, x_squared_norms=x_squared_norms, init=self.init, - random_state=self._random_state, init_size=self.init_size) - # TODO: should be self._init_size - # Should check params before + X, x_squared_norms=x_squared_norms, init=init, + random_state=self._random_state, init_size=self._init_size) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) - self.counts_ = np.zeros(self.n_clusters, - dtype=sample_weight.dtype) random_reassign = False - distances = None else: # The lower the minimum count is, the more we do random # reassignment, however, we don't want to do random # reassignment too often, to allow for building up counts random_reassign = self._random_state.randint( - 10 * (1 + self.counts_.min())) == 0 - distances = np.zeros(X.shape[0], dtype=X.dtype) + 10 * (1 + self._counts.min())) == 0 # Raise error if partial_fit called on data with different number # of features. 
@@ -1792,11 +1774,15 @@ def partial_fit(self, X, y=None, sample_weight=None): f"Number of features {X.shape[1]} does not match previous " f"data {self.cluster_centers_.shape[1]}.") - _mini_batch_step(X, sample_weight, x_squared_norms, - self.cluster_centers_, self.counts_, - np.zeros(0, dtype=X.dtype), 0, - random_reassign=random_reassign, distances=distances, + _mini_batch_step(X, + sample_weight=sample_weight, + x_squared_norms=x_squared_norms, + centers=self.cluster_centers_, + weight_sums=self._counts, + old_center_buffer=np.zeros(0, dtype=X.dtype), + compute_squared_diff=False, random_state=self._random_state, + random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index fd48c7b73842c..f98418f037949 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -20,7 +20,6 @@ from sklearn.metrics.cluster import v_measure_score from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans -from sklearn.cluster._kmeans import _labels_inertia from sklearn.cluster._kmeans import _mini_batch_step from sklearn.cluster._kmeans import _check_normalize_sample_weight from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense @@ -60,104 +59,6 @@ def _check_fitted_model(km): assert km.inertia_ > 0.0 -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_kmeans_results(array_constr, algo, dtype): - # Checks that KMeans works as intended on toy dataset by comparing with - # expected results computed by hand. - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) - sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] - init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.1875 - expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) - expected_n_iter = 2 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X, sample_weight=sample_weight) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_allclose(kmeans.inertia_, expected_inertia) - assert_allclose(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -def test_k_means_1_iteration(array_constr, algo): - # check the results after a single iteration (E-step M-step E-step) by - # comparing against a pure python implementation. 
- X = np.random.RandomState(0).uniform(size=(100, 5)) - init_centers = X[:5] - X = array_constr(X) - - def py_kmeans(X, init): - new_centers = init.copy() - labels = pairwise_distances_argmin(X, init) - for label in range(init.shape[0]): - new_centers[label] = X[labels == label].mean(axis=0) - labels = pairwise_distances_argmin(X, new_centers) - return labels, new_centers - - py_labels, py_centers = py_kmeans(X, init_centers) - - cy_kmeans = KMeans(n_clusters=5, n_init=1, init=init_centers, - algorithm=algo, max_iter=1).fit(X) - cy_labels = cy_kmeans.labels_ - cy_centers = cy_kmeans.cluster_centers_ - - assert_array_equal(py_labels, cy_labels) - assert_allclose(py_centers, cy_centers) - - -@pytest.mark.parametrize("distribution", ["normal", "blobs"]) -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) -def test_elkan_results(distribution, array_constr, tol): - # Check that results are identical between lloyd and elkan algorithms - rnd = np.random.RandomState(0) - if distribution == "normal": - X = rnd.normal(size=(5000, 10)) - else: - X, _ = make_blobs(random_state=rnd) - X[X < 0] = 0 - X = array_constr(X) - - km_full = KMeans(algorithm="full", n_clusters=5, - random_state=0, n_init=1, tol=tol) - km_elkan = KMeans(algorithm="elkan", n_clusters=5, - random_state=0, n_init=1, tol=tol) - - km_full.fit(X) - km_elkan.fit(X) - assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) - assert_array_equal(km_elkan.labels_, km_full.labels_) - assert km_elkan.n_iter_ == km_full.n_iter_ - assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) - - -@pytest.mark.parametrize("algorithm", ["full", "elkan"]) -def test_kmeans_convergence(algorithm): - # Check that KMeans stops when convergence is reached when tol=0. (#16075) - # We can only ensure that if the number of threads is not to large, - # otherwise the roundings errors coming from the unpredictability of - # the order in which chunks are processed make the convergence criterion - # to never be exactly 0. - rnd = np.random.RandomState(0) - X = rnd.normal(size=(5000, 10)) - - with threadpool_limits(limits=1, user_api="openmp"): - km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, - n_init=1, tol=0, max_iter=300).fit(X) - - assert km.n_iter_ < 300 - - @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -171,7 +72,7 @@ def test_all_init(estimator, data, init): @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_result_of_kmeans_equal_in_diff_n_threads(estimator): +def test_result_equal_in_diff_n_threads(estimator): # Check that KMeans gives the same results in parallel mode than in # sequential mode. rnd = np.random.RandomState(0) @@ -269,17 +170,6 @@ def test_fortran_aligned_data(estimator): assert_array_equal(km_c.labels_, km_f.labels_) -def test_k_means_copyx(): - # Check that copy_x=False returns nearly equal X after de-centering. 
- my_X = X.copy() - km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) - km.fit(my_X) - _check_fitted_model(km) - - # check that my_X is de-centered - assert_allclose(my_X, X) - - @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_centers_not_mutated(estimator, dtype): @@ -471,8 +361,129 @@ def test_fit_transform(estimator): assert_allclose(X1, X2) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_verbose(estimator): + # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. + km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + km.fit(X) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_kmeans_results(array_constr, algo, dtype): + # Checks that KMeans works as intended on toy dataset by comparing with + # expected results computed by hand. + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) + sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] + init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.1875 + expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) + expected_n_iter = 2 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X, sample_weight=sample_weight) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_k_means_1_iteration(array_constr, algo): + # check the results after a single iteration (E-step M-step E-step) by + # comparing against a pure python implementation. 
+ X = np.random.RandomState(0).uniform(size=(100, 5)) + init_centers = X[:5] + X = array_constr(X) + + def py_kmeans(X, init): + new_centers = init.copy() + labels = pairwise_distances_argmin(X, init) + for label in range(init.shape[0]): + new_centers[label] = X[labels == label].mean(axis=0) + labels = pairwise_distances_argmin(X, new_centers) + return labels, new_centers + + py_labels, py_centers = py_kmeans(X, init_centers) + + cy_kmeans = KMeans(n_clusters=5, n_init=1, init=init_centers, + algorithm=algo, max_iter=1).fit(X) + cy_labels = cy_kmeans.labels_ + cy_centers = cy_kmeans.cluster_centers_ + + assert_array_equal(py_labels, cy_labels) + assert_allclose(py_centers, cy_centers) + + +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) +def test_kmeans_elkan_results(distribution, array_constr, tol): + # Check that results are identical between lloyd and elkan algorithms + rnd = np.random.RandomState(0) + if distribution == "normal": + X = rnd.normal(size=(5000, 10)) + else: + X, _ = make_blobs(random_state=rnd) + X[X < 0] = 0 + X = array_constr(X) + + km_full = KMeans(algorithm="full", n_clusters=5, + random_state=0, n_init=1, tol=tol) + km_elkan = KMeans(algorithm="elkan", n_clusters=5, + random_state=0, n_init=1, tol=tol) + + km_full.fit(X) + km_elkan.fit(X) + assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) + assert_array_equal(km_elkan.labels_, km_full.labels_) + assert km_elkan.n_iter_ == km_full.n_iter_ + assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) + + +@pytest.mark.parametrize("algorithm", ["full", "elkan"]) +def test_kmeans_convergence(algorithm): + # Check that KMeans stops when convergence is reached when tol=0. (#16075) + # We can only ensure that if the number of threads is not to large, + # otherwise the roundings errors coming from the unpredictability of + # the order in which chunks are processed make the convergence criterion + # to never be exactly 0. + rnd = np.random.RandomState(0) + X = rnd.normal(size=(5000, 10)) + + with threadpool_limits(limits=1, user_api="openmp"): + km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, + n_init=1, tol=0, max_iter=300).fit(X) + + assert km.n_iter_ < 300 + + +def test_kmeans_copyx(): + # Check that copy_x=False returns nearly equal X after de-centering. 
+ my_X = X.copy() + km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) + km.fit(my_X) + _check_fitted_model(km) + + # check that my_X is de-centered + assert_allclose(my_X, X) + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -def test_k_means_init_fitted_centers(data): +def test_kmeans_init_fitted_centers(data): # Check that starting fitting from a local optimum shouldn't change the # solution km1 = KMeans(n_clusters=n_clusters).fit(data) @@ -515,7 +526,7 @@ def test_kmeans_relocated_clusters(array_constr, algo): @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) -def test_k_means_empty_cluster_relocated(array_constr): +def test_kmeans_empty_cluster_relocated(array_constr): # check that empty clusters are correctly relocated when using sample # weights (#13486) X = array_constr([[-1], [1]]) @@ -612,18 +623,6 @@ def test_inertia(dtype): assert_allclose(inertia_sparse, expected, rtol=1e-6) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_verbose(estimator): - # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. - km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - km.fit(X) - finally: - sys.stdout = old_stdout - - def test_k_means_function(): # test calling the k_means function directly cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, @@ -670,17 +669,17 @@ def test_minibatch_kmeans_partial_fit(): def test_wrong_params(estimator): # Check that error are raised with clear error message when wrong values # are passed for the parameters - with pytest.raises(ValueError, match="n_init should be > 0"): + with pytest.raises(ValueError, match=r"n_init should be > 0"): estimator(n_init=0).fit(X) - with pytest.raises(ValueError, match="max_iter should be > 0"): + with pytest.raises(ValueError, match=r"max_iter should be > 0"): estimator(max_iter=0).fit(X) with pytest.raises(ValueError, match=r"n_samples.* should be >= n_clusters"): estimator(n_clusters=n_samples + 1).fit(X) - with pytest.raises(ValueError, match="tol should be >= 0"): + with pytest.raises(ValueError, match=r"tol should be >= 0"): estimator(tol=-1).fit(X) match = (r"The shape of the initial centers .* does not match " @@ -702,29 +701,27 @@ def test_wrong_params(estimator): r"a ndarray or a callable"): estimator(init="wrong").fit(X) + # specific to KMeans + if estimator is KMeans: + with pytest.raises(ValueError, match=r"Algorithm must be 'auto', " + r"'full' or 'elkan'"): + KMeans(algorithm="wrong").fit(X) -def test_kmeans_wrong_params(): - # Check that error are raised with clear error message when wrong values - # are passed for the parameters specific to KMeans - with pytest.raises(ValueError, - match="Algorithm must be 'auto', 'full' or 'elkan'"): - KMeans(algorithm="wrong").fit(X) - - -def test_minibatch_kmeans_wrong_params(): - # Check that error are raised with clear error message when wrong values - # are passed for the parameters specific to MiniBatchKMeans - with pytest.raises(ValueError, match="max_no_improvement should be >= 0"): - MiniBatchKMeans(max_no_improvement=-1).fit(X) + # specific to MiniBatchKMeans + if estimator is MiniBatchKMeans: + with pytest.raises(ValueError, match=r"max_no_improvement should be " + r">= 0"): + MiniBatchKMeans(max_no_improvement=-1).fit(X) - with pytest.raises(ValueError, match="batch_size should be > 0"): - MiniBatchKMeans(batch_size=-1).fit(X) + with 
pytest.raises(ValueError, match=r"batch_size should be > 0"): + MiniBatchKMeans(batch_size=-1).fit(X) - with pytest.raises(ValueError, match="init_size should be > 0"): - MiniBatchKMeans(init_size=-1).fit(X) + with pytest.raises(ValueError, match=r"init_size should be > 0"): + MiniBatchKMeans(init_size=-1).fit(X) - with pytest.raises(ValueError, match="reassignment_ratio should be >= 0"): - MiniBatchKMeans(reassignment_ratio=-1).fit(X) + with pytest.raises(ValueError, match=r"reassignment_ratio should be " + r">= 0"): + MiniBatchKMeans(reassignment_ratio=-1).fit(X) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) diff --git a/sklearn/cluster/tests/test_k_means2.py b/sklearn/cluster/tests/test_k_means2.py index 4d14c41e42e0d..7df2bf1b0efb3 100644 --- a/sklearn/cluster/tests/test_k_means2.py +++ b/sklearn/cluster/tests/test_k_means2.py @@ -57,7 +57,7 @@ def test_minibatch_update_consistency(): # step 1: compute the dense minibatch update old_inertia, incremental_diff = _mini_batch_step( X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, - buffer, 1, None, random_reassign=False) + buffer, 1, np.random.RandomState(0), random_reassign=False) assert old_inertia > 0.0 # compute the new inertia on the same batch to check that it decreased @@ -74,7 +74,8 @@ def test_minibatch_update_consistency(): # step 2: compute the sparse minibatch update old_inertia_csr, incremental_diff_csr = _mini_batch_step( X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, - weight_sums_csr, buffer_csr, 1, None, random_reassign=False) + weight_sums_csr, buffer_csr, 1, np.random.RandomState(0), + random_reassign=False) assert old_inertia_csr > 0.0 # compute the new inertia on the same batch to check that it decreased @@ -145,10 +146,10 @@ def test_minibatch_reassign(): # Turn on verbosity to smoke test the display code _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), mb_k_means.cluster_centers_, - mb_k_means.counts_, + mb_k_means._counts, np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, + False, random_state=np.random.RandomState(0), + random_reassign=True, reassignment_ratio=1, verbose=True) finally: sys.stdout = old_stdout @@ -165,10 +166,10 @@ def test_minibatch_reassign(): # Turn on verbosity to smoke test the display code _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), mb_k_means.cluster_centers_, - mb_k_means.counts_, + mb_k_means._counts, np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, + False, random_state=np.random.RandomState(0), + random_reassign=True, reassignment_ratio=1e-15) assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) From b799aebf93ae549d7b9128af0849f58e2c83b215 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 2 Mar 2020 11:39:58 +0100 Subject: [PATCH 04/72] wip --- sklearn/cluster/_kmeans.py | 32 ++++++++------------------- sklearn/cluster/tests/test_k_means.py | 16 +++----------- 2 files changed, 12 insertions(+), 36 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index f9bc7b8875223..6c50eec0a1ee4 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -140,21 +140,6 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, ############################################################################### # K-means batch estimation by EM (expectation maximization) -def 
_check_normalize_sample_weight(sample_weight, X): - """Set sample_weight if None, and check for correct dtype""" - - sample_weight_was_none = sample_weight is None - - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - if not sample_weight_was_none: - # normalize the weights to sum up to n_samples - # an array of 1 (i.e. samples_weight is None) is already normalized - n_samples = len(sample_weight) - scale = n_samples / sample_weight.sum() - sample_weight *= scale - return sample_weight - - def k_means(X, n_clusters, sample_weight=None, init='k-means++', precompute_distances='deprecated', n_init=10, max_iter=300, verbose=False, tol=1e-4, random_state=None, copy_x=True, @@ -691,7 +676,8 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): Labels of each point inertia_ : float - Sum of squared distances of samples to their closest cluster center. + Sum of squared distances of samples to their closest cluster center, + weighted by the sample weights if provided. n_iter_ : int Number of iterations run. @@ -940,7 +926,7 @@ def fit(self, X, y=None, sample_weight=None): X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', copy=self.copy_x, accept_large_sparse=False) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) random_state = check_random_state(self.random_state) @@ -1113,7 +1099,7 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) return _labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_, self._n_threads)[0] @@ -1142,7 +1128,7 @@ def score(self, X, y=None, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) return -_labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_, self._n_threads)[1] @@ -1445,7 +1431,7 @@ class MiniBatchKMeans(KMeans): The value of the inertia criterion associated with the chosen partition (if compute_labels is set to True). The inertia is defined as the sum of square distances of samples to their cluster - center. + center, weighted by the sample weights if provided. 
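With `_check_sample_weight` replacing the old normalizing helper, the weights are no longer rescaled to sum to `n_samples`, so the reported inertia is the sum of squared distances weighted by the raw sample weights. A quick hand check on the toy fixture used by `test_kmeans_results` later in this patch shows why its expected inertia changes from 0.1875 to 0.375: the weights [3, 1, 1, 3] sum to 8 for 4 samples, so the old rescaling halved them. This is a standalone verification snippet, not part of the patch.

import numpy as np

X = np.array([[0, 0], [0.5, 0], [0.5, 1], [1, 1]])
w = np.array([3, 1, 1, 3])
centers = np.array([[0.125, 0], [0.875, 1]])
labels = np.array([0, 0, 1, 1])

# squared distance of each sample to its assigned center
d2 = ((X - centers[labels]) ** 2).sum(axis=1)

# raw weights: new expected inertia
assert np.isclose((w * d2).sum(), 0.375)
# weights rescaled to sum to n_samples: old expected inertia
assert np.isclose((w * (len(X) / w.sum()) * d2).sum(), 0.1875)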
See Also -------- @@ -1601,7 +1587,7 @@ def fit(self, X, y=None, sample_weight=None): order='C') n_samples, n_features = X.shape - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) random_state = check_random_state(self.random_state) @@ -1732,7 +1718,7 @@ def partial_fit(self, X, y=None, sample_weight=None): if n_samples == 0: return self - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) @@ -1817,7 +1803,7 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) return self._labels_inertia_minibatch( X, sample_weight, x_squared_norms, self.cluster_centers_)[0] diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index f98418f037949..dcc16e904cff7 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -21,7 +21,6 @@ from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _mini_batch_step -from sklearn.cluster._kmeans import _check_normalize_sample_weight from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense from sklearn.cluster._k_means_fast import _relocate_empty_clusters_sparse from sklearn.cluster._k_means_fast import _euclidean_dense_dense_wrapper @@ -87,16 +86,6 @@ def test_result_equal_in_diff_n_threads(estimator): assert_array_equal(result_1, result_2) -def test_check_normalize_sample_weight(): - # Check the check sample weight helper. sample weights should sum to - # n_samples - sample_weight = None - checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) - assert _num_samples(X) == _num_samples(checked_sample_weight) - assert_almost_equal(checked_sample_weight.sum(), _num_samples(X)) - assert X.dtype == checked_sample_weight.dtype - - def _sort_centers(centers): return np.sort(centers, axis=0) @@ -124,6 +113,7 @@ def test_weighted_vs_repeated(estimator, init): # TODO: FIXME if estimator is not MiniBatchKMeans: + assert_allclose(km_weighted.inertia_, km_repeated.inertia_) assert_allclose(_sort_centers(km_weighted.cluster_centers_), _sort_centers(km_repeated.cluster_centers_)) @@ -381,11 +371,11 @@ def test_kmeans_results(array_constr, algo, dtype): # Checks that KMeans works as intended on toy dataset by comparing with # expected results computed by hand. 
X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) - sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] + sample_weight = [3, 1, 1, 3] init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) expected_labels = [0, 0, 1, 1] - expected_inertia = 0.1875 + expected_inertia = 0.375 expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) expected_n_iter = 2 From b5b46f407d7b2ddf82cce086688e726a8452ed6d Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 3 Mar 2020 22:55:52 +0100 Subject: [PATCH 05/72] wip --- sklearn/cluster/_k_means_fast.pyx | 96 +++++++++++++++++++++++++++++++ sklearn/cluster/_kmeans.py | 57 ++++++++++++++---- 2 files changed, 143 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/_k_means_fast.pyx b/sklearn/cluster/_k_means_fast.pyx index 8221b2b15e356..7e9024452e5fd 100644 --- a/sklearn/cluster/_k_means_fast.pyx +++ b/sklearn/cluster/_k_means_fast.pyx @@ -16,6 +16,7 @@ import numpy as np cimport numpy as np cimport cython from cython cimport floating +from cython.parallel cimport prange from libc.math cimport sqrt from ..utils.extmath import row_norms @@ -384,3 +385,98 @@ def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight, - centers[center_idx, feature_idx]) ** 2 return squared_diff + + +def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, + floating[::1] sample_weight, + floating[:, ::1] centers, + floating[::1] weight_sums, + int[::1] labels, + floating[::1] old_center, + bint compute_squared_diff): + cdef: + floating squared_diff = 0 + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j, label + floating weight_sum, tmp, lr + + # for i in prange(n_samples, nogil=True): + for i in range(n_samples): + label = labels[i] + + # update center weight + weight_sum = weight_sums[label] + sample_weight[i] + + # learning rate + if weight_sum > 0: + lr = 1 / weight_sum + + if compute_squared_diff: + for j in range(n_features): + old_center[j] = centers[label, j] + + for j in range(n_features): + centers[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] + + if compute_squared_diff: + for j in range(n_features): + tmp = centers[label, j] - old_center[j] + squared_diff += tmp * tmp + + weight_sums[label] = weight_sum + + return squared_diff + + +def _minibatch_update_dense(np.ndarray[floating, ndim=2, mode='c'] X, + floating[::1] sample_weight, + floating[:, ::1] centers, + floating[::1] weight_sums, + int[::1] labels, + floating[::1] old_center, + bint compute_squared_diff): + cdef: + floating squared_diff = 0 + int n_clusters = centers.shape[0] + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j, k + floating wsum, alpha, tmp + + with nogil: + for i in range(n_clusters): + wsum = 0 + for j in prange(n_samples): + if labels[j] == i: + wsum += sample_weight[j] + + if wsum > 0: + if compute_squared_diff: + for k in prange(n_features): + old_center[k] = centers[i, k] + + # inplace remove previous count scaling + for k in prange(n_features): + centers[i, k] = centers[i, k] * weight_sums[i] + + for j in range(n_samples): + if labels[j] == i: + for k in range(n_features): + centers[i, k] = centers[i, k] + X[j, k] * sample_weight[j] + + # update the count statistics for this center + weight_sums[i] = weight_sums[i] + wsum + + # inplace rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[i] + for k in prange(n_features): + centers[i, k] = centers[i, k] * alpha + + # update the squared diff if necessary + if 
compute_squared_diff: + for k in prange(n_features): + tmp = centers[i, k] - old_center[k] + squared_diff += tmp * tmp + + return squared_diff \ No newline at end of file diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 6c50eec0a1ee4..87141eed933c5 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -31,6 +31,7 @@ from ._k_means_fast import _inertia_dense from ._k_means_fast import _inertia_sparse from ._k_means_fast import _mini_batch_update_csr +from ._k_means_fast import _minibatch_update_dense from ._k_means_lloyd import _lloyd_iter_chunked_dense from ._k_means_lloyd import _lloyd_iter_chunked_sparse from ._k_means_elkan import _init_bounds_dense @@ -1210,8 +1211,8 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, new_centers = random_state.choice(X.shape[0], replace=False, size=n_reassigns) if verbose: - print("[MiniBatchKMeans] Reassigning %i cluster centers." - % n_reassigns) + print(f"[MiniBatchKMeans] Reassigning {n_reassigns} " + f"cluster centers.") if sp.issparse(X) and not sp.issparse(centers): assign_rows_csr( @@ -1232,10 +1233,17 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, X, sample_weight, x_squared_norms, centers, weight_sums, labels, old_center_buffer, compute_squared_diff) - # dense variant in mostly numpy (not as memory efficient though) - k = centers.shape[0] + # dense variant in mostly numpy (not as memory efficient though. + else: + return inertia, _minibatch_update_dense( + X, sample_weight, centers, weight_sums, labels, + old_center_buffer, compute_squared_diff) + + +def _minibatch_update_dense2(X, sample_weight, centers, weight_sums, labels, + old_center_buffer, compute_squared_diff): squared_diff = 0.0 - for center_idx in range(k): + for center_idx in range(centers.shape[0]): # find points from minibatch that are assigned to this center center_mask = labels == center_idx wsum = sample_weight[center_mask].sum() @@ -1264,8 +1272,34 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, if compute_squared_diff: diff = centers[center_idx].ravel() - old_center_buffer.ravel() squared_diff += np.dot(diff, diff) + + return squared_diff - return inertia, squared_diff + +def _minibatch_update_dense3(X, sample_weight, centers, weight_sums, labels, + old_center_buffer, compute_squared_diff): + squared_diff = 0.0 + for i in range(X.shape[0]): + label = labels[i] + + # update center weight + weight_sums[label] += sample_weight[i] + + # learning rate + if weight_sums[label] > 0: + lr = 1 / weight_sums[label] + + if compute_squared_diff: + old_center_buffer[:] = centers[label] + + centers[label] *= (1 - lr) + centers[label] += lr * X[i] + + if compute_squared_diff: + diff = centers[label].ravel() - old_center_buffer.ravel() + squared_diff += np.dot(diff, diff) + + return squared_diff def _mini_batch_convergence(model, iteration_idx, n_iter, tol, @@ -1433,6 +1467,9 @@ class MiniBatchKMeans(KMeans): defined as the sum of square distances of samples to their cluster center, weighted by the sample weights if provided. + n_iter_ : int + Number of iterations run. + See Also -------- KMeans @@ -1513,7 +1550,8 @@ def _check_params(self, X): elif self._init_size < self.n_clusters: warnings.warn( f"init_size={self._init_size} should be larger than " - f"n_clusters={self.n_clusters}. Setting it to 3*n_clusters", + f"n_clusters={self.n_clusters}. 
Setting it to " + f"min(3*n_clusters, n_samples)", RuntimeWarning, stacklevel=2) self._init_size = 3 * self.n_clusters self._init_size = min(self._init_size, X.shape[0]) @@ -1713,9 +1751,8 @@ def partial_fit(self, X, y=None, sample_weight=None): """ X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C') - n_samples, n_features = X.shape - if n_samples == 0: + if X.shape[0] == 0: return self sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) @@ -1729,7 +1766,7 @@ def partial_fit(self, X, y=None, sample_weight=None): if not hasattr(self, 'cluster_centers_'): # this is the first call partial_fit on this object - # TODO: check batch size and co may be wrong here + # TODO: should we disable checks of unused params ? self._check_params(X) # Validate init array From 6fb23335b4ad8a421e411e7300bc3713d5271933 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 5 Mar 2020 15:43:03 +0100 Subject: [PATCH 06/72] wip --- sklearn/cluster/_k_means_fast.pyx | 164 +++++++++++++++++------------ sklearn/cluster/_k_means_lloyd.pyx | 2 +- sklearn/cluster/_kmeans.py | 81 +++++++------- 3 files changed, 138 insertions(+), 109 deletions(-) diff --git a/sklearn/cluster/_k_means_fast.pyx b/sklearn/cluster/_k_means_fast.pyx index 7e9024452e5fd..f9ba2245d43c9 100644 --- a/sklearn/cluster/_k_means_fast.pyx +++ b/sklearn/cluster/_k_means_fast.pyx @@ -16,8 +16,9 @@ import numpy as np cimport numpy as np cimport cython from cython cimport floating -from cython.parallel cimport prange +from cython.parallel cimport parallel, prange from libc.math cimport sqrt +from libc.stdlib cimport malloc, free from ..utils.extmath import row_norms @@ -401,82 +402,109 @@ def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, int i, j, label floating weight_sum, tmp, lr - # for i in prange(n_samples, nogil=True): - for i in range(n_samples): - label = labels[i] - - # update center weight - weight_sum = weight_sums[label] + sample_weight[i] + with nogil: + # for i in prange(n_samples, nogil=True): + for i in range(n_samples): + label = labels[i] - # learning rate - if weight_sum > 0: - lr = 1 / weight_sum + # update center weight + weight_sum = weight_sums[label] + sample_weight[i] - if compute_squared_diff: - for j in range(n_features): - old_center[j] = centers[label, j] + # learning rate + if weight_sum > 0: + lr = 1 / weight_sum - for j in range(n_features): - centers[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] + if compute_squared_diff: + for j in range(n_features): + old_center[j] = centers[label, j] - if compute_squared_diff: for j in range(n_features): - tmp = centers[label, j] - old_center[j] - squared_diff += tmp * tmp - - weight_sums[label] = weight_sum - + centers[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] + + if compute_squared_diff: + for j in range(n_features): + tmp = centers[label, j] - old_center[j] + squared_diff += tmp * tmp + + weight_sums[label] = weight_sum + return squared_diff -def _minibatch_update_dense(np.ndarray[floating, ndim=2, mode='c'] X, - floating[::1] sample_weight, - floating[:, ::1] centers, - floating[::1] weight_sums, - int[::1] labels, - floating[::1] old_center, - bint compute_squared_diff): +def _minibatch_update_dense( + np.ndarray[floating, ndim=2, mode='c'] X, + floating[::1] sample_weight, + floating[:, ::1] centers, + floating[:, ::1] centers_new, + floating[::1] weight_sums, + int[::1] labels): + """""" cdef: - floating squared_diff = 0 - int n_clusters = centers.shape[0] int n_samples = 
X.shape[0] - int n_features = X.shape[1] - int i, j, k - floating wsum, alpha, tmp - - with nogil: - for i in range(n_clusters): - wsum = 0 - for j in prange(n_samples): - if labels[j] == i: - wsum += sample_weight[j] + int n_clusters = centers.shape[0] + int i - if wsum > 0: - if compute_squared_diff: - for k in prange(n_features): - old_center[k] = centers[i, k] - - # inplace remove previous count scaling - for k in prange(n_features): - centers[i, k] = centers[i, k] * weight_sums[i] - - for j in range(n_samples): - if labels[j] == i: - for k in range(n_features): - centers[i, k] = centers[i, k] + X[j, k] * sample_weight[j] - - # update the count statistics for this center - weight_sums[i] = weight_sums[i] + wsum - - # inplace rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] - for k in prange(n_features): - centers[i, k] = centers[i, k] * alpha - - # update the squared diff if necessary - if compute_squared_diff: - for k in prange(n_features): - tmp = centers[i, k] - old_center[k] - squared_diff += tmp * tmp + int *indices - return squared_diff \ No newline at end of file + with nogil, parallel(): + indices = malloc(n_samples * sizeof(int)) + + for i in prange(n_clusters): + update_cluster(i, &X[0, 0], centers, centers_new, labels, + sample_weight, weight_sums, indices) + + free(indices) + + +cdef void update_cluster( + int i, + floating *X, + floating[:, ::1] centers, + floating[:, ::1] centers_new, + int[::1] labels, + floating[::1] sample_weight, + floating[::1] weight_sums, + int *indices) nogil: + """""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers.shape[1] + floating alpha, tmp + int n_indices + int j, k, idx + + floating wsum = 0 + + # indices = np.where(labels == i) + k = 0 + for j in range(n_samples): + if labels[j] == i: + indices[k] = j + k += 1 + n_indices = k + + for j in range(n_indices): + idx = indices[j] + wsum += sample_weight[idx] + + if wsum > 0: + # inplace remove previous count scaling + for k in range(n_features): + centers_new[i, k] = centers[i, k] * weight_sums[i] + + # update cluster with new point members + for j in range(n_indices): + idx = indices[j] + for k in range(n_features): + centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] + + # update the count statistics for this center + weight_sums[i] += wsum + + # inplace rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[i] + for k in range(n_features): + centers_new[i, k] *= alpha + else: + for k in range(n_features): + centers_new[i, k] = centers[i, k] diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index 93e2c6f0b9c89..747c841f6fe11 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ b/sklearn/cluster/_k_means_lloyd.pyx @@ -11,7 +11,7 @@ cimport numpy as np from cython cimport floating from cython.parallel import prange, parallel from libc.stdlib cimport malloc, calloc, free -from libc.string cimport memset, memcpy +from libc.string cimport memset from libc.float cimport DBL_MAX, FLT_MAX from ..utils.extmath import row_norms diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 87141eed933c5..623643e61f511 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -32,6 +32,7 @@ from ._k_means_fast import _inertia_sparse from ._k_means_fast import _mini_batch_update_csr from ._k_means_fast import _minibatch_update_dense +from ._k_means_fast import _minibatch_update_dense4 from ._k_means_lloyd import _lloyd_iter_chunked_dense from 
._k_means_lloyd import _lloyd_iter_chunked_sparse from ._k_means_elkan import _init_bounds_dense @@ -1136,7 +1137,7 @@ def score(self, X, y=None, sample_weight=None): def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, - old_center_buffer, compute_squared_diff, random_state, + centers_new, compute_squared_diff, random_state, random_reassign=False, reassignment_ratio=.01, verbose=False): """Incremental update of the centers for the Minibatch K-Means algorithm. @@ -1197,6 +1198,18 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, centers) + # implementation for the sparse CSR representation completely written in + # cython + if sp.issparse(X): + _mini_batch_update_csr( + X, sample_weight, x_squared_norms, centers, weight_sums, + labels, centers_new, compute_squared_diff) + + # dense variant in mostly numpy (not as memory efficient though. + else: + _minibatch_update_dense( + X, sample_weight, centers, centers_new, weight_sums, labels) + if random_reassign and reassignment_ratio > 0: # Reassign clusters that have very low weight to_reassign = weight_sums < reassignment_ratio * weight_sums.max() @@ -1208,6 +1221,7 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, n_reassigns = to_reassign.sum() if n_reassigns: # Pick new clusters amongst observations with uniform probability + # TODO proba ~ distance like kmeans++ ? new_centers = random_state.choice(X.shape[0], replace=False, size=n_reassigns) if verbose: @@ -1220,63 +1234,42 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, np.where(to_reassign)[0].astype(np.intp, copy=False), centers) else: - centers[to_reassign] = X[new_centers] + centers_new[to_reassign] = X[new_centers] # reset counts of reassigned centers, but don't reset them too small # to avoid instant reassignment. This is a pretty dirty hack as it # also modifies the learning rates. weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) - # implementation for the sparse CSR representation completely written in - # cython - if sp.issparse(X): - return inertia, _mini_batch_update_csr( - X, sample_weight, x_squared_norms, centers, weight_sums, - labels, old_center_buffer, compute_squared_diff) - - # dense variant in mostly numpy (not as memory efficient though. 
- else: - return inertia, _minibatch_update_dense( - X, sample_weight, centers, weight_sums, labels, - old_center_buffer, compute_squared_diff) + return inertia, None -def _minibatch_update_dense2(X, sample_weight, centers, weight_sums, labels, - old_center_buffer, compute_squared_diff): - squared_diff = 0.0 - for center_idx in range(centers.shape[0]): +def _minibatch_update_dense3(X, sample_weight, centers, centers_new, + weight_sums, labels): + for i in range(centers.shape[0]): # find points from minibatch that are assigned to this center - center_mask = labels == center_idx - wsum = sample_weight[center_mask].sum() + mask = labels == i + wsum = sample_weight[mask].sum() if wsum > 0: - if compute_squared_diff: - old_center_buffer[:] = centers[center_idx] - # inplace remove previous count scaling - centers[center_idx] *= weight_sums[center_idx] + centers_new[i] = centers[i] * weight_sums[i] # inplace sum with new points members of this cluster - centers[center_idx] += \ - np.sum(X[center_mask] * - sample_weight[center_mask, np.newaxis], axis=0) + centers_new[i] += np.sum( + X[mask] * sample_weight[mask, np.newaxis], axis=0) # update the count statistics for this center - weight_sums[center_idx] += wsum + weight_sums[i] += wsum # inplace rescale to compute mean of all points (old and new) # Note: numpy >= 1.10 does not support '/=' for the following # expression for a mixture of int and float (see numpy issue #6464) - centers[center_idx] = centers[center_idx] / weight_sums[center_idx] - - # update the squared diff if necessary - if compute_squared_diff: - diff = centers[center_idx].ravel() - old_center_buffer.ravel() - squared_diff += np.dot(diff, diff) - - return squared_diff + centers_new[i] /= weight_sums[i] + else: + centers_new[i] = centers[i] -def _minibatch_update_dense3(X, sample_weight, centers, weight_sums, labels, +def _minibatch_update_dense2(X, sample_weight, centers, weight_sums, labels, old_center_buffer, compute_squared_diff): squared_diff = 0.0 for i in range(X.shape[0]): @@ -1676,9 +1669,12 @@ def fit(self, X, y=None, sample_weight=None): print(f"Inertia for init {init_idx + 1}/{self._n_init}: " f"{inertia}") if best_inertia is None or inertia < best_inertia: - self.cluster_centers_ = cluster_centers + init_centers = cluster_centers best_inertia = inertia + centers = init_centers + centers_new = np.empty_like(centers) + # Initialize counts self._counts = np.zeros(self.n_clusters, dtype=X.dtype) @@ -1699,9 +1695,9 @@ def fit(self, X, y=None, sample_weight=None): X=X[minibatch_indices], sample_weight=sample_weight[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], - centers=self.cluster_centers_, + centers=centers, weight_sums=self._counts, - old_center_buffer=old_center_buffer, + centers_new=centers_new, compute_squared_diff=self._tol > 0.0, random_state=random_state, # Here we randomly choose whether to perform @@ -1714,6 +1710,9 @@ def fit(self, X, y=None, sample_weight=None): reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) + centers_squared_diff = np.sum((centers_new - centers)**2) + centers, centers_new = centers_new, centers + # Monitor convergence and do early stopping if necessary if _mini_batch_convergence( self, iteration_idx, n_iter, self._tol, n_samples, @@ -1721,6 +1720,8 @@ def fit(self, X, y=None, sample_weight=None): verbose=self.verbose): break + self.cluster_centers_ = centers + self.n_iter_ = iteration_idx + 1 if self.compute_labels: From bcaa02255dc7e673a2dd637aa3d829c01928403e Mon Sep 17 00:00:00 2001 From: jeremie 
du boisberranger Date: Fri, 6 Mar 2020 14:50:39 +0100 Subject: [PATCH 07/72] wip --- ...{_k_means_fast.pxd => _k_means_common.pxd} | 0 ...{_k_means_fast.pyx => _k_means_common.pyx} | 230 ------------ sklearn/cluster/_k_means_elkan.pyx | 12 +- sklearn/cluster/_k_means_lloyd.pyx | 6 +- sklearn/cluster/_k_means_minibatch.pyx | 255 ++++++++++++++ sklearn/cluster/_kmeans.py | 327 ++++++++---------- sklearn/cluster/setup.py | 9 +- sklearn/cluster/tests/test_k_means.py | 27 +- 8 files changed, 428 insertions(+), 438 deletions(-) rename sklearn/cluster/{_k_means_fast.pxd => _k_means_common.pxd} (100%) rename sklearn/cluster/{_k_means_fast.pyx => _k_means_common.pyx} (53%) create mode 100644 sklearn/cluster/_k_means_minibatch.pyx diff --git a/sklearn/cluster/_k_means_fast.pxd b/sklearn/cluster/_k_means_common.pxd similarity index 100% rename from sklearn/cluster/_k_means_fast.pxd rename to sklearn/cluster/_k_means_common.pxd diff --git a/sklearn/cluster/_k_means_fast.pyx b/sklearn/cluster/_k_means_common.pyx similarity index 53% rename from sklearn/cluster/_k_means_fast.pyx rename to sklearn/cluster/_k_means_common.pyx index f9ba2245d43c9..38276a0baa50f 100644 --- a/sklearn/cluster/_k_means_fast.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -14,11 +14,8 @@ import numpy as np cimport numpy as np -cimport cython from cython cimport floating -from cython.parallel cimport parallel, prange from libc.math cimport sqrt -from libc.stdlib cimport malloc, free from ..utils.extmath import row_norms @@ -26,10 +23,6 @@ from ..utils.extmath import row_norms np.import_array() -ctypedef np.float64_t DOUBLE -ctypedef np.int32_t INT - - cdef floating _euclidean_dense_dense( floating* a, # IN floating* b, # IN @@ -285,226 +278,3 @@ cdef void _center_shift( for j in range(n_clusters): center_shift[j] = _euclidean_dense_dense( ¢ers_new[j, 0], ¢ers_old[j, 0], n_features, False) - - -def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight, - np.ndarray[floating, ndim=1] x_squared_norms, - np.ndarray[floating, ndim=2] centers, - np.ndarray[floating, ndim=1] weight_sums, - np.ndarray[INT, ndim=1] nearest_center, - np.ndarray[floating, ndim=1] old_center, - int compute_squared_diff): - """Incremental update of the centers for sparse MiniBatchKMeans. - - Parameters - ---------- - - X : CSR matrix, dtype float - The complete (pre allocated) training set as a CSR matrix. - - centers : array, shape (n_clusters, n_features) - The cluster centers - - counts : array, shape (n_clusters,) - The vector in which we keep track of the numbers of elements in a - cluster - - Returns - ------- - inertia : float - The inertia of the batch prior to centers update, i.e. the sum - of squared distances to the closest center for each sample. This - is the objective function being minimized by the k-means algorithm. - - squared_diff : float - The sum of squared update (squared norm of the centers position - change). If compute_squared_diff is 0, this computation is skipped and - 0.0 is returned instead. - - Both squared diff and inertia are commonly used to monitor the convergence - of the algorithm. 
- """ - cdef: - np.ndarray[floating, ndim=1] X_data = X.data - np.ndarray[int, ndim=1] X_indices = X.indices - np.ndarray[int, ndim=1] X_indptr = X.indptr - unsigned int n_samples = X.shape[0] - unsigned int n_clusters = centers.shape[0] - unsigned int n_features = centers.shape[1] - - unsigned int sample_idx, center_idx, feature_idx - unsigned int k - DOUBLE old_weight_sum, new_weight_sum - DOUBLE center_diff - DOUBLE squared_diff = 0.0 - - # move centers to the mean of both old and newly assigned samples - for center_idx in range(n_clusters): - old_weight_sum = weight_sums[center_idx] - new_weight_sum = old_weight_sum - - # count the number of samples assigned to this center - for sample_idx in range(n_samples): - if nearest_center[sample_idx] == center_idx: - new_weight_sum += sample_weight[sample_idx] - - if new_weight_sum == old_weight_sum: - # no new sample: leave this center as it stands - continue - - # rescale the old center to reflect it previous accumulated weight - # with regards to the new data that will be incrementally contributed - if compute_squared_diff: - old_center[:] = centers[center_idx] - centers[center_idx] *= old_weight_sum - - # iterate of over samples assigned to this cluster to move the center - # location by inplace summation - for sample_idx in range(n_samples): - if nearest_center[sample_idx] != center_idx: - continue - - # inplace sum with new samples that are members of this cluster - # and update of the incremental squared difference update of the - # center position - for k in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): - centers[center_idx, X_indices[k]] += X_data[k] - - # inplace rescale center with updated count - if new_weight_sum > old_weight_sum: - # update the count statistics for this center - weight_sums[center_idx] = new_weight_sum - - # re-scale the updated center with the total new counts - centers[center_idx] /= new_weight_sum - - # update the incremental computation of the squared total - # centers position change - if compute_squared_diff: - for feature_idx in range(n_features): - squared_diff += (old_center[feature_idx] - - centers[center_idx, feature_idx]) ** 2 - - return squared_diff - - -def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, - floating[::1] sample_weight, - floating[:, ::1] centers, - floating[::1] weight_sums, - int[::1] labels, - floating[::1] old_center, - bint compute_squared_diff): - cdef: - floating squared_diff = 0 - int n_samples = X.shape[0] - int n_features = X.shape[1] - int i, j, label - floating weight_sum, tmp, lr - - with nogil: - # for i in prange(n_samples, nogil=True): - for i in range(n_samples): - label = labels[i] - - # update center weight - weight_sum = weight_sums[label] + sample_weight[i] - - # learning rate - if weight_sum > 0: - lr = 1 / weight_sum - - if compute_squared_diff: - for j in range(n_features): - old_center[j] = centers[label, j] - - for j in range(n_features): - centers[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] - - if compute_squared_diff: - for j in range(n_features): - tmp = centers[label, j] - old_center[j] - squared_diff += tmp * tmp - - weight_sums[label] = weight_sum - - return squared_diff - - -def _minibatch_update_dense( - np.ndarray[floating, ndim=2, mode='c'] X, - floating[::1] sample_weight, - floating[:, ::1] centers, - floating[:, ::1] centers_new, - floating[::1] weight_sums, - int[::1] labels): - """""" - cdef: - int n_samples = X.shape[0] - int n_clusters = centers.shape[0] - int i - - int *indices - - with nogil, 
parallel(): - indices = malloc(n_samples * sizeof(int)) - - for i in prange(n_clusters): - update_cluster(i, &X[0, 0], centers, centers_new, labels, - sample_weight, weight_sums, indices) - - free(indices) - - -cdef void update_cluster( - int i, - floating *X, - floating[:, ::1] centers, - floating[:, ::1] centers_new, - int[::1] labels, - floating[::1] sample_weight, - floating[::1] weight_sums, - int *indices) nogil: - """""" - cdef: - int n_samples = sample_weight.shape[0] - int n_features = centers.shape[1] - floating alpha, tmp - int n_indices - int j, k, idx - - floating wsum = 0 - - # indices = np.where(labels == i) - k = 0 - for j in range(n_samples): - if labels[j] == i: - indices[k] = j - k += 1 - n_indices = k - - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - - if wsum > 0: - # inplace remove previous count scaling - for k in range(n_features): - centers_new[i, k] = centers[i, k] * weight_sums[i] - - # update cluster with new point members - for j in range(n_indices): - idx = indices[j] - for k in range(n_features): - centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] - - # update the count statistics for this center - weight_sums[i] += wsum - - # inplace rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] - for k in range(n_features): - centers_new[i, k] *= alpha - else: - for k in range(n_features): - centers_new[i, k] = centers[i, k] diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index e95c8fe0490a4..d4a392a7d2d6d 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -18,12 +18,12 @@ from libc.stdlib cimport calloc, free from libc.string cimport memset, memcpy from ..utils.extmath import row_norms -from ._k_means_fast cimport _relocate_empty_clusters_dense -from ._k_means_fast cimport _relocate_empty_clusters_sparse -from ._k_means_fast cimport _euclidean_dense_dense -from ._k_means_fast cimport _euclidean_sparse_dense -from ._k_means_fast cimport _average_centers -from ._k_means_fast cimport _center_shift +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _euclidean_dense_dense +from ._k_means_common cimport _euclidean_sparse_dense +from ._k_means_common cimport _average_centers +from ._k_means_common cimport _center_shift np.import_array() diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index 747c841f6fe11..00377ae098458 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ b/sklearn/cluster/_k_means_lloyd.pyx @@ -17,9 +17,9 @@ from libc.float cimport DBL_MAX, FLT_MAX from ..utils.extmath import row_norms from ..utils._cython_blas cimport _gemm from ..utils._cython_blas cimport RowMajor, Trans, NoTrans -from ._k_means_fast cimport _relocate_empty_clusters_dense -from ._k_means_fast cimport _relocate_empty_clusters_sparse -from ._k_means_fast cimport _average_centers, _center_shift +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _average_centers, _center_shift np.import_array() diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx new file mode 100644 index 0000000000000..5132d219e6466 --- /dev/null +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -0,0 +1,255 @@ +# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True + +# 
TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This is fixed in cython > 0.3. + +cimport numpy as np +from cython cimport floating +from cython.parallel cimport parallel, prange +from libc.math cimport sqrt +from libc.stdlib cimport malloc, free + + +np.import_array() + + +def _minibatch_update_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels): # IN + """Update of the centers for dense MiniBatchKMeans. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int i + + int *indices + + with nogil, parallel(): + indices = malloc(n_samples * sizeof(int)) + + for i in prange(n_clusters): + update_center_dense(i, &X[0, 0], sample_weight, centers_old, + centers_new, weight_sums, labels, indices) + + free(indices) + + +cdef void update_center_dense( + int i, + floating *X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int *indices) nogil: # OUT + """Update of a single center for dense MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha, tmp + int n_indices + int j, k, idx + + floating wsum = 0 + + # indices = np.where(labels == i)[0] + k = 0 + for j in range(n_samples): + if labels[j] == i: + indices[k] = j + k += 1 + n_indices = k + + for j in range(n_indices): + idx = indices[j] + wsum += sample_weight[idx] + + if wsum > 0: + # Remove previous count scaling + for k in range(n_features): + centers_new[i, k] = centers_old[i, k] * weight_sums[i] + + # Update cluster with new point members + for j in range(n_indices): + idx = indices[j] + for k in range(n_features): + centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] + + # Update the count statistics for this center + weight_sums[i] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[i] + for k in range(n_features): + centers_new[i, k] *= alpha + else: + for k in range(n_features): + centers_new[i, k] = centers_old[i, k] + + +def _minibatch_update_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels): # IN + """Update of the centers for sparse MiniBatchKMeans. 
+ + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int i + + int *indices + + with nogil, parallel(): + indices = malloc(n_samples * sizeof(int)) + + for i in prange(n_clusters): + update_center_sparse(i, X_data, X_indices, X_indptr, sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) + + free(indices) + + +cdef void update_center_sparse( + int i, + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int *indices) nogil: # OUT + """Update of a single center for sparse MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha, tmp + int n_indices + int j, k, idx + + floating wsum = 0 + + # indices = np.where(labels == i)[0] + k = 0 + for j in range(n_samples): + if labels[j] == i: + indices[k] = j + k += 1 + n_indices = k + + for j in range(n_indices): + idx = indices[j] + wsum += sample_weight[idx] + + if wsum > 0: + # Remove previous count scaling + for k in range(n_features): + centers_new[i, k] = centers_old[i, k] * weight_sums[i] + + # Update cluster with new point members + for j in range(n_indices): + idx = indices[j] + for k in range(X_indptr[idx], X_indptr[idx + 1]): + centers_new[i, X_indices[k]] += X_data[k] * sample_weight[idx] + + # Update the count statistics for this center + weight_sums[i] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[i] + for k in range(n_features): + centers_new[i, k] *= alpha + else: + for k in range(n_features): + centers_new[i, k] = centers_old[i, k] + + +def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, + floating[::1] sample_weight, + floating[:, ::1] centers, + floating[:, ::1] centers_new, + floating[::1] weight_sums, + int[::1] labels): + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j, label + floating weight_sum, tmp, lr + + # for i in prange(n_samples, nogil=True): + for i in range(n_samples): + label = labels[i] + + # update center weight + weight_sum = weight_sums[label] + sample_weight[i] + + # learning rate + if weight_sum > 0: + lr = 1 / weight_sum + + for j in range(n_features): + centers_new[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] + else: + centers_new[label, j] = centers[label, j] + + weight_sums[label] = weight_sum \ No newline at end of file diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 
623643e61f511..514783eaf3b51 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -28,11 +28,11 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils._openmp_helpers import _openmp_effective_n_threads from ..exceptions import ConvergenceWarning -from ._k_means_fast import _inertia_dense -from ._k_means_fast import _inertia_sparse -from ._k_means_fast import _mini_batch_update_csr -from ._k_means_fast import _minibatch_update_dense -from ._k_means_fast import _minibatch_update_dense4 +from ._k_means_common import _inertia_dense +from ._k_means_common import _inertia_sparse +from ._k_means_minibatch import _minibatch_update_sparse +from ._k_means_minibatch import _minibatch_update_dense +from ._k_means_minibatch import _minibatch_update_dense4 from ._k_means_lloyd import _lloyd_iter_chunked_dense from ._k_means_lloyd import _lloyd_iter_chunked_sparse from ._k_means_elkan import _init_bounds_dense @@ -1136,89 +1136,80 @@ def score(self, X, y=None, sample_weight=None): self.cluster_centers_, self._n_threads)[1] -def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, - centers_new, compute_squared_diff, random_state, - random_reassign=False, reassignment_ratio=.01, - verbose=False): +def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, + weight_sums, random_state, random_reassign=False, + reassignment_ratio=0.01, verbose=False): """Incremental update of the centers for the Minibatch K-Means algorithm. Parameters ---------- - X : array, shape (n_samples, n_features) - The original data array. + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The original data array. In sparse, must be in CSR format. + + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. - sample_weight : array-like, shape (n_samples,) + sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. - x_squared_norms : array, shape (n_samples,) - Squared euclidean norm of each data point. + # TODO better + centers : ndarray of shape (n_clusters, n_features) + The cluster centers. - centers : array, shape (k, n_features) - The cluster centers. This array is MODIFIED IN PLACE + centers_new : ndarray of shape (n_clusters, n_features) + TODO - counts : array, shape (k,) - The vector in which we keep track of the numbers of elements in a - cluster. This array is MODIFIED IN PLACE + weight_sums : ndarray of shape (n_clusters,) + The vector in which we keep track of the numbers of points in a + cluster. This array is modified in place. random_state : RandomState instance Determines random number generation for low count centers reassignment. See :term:`Glossary `. - random_reassign : boolean, optional + random_reassign : boolean, default=False If True, centers with very low counts are randomly reassigned to observations. - reassignment_ratio : float, optional + reassignment_ratio : float, default=0.01 Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count centers are more likely to be reassigned, which means that the model will take longer to converge, but should converge in a better clustering. - verbose : bool, optional, default False + verbose : bool, default=False Controls the verbosity. - compute_squared_diff : bool - If set to False, the squared diff computation is skipped. - - old_center_buffer : int - Copy of old centers for monitoring convergence. 
- Returns ------- inertia : float Sum of squared distances of samples to their closest cluster center. - - squared_diff : numpy array, shape (n_clusters,) - Squared distances between previous and updated cluster centers. - """ # Perform label assignment to nearest centers labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, centers) - # implementation for the sparse CSR representation completely written in - # cython + # Update centers according to the labels if sp.issparse(X): - _mini_batch_update_csr( - X, sample_weight, x_squared_norms, centers, weight_sums, - labels, centers_new, compute_squared_diff) - - # dense variant in mostly numpy (not as memory efficient though. + _minibatch_update_sparse( + X, sample_weight, centers, centers_new, weight_sums, labels) else: _minibatch_update_dense( X, sample_weight, centers, centers_new, weight_sums, labels) + # Reassign clusters that have very low weight if random_reassign and reassignment_ratio > 0: - # Reassign clusters that have very low weight to_reassign = weight_sums < reassignment_ratio * weight_sums.max() + # pick at most .5 * batch_size samples as new centers if to_reassign.sum() > .5 * X.shape[0]: indices_dont_reassign = \ np.argsort(weight_sums)[int(.5 * X.shape[0]):] to_reassign[indices_dont_reassign] = False n_reassigns = to_reassign.sum() + if n_reassigns: # Pick new clusters amongst observations with uniform probability # TODO proba ~ distance like kmeans++ ? @@ -1228,134 +1219,20 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, print(f"[MiniBatchKMeans] Reassigning {n_reassigns} " f"cluster centers.") - if sp.issparse(X) and not sp.issparse(centers): + if sp.issparse(X): assign_rows_csr( X, new_centers.astype(np.intp, copy=False), np.where(to_reassign)[0].astype(np.intp, copy=False), centers) else: centers_new[to_reassign] = X[new_centers] + # reset counts of reassigned centers, but don't reset them too small # to avoid instant reassignment. This is a pretty dirty hack as it # also modifies the learning rates. 
weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) - return inertia, None - - -def _minibatch_update_dense3(X, sample_weight, centers, centers_new, - weight_sums, labels): - for i in range(centers.shape[0]): - # find points from minibatch that are assigned to this center - mask = labels == i - wsum = sample_weight[mask].sum() - - if wsum > 0: - # inplace remove previous count scaling - centers_new[i] = centers[i] * weight_sums[i] - - # inplace sum with new points members of this cluster - centers_new[i] += np.sum( - X[mask] * sample_weight[mask, np.newaxis], axis=0) - - # update the count statistics for this center - weight_sums[i] += wsum - - # inplace rescale to compute mean of all points (old and new) - # Note: numpy >= 1.10 does not support '/=' for the following - # expression for a mixture of int and float (see numpy issue #6464) - centers_new[i] /= weight_sums[i] - else: - centers_new[i] = centers[i] - - -def _minibatch_update_dense2(X, sample_weight, centers, weight_sums, labels, - old_center_buffer, compute_squared_diff): - squared_diff = 0.0 - for i in range(X.shape[0]): - label = labels[i] - - # update center weight - weight_sums[label] += sample_weight[i] - - # learning rate - if weight_sums[label] > 0: - lr = 1 / weight_sums[label] - - if compute_squared_diff: - old_center_buffer[:] = centers[label] - - centers[label] *= (1 - lr) - centers[label] += lr * X[i] - - if compute_squared_diff: - diff = centers[label].ravel() - old_center_buffer.ravel() - squared_diff += np.dot(diff, diff) - - return squared_diff - - -def _mini_batch_convergence(model, iteration_idx, n_iter, tol, - n_samples, centers_squared_diff, batch_inertia, - context, verbose=0): - """Helper function to encapsulate the early stopping logic""" - # Normalize inertia to be able to compare values when - # batch_size changes - batch_inertia /= model.batch_size - centers_squared_diff /= model.batch_size - - # Compute an Exponentially Weighted Average of the squared - # diff to monitor the convergence while discarding - # minibatch-local stochastic variability: - # https://en.wikipedia.org/wiki/Moving_average - ewa_diff = context.get('ewa_diff') - ewa_inertia = context.get('ewa_inertia') - if ewa_diff is None: - ewa_diff = centers_squared_diff - ewa_inertia = batch_inertia - else: - alpha = model.batch_size * 2.0 / (n_samples + 1) - alpha = min(alpha, 1.0) - ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha - ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha - - # Log progress to be able to monitor convergence - if verbose: - progress_msg = (f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " - f"mean batch inertia: {batch_inertia}, ewa inertia: " - f"{ewa_inertia}") - print(progress_msg) - - # Early stopping based on absolute tolerance on squared change of - # centers position (using EWA smoothing) - if tol > 0.0 and ewa_diff <= tol: - if verbose: - print(f"Converged (small centers change) at iteration " - f"{iteration_idx + 1}/{n_iter}") - return True - - # Early stopping heuristic due to lack of improvement on smoothed inertia - ewa_inertia_min = context.get('ewa_inertia_min') - no_improvement = context.get('no_improvement', 0) - if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: - no_improvement = 0 - ewa_inertia_min = ewa_inertia - else: - no_improvement += 1 - - if (model.max_no_improvement is not None - and no_improvement >= model.max_no_improvement): - if verbose: - print(f"Converged (lack of improvement in inertia) at iteration " - f"{iteration_idx}/{n_iter}") 
- return True - - # update the convergence context to maintain state across successive calls: - context['ewa_diff'] = ewa_diff - context['ewa_inertia'] = ewa_inertia - context['ewa_inertia_min'] = ewa_inertia_min - context['no_improvement'] = no_improvement - return False + return inertia class MiniBatchKMeans(KMeans): @@ -1592,6 +1469,78 @@ def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, labels, inertia = zip(*results) return np.hstack(labels), np.sum(inertia) + def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, + centers_squared_diff, batch_inertia): + """Helper function to encapsulate the early stopping logic""" + # Normalize inertia to be able to compare values when + # batch_size changes + batch_inertia /= self.batch_size + centers_squared_diff /= self.batch_size + + # We skip the first iteration because it would lead to a bad + # initialization of ewa_diff and ewa_inertia. The reason is that + # inertia is computed on centers before they are updated. Before the + # first iteration, centers are not yet the mean of their cluster. + if iteration_idx == 0: + if self.verbose: + print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " + f"mean batch inertia: {batch_inertia}, ewa inertia: " + f"-") + return False + + # Compute an Exponentially Weighted Average of the squared diff to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + ewa_diff = self._ewa_diff + ewa_inertia = self._ewa_inertia + if ewa_diff is None: + ewa_diff = centers_squared_diff + ewa_inertia = batch_inertia + else: + alpha = self.batch_size * 2.0 / (n_samples + 1) + ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha + ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha + + # Log progress to be able to monitor convergence + if self.verbose: + print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " + f"mean batch inertia: {batch_inertia}, ewa inertia: " + f"{ewa_inertia}") + + # Early stopping based on absolute tolerance on squared change of + # centers position (using EWA smoothing) + if self._tol > 0.0 and ewa_diff <= self._tol: + if self.verbose: + print(f"Converged (small centers change) at iteration " + f"{iteration_idx + 1}/{n_iter}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # inertia + ewa_inertia_min = self._ewa_inertia_min + no_improvement = self._no_improvement + if iteration_idx >= 5: + if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: + no_improvement = 0 + ewa_inertia_min = ewa_inertia + else: + no_improvement += 1 + + if (self.max_no_improvement is not None + and no_improvement >= self.max_no_improvement): + if self.verbose: + print(f"Converged (lack of improvement in inertia) at " + f"iteration {iteration_idx}/{n_iter}") + return True + + # update the convergence context to maintain state across successive + # calls: + self._ewa_diff = ewa_diff + self._ewa_inertia = ewa_inertia + self._ewa_inertia_min = ewa_inertia_min + self._no_improvement = no_improvement + return False + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. @@ -1601,6 +1550,8 @@ def fit(self, X, y=None, sample_weight=None): Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. 
y : Ignored Not used, present here for API consistency by convention. @@ -1632,18 +1583,9 @@ def fit(self, X, y=None, sample_weight=None): # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) - if self._tol > 0.0: - # using tol-based early stopping needs the allocation of a - # dedicated before which can be expensive for high dim data: - # hence we allocate it outside of the main loop - old_center_buffer = np.zeros(n_features, dtype=X.dtype) - else: - # no need for the center buffer if tol-based early stopping is - # disabled - old_center_buffer = np.zeros(0, dtype=X.dtype) - validation_indices = random_state.randint(0, n_samples, - self._init_size) + # self._init_size, + self.batch_size) X_valid = X[validation_indices] sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] @@ -1678,51 +1620,55 @@ def fit(self, X, y=None, sample_weight=None): # Initialize counts self._counts = np.zeros(self.n_clusters, dtype=X.dtype) - # Empty conext to be used inplace by the convergence check routine - convergence_context = {} + # Attributes to monitor the convergence + self._ewa_diff = None + self._ewa_inertia = None + self._ewa_inertia_min = None + self._no_improvement = 0 n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) # Perform the iterative optimization until convergence - for iteration_idx in range(n_iter): + for i in range(n_iter): # Sample a minibatch from the full dataset minibatch_indices = random_state.randint(0, n_samples, self.batch_size) + # Here we randomly choose whether to perform random reassignment: + # the choice is done as a function of the iteration index, and the + # minimum number of counts, in order to force this reassignment to + # happen every once in a while. 
+ random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + # Perform the actual update step on the minibatch data - batch_inertia, centers_squared_diff = _mini_batch_step( + batch_inertia = _mini_batch_step( X=X[minibatch_indices], - sample_weight=sample_weight[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], centers=centers, - weight_sums=self._counts, centers_new=centers_new, - compute_squared_diff=self._tol > 0.0, + weight_sums=self._counts, random_state=random_state, - # Here we randomly choose whether to perform - # random reassignment: the choice is done as a function - # of the iteration index, and the minimum number of - # counts, in order to force this reassignment to happen - # every once in a while - random_reassign=((iteration_idx + 1) - % (10 + int(self._counts.min())) == 0), + random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) - centers_squared_diff = np.sum((centers_new - centers)**2) + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers)**2) + else: + centers_squared_diff = 0 + centers, centers_new = centers_new, centers # Monitor convergence and do early stopping if necessary - if _mini_batch_convergence( - self, iteration_idx, n_iter, self._tol, n_samples, - centers_squared_diff, batch_inertia, convergence_context, - verbose=self.verbose): + if self._mini_batch_convergence( + i, n_iter, n_samples, centers_squared_diff, batch_inertia): break self.cluster_centers_ = centers - self.n_iter_ = iteration_idx + 1 + self.n_iter_ = i + 1 if self.compute_labels: self.labels_, self.inertia_ = self._labels_inertia_minibatch( @@ -1799,12 +1745,11 @@ def partial_fit(self, X, y=None, sample_weight=None): f"data {self.cluster_centers_.shape[1]}.") _mini_batch_step(X, - sample_weight=sample_weight, x_squared_norms=x_squared_norms, + sample_weight=sample_weight, centers=self.cluster_centers_, + centers_new=self.cluster_centers_, weight_sums=self._counts, - old_center_buffer=np.zeros(0, dtype=X.dtype), - compute_squared_diff=False, random_state=self._random_state, random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py index 48ed25c5c0eaf..9a85541731e5f 100644 --- a/sklearn/cluster/setup.py +++ b/sklearn/cluster/setup.py @@ -25,8 +25,8 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) - config.add_extension('_k_means_fast', - sources=['_k_means_fast.pyx'], + config.add_extension('_k_means_common', + sources=['_k_means_common.pyx'], include_dirs=[numpy.get_include()], libraries=libraries) @@ -40,6 +40,11 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) + config.add_extension('_k_means_minibatch', + sources=['_k_means_minibatch.pyx'], + include_dirs=[numpy.get_include()], + libraries=libraries) + config.add_subpackage('tests') return config diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index dcc16e904cff7..bf4f0e03f829d 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -72,8 +72,8 @@ def test_all_init(estimator, data, init): @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_result_equal_in_diff_n_threads(estimator): - # Check that KMeans gives the same results in parallel mode than in - # sequential mode. 
+ # Check that KMeans/MiniBatchKMeans give the same results in parallel mode + # than in sequential mode. rnd = np.random.RandomState(0) X = rnd.normal(size=(50, 10)) @@ -132,20 +132,34 @@ def test_unit_weights_vs_no_weights(estimator): assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_scaled_weights(estimator): +def test_scaled_weights(estimator, data): # Check that scaling all sample weights by a common factor # shouldn't change the result sample_weight = np.random.uniform(n_samples) km = estimator(n_clusters=n_clusters, random_state=42) - km_orig = clone(km).fit(X, sample_weight=sample_weight) - km_scaled = clone(km).fit(X, sample_weight=0.5 * sample_weight) + km_orig = clone(km).fit(data, sample_weight=sample_weight) + km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) assert_array_equal(km_orig.labels_, km_scaled.labels_) assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_dense_sparse(estimator): + # Check that the results are the same for dense and sparse input. + sample_weight = np.random.RandomState(0).random_sample((n_samples,)) + km_dense = estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_dense.fit(X, sample_weight=sample_weight) + km_sparse = estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_sparse.fit(X_csr, sample_weight=sample_weight) + + assert_array_equal(km_dense.labels_, km_sparse.labels_) + assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) + + @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_fortran_aligned_data(estimator): # Check that KMeans works with fortran-aligned data. 
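
A standalone sketch of the dense/sparse consistency check exercised by the new
test_dense_sparse test above, assuming locally generated data in place of the
module-level X / X_csr fixtures (the estimator settings below are illustrative,
not the test's exact fixtures):

import numpy as np
import scipy.sparse as sp
from sklearn.cluster import MiniBatchKMeans

# Small synthetic dataset standing in for the test module's X / X_csr.
rng = np.random.RandomState(0)
X_dense = rng.normal(size=(100, 5))
X_sparse = sp.csr_matrix(X_dense)
sample_weight = rng.random_sample(100)

km_dense = MiniBatchKMeans(n_clusters=3, random_state=0, n_init=1)
km_dense.fit(X_dense, sample_weight=sample_weight)
km_sparse = MiniBatchKMeans(n_clusters=3, random_state=0, n_init=1)
km_sparse.fit(X_sparse, sample_weight=sample_weight)

# With identical seeding, the updated minibatch code is expected to produce the
# same labels and centers for dense and CSR input, which is what the test asserts.
np.testing.assert_array_equal(km_dense.labels_, km_sparse.labels_)
np.testing.assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_)
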
@@ -175,9 +189,10 @@ def test_centers_not_mutated(estimator, dtype): assert not np.may_share_memory(km.cluster_centers_, centers) -@pytest.mark.parametrize("data", [X, X_csr], ids=["sparse", "dense"]) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_float_precision(estimator, data): + # TODO km = estimator(n_init=1, random_state=0) inertia = {} From 76c3589affb40acceaf349056329a138a32903ca Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 6 Mar 2020 14:52:22 +0100 Subject: [PATCH 08/72] wip --- sklearn/cluster/_kmeans.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 514783eaf3b51..2c45bf8873b1d 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1530,7 +1530,7 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, and no_improvement >= self.max_no_improvement): if self.verbose: print(f"Converged (lack of improvement in inertia) at " - f"iteration {iteration_idx}/{n_iter}") + f"iteration {iteration_idx}/{n_iter}") return True # update the convergence context to maintain state across successive @@ -1584,8 +1584,7 @@ def fit(self, X, y=None, sample_weight=None): x_squared_norms = row_norms(X, squared=True) validation_indices = random_state.randint(0, n_samples, - # self._init_size, - self.batch_size) + self._init_size) X_valid = X[validation_indices] sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] From 231542d2f1a0795f824bd8818bfed3382bfedd91 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 6 Mar 2020 15:23:15 +0100 Subject: [PATCH 09/72] wip --- sklearn/cluster/_kmeans.py | 46 ++++++++++---------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 2c45bf8873b1d..bf19e681201be 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -283,8 +283,8 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++', def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, - verbose=False, x_squared_norms=None, - random_state=None, tol=1e-4, n_threads=1): + verbose=False, x_squared_norms=None, tol=1e-4, + n_threads=1): """A single run of k-means lloyd, assumes preparation completed prior. Parameters @@ -307,11 +307,6 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, x_squared_norms : array-like, default=None Precomputed x_squared_norms. - random_state : int, RandomState instance, default=None - Determines random number generation for centroid initialization. Use - an int to make the randomness deterministic. - See :term:`Glossary `. - tol : float, default=1e-4 Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive iterations to declare @@ -340,8 +335,6 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, n_iter : int Number of iterations run. 
""" - random_state = check_random_state(random_state) - n_samples = X.shape[0] n_clusters = centers_init.shape[0] @@ -406,8 +399,8 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, - verbose=False, x_squared_norms=None, - random_state=None, tol=1e-4, n_threads=1): + verbose=False, x_squared_norms=None, tol=1e-4, + n_threads=1): """A single run of k-means lloyd, assumes preparation completed prior. Parameters @@ -430,11 +423,6 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, x_squared_norms : ndarray of shape(n_samples,), default=None Precomputed x_squared_norms. - random_state : int, RandomState instance or None, default=None - Determines random number generation for centroid initialization. Use - an int to make the randomness deterministic. - See :term:`Glossary `. - tol : float, default=1e-4 Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive iterations to declare @@ -463,8 +451,6 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, n_iter : int Number of iterations run. """ - random_state = check_random_state(random_state) - n_clusters = centers_init.shape[0] # Buffers to avoid new allocations at each iteration. @@ -1477,17 +1463,6 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, batch_inertia /= self.batch_size centers_squared_diff /= self.batch_size - # We skip the first iteration because it would lead to a bad - # initialization of ewa_diff and ewa_inertia. The reason is that - # inertia is computed on centers before they are updated. Before the - # first iteration, centers are not yet the mean of their cluster. - if iteration_idx == 0: - if self.verbose: - print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " - f"mean batch inertia: {batch_inertia}, ewa inertia: " - f"-") - return False - # Compute an Exponentially Weighted Average of the squared diff to # monitor the convergence while discarding minibatch-local stochastic # variability: https://en.wikipedia.org/wiki/Moving_average @@ -1601,11 +1576,14 @@ def fit(self, X, y=None, sample_weight=None): X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # Keep the best cluster centers across independent inits based on - # inertia computed on a common validation set. - _, inertia = _labels_inertia(X_valid, sample_weight_valid, - x_squared_norms_valid, - cluster_centers) + # Preform one iteration of KMeans to make the centers being the + # mean of their cluster. 
+ _, inertia, cluster_centers, _ = _kmeans_single_lloyd( + X=X_valid, x_squared_norms=x_squared_norms_valid, + sample_weight=sample_weight_valid, + centers_init=cluster_centers, max_iter=1, tol=0, + n_threads=self._n_threads) + if self.verbose: print(f"Inertia for init {init_idx + 1}/{self._n_init}: " f"{inertia}") From 3f475f61b203a199816a4bc36d27e26e77bc2d03 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 6 Mar 2020 18:00:48 +0100 Subject: [PATCH 10/72] wip --- sklearn/cluster/_kmeans.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index eb4170dea199d..3193e665544be 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1476,6 +1476,7 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, ewa_inertia = batch_inertia else: alpha = self.batch_size * 2.0 / (n_samples + 1) + alpha = min(alpha, 1) ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha @@ -1497,19 +1498,18 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # inertia ewa_inertia_min = self._ewa_inertia_min no_improvement = self._no_improvement - if iteration_idx >= 5: - if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: - no_improvement = 0 - ewa_inertia_min = ewa_inertia - else: - no_improvement += 1 + if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: + no_improvement = 0 + ewa_inertia_min = ewa_inertia + else: + no_improvement += 1 - if (self.max_no_improvement is not None - and no_improvement >= self.max_no_improvement): - if self.verbose: - print(f"Converged (lack of improvement in inertia) at " - f"iteration {iteration_idx}/{n_iter}") - return True + if (self.max_no_improvement is not None + and no_improvement >= self.max_no_improvement): + if self.verbose: + print(f"Converged (lack of improvement in inertia) at " + f"iteration {iteration_idx}/{n_iter}") + return True # update the convergence context to maintain state across successive # calls: @@ -1542,11 +1542,9 @@ def fit(self, X, y=None, sample_weight=None): ------- self """ - # TODO accept_large_sparse ??? X = self._validate_data(X, accept_sparse='csr', dtype=[np.float64, np.float32], - order='C', copy=self.copy_x, - accept_large_sparse=False) + order='C', accept_large_sparse=False) n_samples, n_features = X.shape From f73077b5133117f6db038b6d9b66fdc3dfbef244 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 9 Mar 2020 19:02:02 +0100 Subject: [PATCH 11/72] wip --- sklearn/cluster/_k_means_common.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_k_means_common.pyx b/sklearn/cluster/_k_means_common.pyx index 38276a0baa50f..dde6fe01efa61 100644 --- a/sklearn/cluster/_k_means_common.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -15,6 +15,7 @@ import numpy as np cimport numpy as np from cython cimport floating +from cython.parallel cimport prange from libc.math cimport sqrt from ..utils.extmath import row_norms @@ -95,7 +96,8 @@ cpdef floating _inertia_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers, # IN - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Compute inertia for dense input data Sum of squared distance between each sample and its assigned center. 
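
For reference, a pure NumPy sketch of the quantity _inertia_dense computes; the
Cython version above only adds prange-based threading (the new n_threads
argument) on top of this. The function name is illustrative, and X,
sample_weight, centers and labels are assumed to be the same arrays as in the
Cython signature:

import numpy as np

def inertia_reference(X, sample_weight, centers, labels):
    # Weighted sum of squared euclidean distances between each sample and
    # its assigned center.
    diff = X - centers[labels]
    return float(np.sum(sample_weight * np.einsum("ij,ij->i", diff, diff)))
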
@@ -108,7 +110,7 @@ cpdef floating _inertia_dense( floating sq_dist = 0.0 floating inertia = 0.0 - for i in range(n_samples): + for i in prange(n_samples, nogil=True, num_threads=n_threads): j = labels[i] sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], n_features, True) @@ -121,7 +123,8 @@ cpdef floating _inertia_sparse( X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers, # IN - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Compute inertia for sparse input data Sum of squared distance between each sample and its assigned center. @@ -140,7 +143,7 @@ cpdef floating _inertia_sparse( floating[::1] centers_squared_norms = row_norms(centers, squared=True) - for i in range(n_samples): + for i in prange(n_samples, nogil=True, num_threads=n_threads): j = labels[i] sq_dist = _euclidean_sparse_dense( X_data[X_indptr[i]: X_indptr[i + 1]], From 21d5d24cd8158b515ede391a9a38eed88ed62e6b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 9 Mar 2020 19:23:39 +0100 Subject: [PATCH 12/72] wip --- sklearn/cluster/_kmeans.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 3193e665544be..e1770998d634a 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -375,7 +375,7 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, kth=1, axis=0)[1] if verbose: - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) print(f"Iteration {i}, inertia {inertia}") centers, centers_new = centers_new, centers @@ -394,7 +394,7 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, lower_bounds, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia, centers, i + 1 @@ -473,7 +473,7 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, weight_in_clusters, labels, center_shift, n_threads) if verbose: - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) print(f"Iteration {i}, inertia {inertia}.") centers, centers_new = centers_new, centers @@ -491,7 +491,7 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia, centers, i + 1 @@ -551,7 +551,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia From a5f9cad84a0a7ae8ebafad69434d035b34c4c90d Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 15:01:01 +0100 Subject: [PATCH 13/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 16 ++++++++ sklearn/cluster/_kmeans.py | 51 +++++++++++++++++++------- sklearn/cluster/tests/test_k_means.py | 39 +++++++------------- 3 files changed, 68 insertions(+), 38 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 5132d219e6466..3310298696009 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx 
+++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -9,11 +9,27 @@ from cython cimport floating from cython.parallel cimport parallel, prange from libc.math cimport sqrt from libc.stdlib cimport malloc, free +from libc.string cimport memcpy, memset np.import_array() +def _copy_minibatch_to_buffer( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[:, ::1] minibatch_buffer, # OUT + int[::1] indices, # IN + int n_threads): + """""" + cdef: + int n_samples_minibatch = minibatch_buffer.shape[0] + int n_features = minibatch_buffer.shape[1] + int i, j, idx + + for i in prange(n_samples_minibatch, nogil=True, num_threads=n_threads): + memcpy(&minibatch_buffer[i, 0], &X[indices[i], 0], n_features * sizeof(floating)) + + def _minibatch_update_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index e1770998d634a..6825dea2ae274 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -31,6 +31,7 @@ from ..exceptions import ConvergenceWarning from ._k_means_common import _inertia_dense from ._k_means_common import _inertia_sparse +from ._k_means_minibatch import _copy_minibatch_to_buffer from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_minibatch import _minibatch_update_dense from ._k_means_minibatch import _minibatch_update_dense4 @@ -962,8 +963,7 @@ def fit(self, X, y=None, sample_weight=None): labels, inertia, centers, n_iter_ = kmeans_single( X, sample_weight, centers_init, max_iter=self.max_iter, verbose=self.verbose, tol=self._tol, - x_squared_norms=x_squared_norms, random_state=random_state, - n_threads=self._n_threads) + x_squared_norms=x_squared_norms, n_threads=self._n_threads) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: best_labels = labels @@ -1568,6 +1568,9 @@ def fit(self, X, y=None, sample_weight=None): sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] + # TODO comment + centers_new = np.empty((self.n_clusters, n_features), dtype=X.dtype) + # perform several inits with random sub-sets best_inertia = None for init_idx in range(self._n_init): @@ -1580,26 +1583,36 @@ def fit(self, X, y=None, sample_weight=None): X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # Preform one iteration of KMeans to make the centers being the - # mean of their cluster. - _, inertia, cluster_centers, _ = _kmeans_single_lloyd( - X=X_valid, x_squared_norms=x_squared_norms_valid, + # # Preform one iteration of KMeans to make the centers being the + # # mean of their cluster. 
+ # labels, inertia, cluster_centers, _ = _kmeans_single_lloyd( + # X=X_valid, x_squared_norms=x_squared_norms_valid, + # sample_weight=sample_weight_valid, + # centers_init=cluster_centers, max_iter=1, tol=0, + # n_threads=self._n_threads) + weight_sums = np.zeros(self.n_clusters, dtype=X.dtype) + + inertia = _mini_batch_step( + X=X_valid, + x_squared_norms=x_squared_norms_valid, sample_weight=sample_weight_valid, - centers_init=cluster_centers, max_iter=1, tol=0, - n_threads=self._n_threads) + centers=cluster_centers, + centers_new=centers_new, + weight_sums=weight_sums, + random_state=random_state) if self.verbose: print(f"Inertia for init {init_idx + 1}/{self._n_init}: " f"{inertia}") if best_inertia is None or inertia < best_inertia: init_centers = cluster_centers + self._counts = weight_sums best_inertia = inertia centers = init_centers - centers_new = np.empty_like(centers) # Initialize counts - self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + # self._counts = np.zeros(self.n_clusters, dtype=X.dtype) # Attributes to monitor the convergence self._ewa_diff = None @@ -1610,11 +1623,22 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) + if not sp.issparse(X): + minibatch_buffer = np.empty((self.batch_size, n_features), + dtype=X.dtype) + # Perform the iterative optimization until convergence for i in range(n_iter): # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint(0, n_samples, - self.batch_size) + minibatch_indices = random_state.randint( + 0, n_samples, self.batch_size).astype(np.int32, copy=False) + + if sp.issparse(X): + X_minibatch = X[minibatch_indices] + else: + X_minibatch = minibatch_buffer + _copy_minibatch_to_buffer(X, minibatch_buffer, + minibatch_indices, self._n_threads) # Here we randomly choose whether to perform random reassignment: # the choice is done as a function of the iteration index, and the @@ -1624,7 +1648,8 @@ def fit(self, X, y=None, sample_weight=None): # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( - X=X[minibatch_indices], + X=X_minibatch, + # X=X[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], sample_weight=sample_weight[minibatch_indices], centers=centers, diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index bf4f0e03f829d..0f457cba41d07 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -21,12 +21,12 @@ from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _mini_batch_step -from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense -from sklearn.cluster._k_means_fast import _relocate_empty_clusters_sparse -from sklearn.cluster._k_means_fast import _euclidean_dense_dense_wrapper -from sklearn.cluster._k_means_fast import _euclidean_sparse_dense_wrapper -from sklearn.cluster._k_means_fast import _inertia_dense -from sklearn.cluster._k_means_fast import _inertia_sparse +from sklearn.cluster._k_means_common import _relocate_empty_clusters_dense +from sklearn.cluster._k_means_common import _relocate_empty_clusters_sparse +from sklearn.cluster._k_means_common import _euclidean_dense_dense_wrapper +from sklearn.cluster._k_means_common import _euclidean_sparse_dense_wrapper +from sklearn.cluster._k_means_common import _inertia_dense +from sklearn.cluster._k_means_common import 
_inertia_sparse from sklearn.datasets import make_blobs from io import StringIO @@ -124,7 +124,7 @@ def test_unit_weights_vs_no_weights(estimator): # sample weights all equal to one. sample_weight = np.ones(n_samples) - km = estimator(n_clusters=n_clusters, random_state=42) + km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) km_none = clone(km).fit(X, sample_weight=None) km_ones = clone(km).fit(X, sample_weight=sample_weight) @@ -139,7 +139,7 @@ def test_scaled_weights(estimator, data): # shouldn't change the result sample_weight = np.random.uniform(n_samples) - km = estimator(n_clusters=n_clusters, random_state=42) + km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) km_orig = clone(km).fit(data, sample_weight=sample_weight) km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) @@ -226,19 +226,6 @@ def test_float_precision(estimator, data): assert_array_equal(labels[np.float32], labels[np.float64]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_score_multiple_inits(estimator): - # Check that fitting KMeans or MiniBatchKMeans with multiple inits gives - # better score - X = np.random.RandomState(0).randn(100, 10) - - km1 = estimator(max_iter=10, random_state=42, n_init=1) - s1 = km1.fit(X).score(X) - km2 = estimator(max_iter=10, random_state=42, n_init=10) - s2 = km2.fit(X).score(X) - assert s2 > s1 - - @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_score_max_iter(estimator): # Check that fitting KMeans or MiniBatchKMeans with more iterations gives @@ -361,8 +348,8 @@ def test_transform(estimator): @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_fit_transform(estimator): # Check equivalence between fit.transform and fit_transform - X1 = estimator(n_clusters=n_clusters, random_state=0).fit(X).transform(X) - X2 = estimator(n_clusters=n_clusters, random_state=0).fit_transform(X) + X1 = estimator(random_state=0, n_init=1).fit(X).transform(X) + X2 = estimator(random_state=0, n_init=1).fit_transform(X) assert_allclose(X1, X2) @@ -620,8 +607,10 @@ def test_inertia(dtype): distances = ((X_dense - centers[labels])**2).sum(axis=1) expected = np.sum(distances * sample_weight) - inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels) - inertia_sparse = _inertia_sparse(X_sparse, sample_weight, centers, labels) + inertia_dense = _inertia_dense( + X_dense, sample_weight, centers, labels, 1) + inertia_sparse = _inertia_sparse( + X_sparse, sample_weight, centers, labels, 1) assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6) assert_allclose(inertia_dense, expected, rtol=1e-6) From c4fb7a815dfe83c30b081603bb8b62161fb97a45 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 15:48:23 +0100 Subject: [PATCH 14/72] wip --- sklearn/cluster/_kmeans.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 6825dea2ae274..bb5bda8ec229b 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1568,9 +1568,6 @@ def fit(self, X, y=None, sample_weight=None): sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] - # TODO comment - centers_new = np.empty((self.n_clusters, n_features), dtype=X.dtype) - # perform several inits with random sub-sets best_inertia = None for init_idx in range(self._n_init): @@ -1583,36 +1580,26 @@ def fit(self, X, y=None, sample_weight=None): 
X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # # Preform one iteration of KMeans to make the centers being the - # # mean of their cluster. - # labels, inertia, cluster_centers, _ = _kmeans_single_lloyd( - # X=X_valid, x_squared_norms=x_squared_norms_valid, - # sample_weight=sample_weight_valid, - # centers_init=cluster_centers, max_iter=1, tol=0, - # n_threads=self._n_threads) - weight_sums = np.zeros(self.n_clusters, dtype=X.dtype) - - inertia = _mini_batch_step( - X=X_valid, - x_squared_norms=x_squared_norms_valid, + # Preform one iteration of KMeans to make the centers being the + # mean of their cluster. + labels, inertia, cluster_centers, _ = _kmeans_single_lloyd( + X=X_valid, x_squared_norms=x_squared_norms_valid, sample_weight=sample_weight_valid, - centers=cluster_centers, - centers_new=centers_new, - weight_sums=weight_sums, - random_state=random_state) + centers_init=cluster_centers, max_iter=1, tol=0, + n_threads=self._n_threads) if self.verbose: print(f"Inertia for init {init_idx + 1}/{self._n_init}: " f"{inertia}") if best_inertia is None or inertia < best_inertia: init_centers = cluster_centers - self._counts = weight_sums best_inertia = inertia centers = init_centers + centers_new = np.empty_like(centers) # Initialize counts - # self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) # Attributes to monitor the convergence self._ewa_diff = None From 3713094fd5488e92d5a228b565e9fafc214dfd05 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 16:08:13 +0100 Subject: [PATCH 15/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 2 +- sklearn/cluster/_kmeans.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 3310298696009..692c51aa3e8f2 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -9,7 +9,7 @@ from cython cimport floating from cython.parallel cimport parallel, prange from libc.math cimport sqrt from libc.stdlib cimport malloc, free -from libc.string cimport memcpy, memset +from libc.string cimport memcpy np.import_array() diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index bb5bda8ec229b..2aef5b47cbdea 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1627,7 +1627,7 @@ def fit(self, X, y=None, sample_weight=None): _copy_minibatch_to_buffer(X, minibatch_buffer, minibatch_indices, self._n_threads) - # Here we randomly choose whether to perform random reassignment: + # Randomly choose whether to perform random reassignment: # the choice is done as a function of the iteration index, and the # minimum number of counts, in order to force this reassignment to # happen every once in a while. 
@@ -1636,7 +1636,6 @@ def fit(self, X, y=None, sample_weight=None): # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( X=X_minibatch, - # X=X[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], sample_weight=sample_weight[minibatch_indices], centers=centers, From 6a6fbfb7d1bf4987f9811897e650b49acdc66d47 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 16:34:34 +0100 Subject: [PATCH 16/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 15 --------------- sklearn/cluster/_kmeans.py | 18 +++--------------- 2 files changed, 3 insertions(+), 30 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 692c51aa3e8f2..54ec96de0abb4 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -15,21 +15,6 @@ from libc.string cimport memcpy np.import_array() -def _copy_minibatch_to_buffer( - np.ndarray[floating, ndim=2, mode='c'] X, # IN - floating[:, ::1] minibatch_buffer, # OUT - int[::1] indices, # IN - int n_threads): - """""" - cdef: - int n_samples_minibatch = minibatch_buffer.shape[0] - int n_features = minibatch_buffer.shape[1] - int i, j, idx - - for i in prange(n_samples_minibatch, nogil=True, num_threads=n_threads): - memcpy(&minibatch_buffer[i, 0], &X[indices[i], 0], n_features * sizeof(floating)) - - def _minibatch_update_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 2aef5b47cbdea..4ebd3db50a8df 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -31,7 +31,6 @@ from ..exceptions import ConvergenceWarning from ._k_means_common import _inertia_dense from ._k_means_common import _inertia_sparse -from ._k_means_minibatch import _copy_minibatch_to_buffer from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_minibatch import _minibatch_update_dense from ._k_means_minibatch import _minibatch_update_dense4 @@ -1610,22 +1609,11 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - if not sp.issparse(X): - minibatch_buffer = np.empty((self.batch_size, n_features), - dtype=X.dtype) - # Perform the iterative optimization until convergence for i in range(n_iter): # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint( - 0, n_samples, self.batch_size).astype(np.int32, copy=False) - - if sp.issparse(X): - X_minibatch = X[minibatch_indices] - else: - X_minibatch = minibatch_buffer - _copy_minibatch_to_buffer(X, minibatch_buffer, - minibatch_indices, self._n_threads) + minibatch_indices = random_state.randint(0, n_samples, + self.batch_size) # Randomly choose whether to perform random reassignment: # the choice is done as a function of the iteration index, and the @@ -1635,7 +1623,7 @@ def fit(self, X, y=None, sample_weight=None): # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( - X=X_minibatch, + X=X[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], sample_weight=sample_weight[minibatch_indices], centers=centers, From 3be5343c9ff7cdeeb23955ddef98489e6730a8a8 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 16:35:07 +0100 Subject: [PATCH 17/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 31 -------------------------- sklearn/cluster/_kmeans.py | 
1 - 2 files changed, 32 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 54ec96de0abb4..ce3a62be256d7 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -223,34 +223,3 @@ cdef void update_center_sparse( else: for k in range(n_features): centers_new[i, k] = centers_old[i, k] - - -def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, - floating[::1] sample_weight, - floating[:, ::1] centers, - floating[:, ::1] centers_new, - floating[::1] weight_sums, - int[::1] labels): - cdef: - int n_samples = X.shape[0] - int n_features = X.shape[1] - int i, j, label - floating weight_sum, tmp, lr - - # for i in prange(n_samples, nogil=True): - for i in range(n_samples): - label = labels[i] - - # update center weight - weight_sum = weight_sums[label] + sample_weight[i] - - # learning rate - if weight_sum > 0: - lr = 1 / weight_sum - - for j in range(n_features): - centers_new[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] - else: - centers_new[label, j] = centers[label, j] - - weight_sums[label] = weight_sum \ No newline at end of file diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 4ebd3db50a8df..1aad382627f80 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -33,7 +33,6 @@ from ._k_means_common import _inertia_sparse from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_minibatch import _minibatch_update_dense -from ._k_means_minibatch import _minibatch_update_dense4 from ._k_means_lloyd import _lloyd_iter_chunked_dense from ._k_means_lloyd import _lloyd_iter_chunked_sparse from ._k_means_elkan import _init_bounds_dense From 2add01e64484aa6ec78f1dcbd491e591e7e32529 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 22 Apr 2020 12:40:01 +0200 Subject: [PATCH 18/72] wip --- sklearn/cluster/_kmeans.py | 14 ++--- sklearn/cluster/tests/test_k_means.py | 70 ++++++++++++++++++--- sklearn/cluster/tests/test_k_means2.py | 85 ++------------------------ 3 files changed, 71 insertions(+), 98 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 1aad382627f80..46d208a27be71 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1210,7 +1210,7 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, assign_rows_csr( X, new_centers.astype(np.intp, copy=False), np.where(to_reassign)[0].astype(np.intp, copy=False), - centers) + centers_new) else: centers_new[to_reassign] = X[new_centers] @@ -1462,20 +1462,16 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # Normalize inertia to be able to compare values when # batch_size changes batch_inertia /= self.batch_size - centers_squared_diff /= self.batch_size - # Compute an Exponentially Weighted Average of the squared diff to + # Compute an Exponentially Weighted Average of the inertia to # monitor the convergence while discarding minibatch-local stochastic # variability: https://en.wikipedia.org/wiki/Moving_average - ewa_diff = self._ewa_diff ewa_inertia = self._ewa_inertia - if ewa_diff is None: - ewa_diff = centers_squared_diff + if ewa_inertia is None: ewa_inertia = batch_inertia else: alpha = self.batch_size * 2.0 / (n_samples + 1) alpha = min(alpha, 1) - ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha # Log progress to be able to monitor convergence @@ -1486,7 +1482,7 
@@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # Early stopping based on absolute tolerance on squared change of # centers position (using EWA smoothing) - if self._tol > 0.0 and ewa_diff <= self._tol: + if self._tol > 0.0 and centers_squared_diff <= self._tol: if self.verbose: print(f"Converged (small centers change) at iteration " f"{iteration_idx + 1}/{n_iter}") @@ -1511,7 +1507,6 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # update the convergence context to maintain state across successive # calls: - self._ewa_diff = ewa_diff self._ewa_inertia = ewa_inertia self._ewa_inertia_min = ewa_inertia_min self._no_improvement = no_improvement @@ -1600,7 +1595,6 @@ def fit(self, X, y=None, sample_weight=None): self._counts = np.zeros(self.n_clusters, dtype=X.dtype) # Attributes to monitor the convergence - self._ewa_diff = None self._ewa_inertia = None self._ewa_inertia_min = None self._no_improvement = 0 diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 0f457cba41d07..17ae1967b374c 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -21,6 +21,7 @@ from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _mini_batch_step +from sklearn.cluster._kmeans import _labels_inertia from sklearn.cluster._k_means_common import _relocate_empty_clusters_dense from sklearn.cluster._k_means_common import _relocate_empty_clusters_sparse from sklearn.cluster._k_means_common import _euclidean_dense_dense_wrapper @@ -647,16 +648,69 @@ def test_minibatch_kmeans_init_size(): assert km._init_size == n_samples -def test_minibatch_kmeans_partial_fit(): - # Check fitting using the partial_fit API - km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42) +def test_minibatch_sensible_reassign(): + # check that identical initial clusters are reassigned + # also a regression test for when there are more desired reassignments than + # samples. + zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, + random_state=42) + zeroed_X[::2, :] = 0 - for X_minibatch in np.array_split(X, 10): - km.partial_fit(X_minibatch) + km = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, + init="random").fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 - # compute the labeling on the complete dataset - labels = km.predict(X) - assert_allclose(v_measure_score(true_labels, labels), 1.0) + # do the same with batch-size > X.shape[0] (regression test) + km = MiniBatchKMeans(n_clusters=20, batch_size=200, random_state=42, + init="random").fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + # do the same with partial_fit API + km = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") + for i in range(100): + km.partial_fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_minibatch_reassign(data): + # Check the reassignment part of the minibatch step with very high or very + # low reassignment ratio. 
+ perfect_centers = np.empty((n_clusters, n_features)) + for i in range(n_clusters): + perfect_centers[i] = X[true_labels == i].mean(axis=0) + + x_squared_norms = row_norms(data, squared=True) + sample_weight = np.ones(n_samples) + centers_new = np.empty_like(perfect_centers) + + # Give a perfect initialization, but a large reassignment_ratio, as a + # result many centers should be reassigned and the model should no longer + # be good + score_before = - _labels_inertia(data, sample_weight, x_squared_norms, + perfect_centers, 1)[1] + + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1) + + score_after = - _labels_inertia(data, sample_weight, x_squared_norms, + centers_new, 1)[1] + + assert score_before > score_after + + # Give a perfect initialization, with a small reassignment_ratio, + # no center should be reassigned. + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1e-15) + + assert_allclose(centers_new, perfect_centers) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) diff --git a/sklearn/cluster/tests/test_k_means2.py b/sklearn/cluster/tests/test_k_means2.py index 7df2bf1b0efb3..93c6d8011be80 100644 --- a/sklearn/cluster/tests/test_k_means2.py +++ b/sklearn/cluster/tests/test_k_means2.py @@ -97,95 +97,20 @@ def test_minibatch_update_consistency(): assert_almost_equal(new_inertia, new_inertia_csr) -def test_minibatch_sensible_reassign_fit(): - # check if identical initial clusters are reassigned - # also a regression test for when there are more desired reassignments than - # samples. 
- zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, - cluster_std=1., random_state=42) - zeroed_X[::2, :] = 0 - mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, - init="random") - mb_k_means.fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - # do the same with batch-size > X.shape[0] (regression test) - mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201, - random_state=42, init="random") - mb_k_means.fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - -def test_minibatch_sensible_reassign_partial_fit(): - zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5, - cluster_std=1., random_state=42) - zeroed_X[::2, :] = 0 - mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") - for i in range(100): - mb_k_means.partial_fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - -def test_minibatch_reassign(): - # Give a perfect initialization, but a large reassignment_ratio, - # as a result all the centers should be reassigned and the model - # should no longer be good - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - random_state=42) - mb_k_means.fit(this_X) - - score_before = mb_k_means.score(this_X) - try: - old_stdout = sys.stdout - sys.stdout = StringIO() - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means._counts, - np.zeros(X.shape[1], np.double), - False, random_state=np.random.RandomState(0), - random_reassign=True, - reassignment_ratio=1, verbose=True) - finally: - sys.stdout = old_stdout - assert score_before > mb_k_means.score(this_X) - - # Give a perfect initialization, with a small reassignment_ratio, - # no center should be reassigned - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - init=centers.copy(), - random_state=42, n_init=1) - mb_k_means.fit(this_X) - clusters_before = mb_k_means.cluster_centers_ - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means._counts, - np.zeros(X.shape[1], np.double), - False, random_state=np.random.RandomState(0), - random_reassign=True, - reassignment_ratio=1e-15) - assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) - - def test_minibatch_with_many_reassignments(): # Test for the case that the number of clusters to reassign is bigger # than the batch_size - n_samples = 550 + n_samples = 1000 rnd = np.random.RandomState(42) X = rnd.uniform(size=(n_samples, 10)) # Check that the fit works if n_clusters is bigger than the batch_size. 
# Run the test with 550 clusters and 550 samples, because it turned out # that this values ensure that the number of clusters to reassign # is always bigger than the batch_size - n_clusters = 550 + n_clusters = 1000 MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, init_size=n_samples, - random_state=42).fit(X) + random_state=42, + verbose=True).fit(X) + assert False From 7d7ab15c5a5501bd9893eb9a8d64faeb7a5b8db2 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 15 Jun 2020 17:20:39 +0200 Subject: [PATCH 19/72] wip --- sklearn/cluster/tests/test_k_means.py | 69 +++++++++++++++ sklearn/cluster/tests/test_k_means2.py | 116 ------------------------- 2 files changed, 69 insertions(+), 116 deletions(-) delete mode 100644 sklearn/cluster/tests/test_k_means2.py diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index c26151219303c..c3e94c8622b0a 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -629,6 +629,63 @@ def test_k_means_function(): assert inertia > 0.0 +def test_minibatch_update_consistency(): + # Check that dense and sparse minibatch update give the same results + rng = np.random.RandomState(42) + + centers_old = centers + rng.normal(size=centers.shape) + centers_old_csr = centers_old.copy() + + centers_new = np.zeros_like(centers_old) + centers_new_csr = np.zeros_like(centers_old_csr) + + weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) + weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) + + x_squared_norms = (X ** 2).sum(axis=1) + x_squared_norms_csr = row_norms(X_csr, squared=True) + + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + + # extract a small minibatch + X_mb = X[:10] + X_mb_csr = X_csr[:10] + x_mb_squared_norms = x_squared_norms[:10] + x_mb_squared_norms_csr = x_squared_norms_csr[:10] + sample_weight_mb = sample_weight[:10] + + # step 1: compute the dense minibatch update + old_inertia = _mini_batch_step( + X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new, + weight_sums, np.random.RandomState(0), random_reassign=False) + assert old_inertia > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels, new_inertia = _labels_inertia( + X_mb, sample_weight_mb, x_mb_squared_norms, centers_new) + assert new_inertia > 0.0 + assert new_inertia < old_inertia + + # step 2: compute the sparse minibatch update + old_inertia_csr = _mini_batch_step( + X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr, + centers_new_csr, weight_sums_csr, np.random.RandomState(0), + random_reassign=False) + assert old_inertia_csr > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels_csr, new_inertia_csr = _labels_inertia( + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr) + assert new_inertia_csr > 0.0 + assert new_inertia_csr < old_inertia_csr + + # step 3: check that sparse and dense updates lead to the same results + assert_array_equal(labels, labels_csr) + assert_allclose(centers_new, centers_new_csr) + assert_allclose(old_inertia, old_inertia_csr) + assert_allclose(new_inertia, new_inertia_csr) + + def test_minibatch_kmeans_init_size(): # Check the internal _init_size attribute of MiniBatchKMeans @@ -711,6 +768,18 @@ def test_minibatch_reassign(data): assert_allclose(centers_new, perfect_centers) +def test_minibatch_with_many_reassignments(): + # Test for the case that the number of clusters to reassign is bigger + # than the batch_size. 
Run the test with 100 clusters and a batch_size of + # 10 because it turned out that these values ensure that the number of + # clusters to reassign is always bigger than the batch_size. + MiniBatchKMeans(n_clusters=100, + batch_size=10, + init_size=n_samples, + random_state=42, + verbose=True).fit(X) + + @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) @pytest.mark.parametrize("param, match", [ ({"n_init": 0}, r"n_init should be > 0"), diff --git a/sklearn/cluster/tests/test_k_means2.py b/sklearn/cluster/tests/test_k_means2.py deleted file mode 100644 index 93c6d8011be80..0000000000000 --- a/sklearn/cluster/tests/test_k_means2.py +++ /dev/null @@ -1,116 +0,0 @@ -"""Testing for K-means""" -import sys - -import numpy as np -from scipy import sparse as sp - -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_almost_equal - -from sklearn.utils.extmath import row_norms -from sklearn.cluster import MiniBatchKMeans -from sklearn.cluster._kmeans import _labels_inertia -from sklearn.cluster._kmeans import _mini_batch_step -from sklearn.datasets import make_blobs -from io import StringIO - - -# non centered, sparse centers to check the -centers = np.array([ - [0.0, 5.0, 0.0, 0.0, 0.0], - [1.0, 1.0, 4.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 5.0, 1.0], -]) -n_samples = 100 -n_clusters, n_features = centers.shape -X, true_labels = make_blobs(n_samples=n_samples, centers=centers, - cluster_std=1., random_state=42) -X_csr = sp.csr_matrix(X) - - -def test_minibatch_update_consistency(): - # Check that dense and sparse minibatch update give the same results - rng = np.random.RandomState(42) - old_centers = centers + rng.normal(size=centers.shape) - - new_centers = old_centers.copy() - new_centers_csr = old_centers.copy() - - weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) - weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double) - - x_squared_norms = (X ** 2).sum(axis=1) - x_squared_norms_csr = row_norms(X_csr, squared=True) - - buffer = np.zeros(centers.shape[1], dtype=np.double) - buffer_csr = np.zeros(centers.shape[1], dtype=np.double) - - # extract a small minibatch - X_mb = X[:10] - X_mb_csr = X_csr[:10] - x_mb_squared_norms = x_squared_norms[:10] - x_mb_squared_norms_csr = x_squared_norms_csr[:10] - - sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) - - # step 1: compute the dense minibatch update - old_inertia, incremental_diff = _mini_batch_step( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, - buffer, 1, np.random.RandomState(0), random_reassign=False) - assert old_inertia > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels, new_inertia = _labels_inertia( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) - assert new_inertia > 0.0 - assert new_inertia < old_inertia - - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers - old_centers) ** 2) - assert_almost_equal(incremental_diff, effective_diff) - - # step 2: compute the sparse minibatch update - old_inertia_csr, incremental_diff_csr = _mini_batch_step( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, - weight_sums_csr, buffer_csr, 1, np.random.RandomState(0), - random_reassign=False) - assert old_inertia_csr > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels_csr, new_inertia_csr = 
_labels_inertia( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) - assert new_inertia_csr > 0.0 - assert new_inertia_csr < old_inertia_csr - - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers_csr - old_centers) ** 2) - assert_almost_equal(incremental_diff_csr, effective_diff) - - # step 3: check that sparse and dense updates lead to the same results - assert_array_equal(labels, labels_csr) - assert_array_almost_equal(new_centers, new_centers_csr) - assert_almost_equal(incremental_diff, incremental_diff_csr) - assert_almost_equal(old_inertia, old_inertia_csr) - assert_almost_equal(new_inertia, new_inertia_csr) - - -def test_minibatch_with_many_reassignments(): - # Test for the case that the number of clusters to reassign is bigger - # than the batch_size - n_samples = 1000 - rnd = np.random.RandomState(42) - X = rnd.uniform(size=(n_samples, 10)) - # Check that the fit works if n_clusters is bigger than the batch_size. - # Run the test with 550 clusters and 550 samples, because it turned out - # that this values ensure that the number of clusters to reassign - # is always bigger than the batch_size - n_clusters = 1000 - MiniBatchKMeans(n_clusters=n_clusters, - batch_size=100, - init_size=n_samples, - random_state=42, - verbose=True).fit(X) - assert False From 0523c656a09ddb070579aaa0b8789fe971b89581 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 13:02:07 +0200 Subject: [PATCH 20/72] wip --- sklearn/cluster/_kmeans.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 63d25baea3e54..033819c094018 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1477,6 +1477,13 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # batch_size changes batch_inertia /= self.batch_size + # Ignore first iteration because it's inertia from initialization. + if iteration_idx == 0: + if self.verbose: + print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " + f"mean batch inertia: {batch_inertia}") + return False + # Compute an Exponentially Weighted Average of the inertia to # monitor the convergence while discarding minibatch-local stochastic # variability: https://en.wikipedia.org/wiki/Moving_average From 2d789aa510e28a2a77d3b6dfb8015c1c6690e442 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 15:22:06 +0200 Subject: [PATCH 21/72] wip --- sklearn/cluster/tests/test_k_means.py | 28 ++++++++++----------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index e0441048a4ff5..4be91c1d52bf5 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -90,32 +90,24 @@ def _sort_centers(centers): return np.sort(centers, axis=0) -@pytest.mark.parametrize("init", ["k-means++", centers], - ids=["k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_weighted_vs_repeated(estimator, init): +def test_weighted_vs_repeated(): # Check that a sample weight of N should yield the same result as an N-fold - # repetition of the sample + # repetition of the sample. Valid only if init is precomputed, otherwise + # rng produces different results. Not valid for MinibatchKMeans due to rng + # to extract minibatches. 
sample_weight = np.random.RandomState(0).randint(1, 5, size=n_samples) X_repeat = np.repeat(X, sample_weight, axis=0) - km = estimator(init=init, n_clusters=n_clusters, random_state=0) - if estimator is MiniBatchKMeans: - km.set_params(batch_size=10) + km = KMeans(init=centers, n_init=1, n_clusters=n_clusters, random_state=0) km_weighted = clone(km).fit(X, sample_weight=sample_weight) repeated_labels = np.repeat(km_weighted.labels_, sample_weight) km_repeated = clone(km).fit(X_repeat) - # We can't expect labels to be equal because k-means++ will lead to - # a different initialization on duplicated X. - assert_allclose(v_measure_score(km_repeated.labels_, repeated_labels), 1) - - # TODO: FIXME - if estimator is not MiniBatchKMeans: - assert_allclose(km_weighted.inertia_, km_repeated.inertia_) - assert_allclose(_sort_centers(km_weighted.cluster_centers_), - _sort_centers(km_repeated.cluster_centers_)) + assert_array_equal(km_repeated.labels_, repeated_labels) + assert_allclose(km_weighted.inertia_, km_repeated.inertia_) + assert_allclose(_sort_centers(km_weighted.cluster_centers_), + _sort_centers(km_repeated.cluster_centers_)) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) @@ -192,7 +184,7 @@ def test_centers_not_mutated(estimator, dtype): @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_float_precision(estimator, data): - # TODO + # Check that the results are the same for single and double precision. km = estimator(n_init=1, random_state=0) inertia = {} From 37408e6d269e0f1567a5ef5dc42f6544e949a516 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 15:24:22 +0200 Subject: [PATCH 22/72] wip --- sklearn/cluster/_k_means_common.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_k_means_common.pyx b/sklearn/cluster/_k_means_common.pyx index dde6fe01efa61..53c33acffc2ee 100644 --- a/sklearn/cluster/_k_means_common.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -110,7 +110,8 @@ cpdef floating _inertia_dense( floating sq_dist = 0.0 floating inertia = 0.0 - for i in prange(n_samples, nogil=True, num_threads=n_threads): + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): j = labels[i] sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], n_features, True) @@ -143,7 +144,8 @@ cpdef floating _inertia_sparse( floating[::1] centers_squared_norms = row_norms(centers, squared=True) - for i in prange(n_samples, nogil=True, num_threads=n_threads): + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): j = labels[i] sq_dist = _euclidean_sparse_dense( X_data[X_indptr[i]: X_indptr[i + 1]], From 78915acfb09bee2754828db1b5dddb320ad8ff4d Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 15:48:25 +0200 Subject: [PATCH 23/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 24 ++++++++++++++-------- sklearn/cluster/_kmeans.py | 28 +++++++++++++++++--------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index ce3a62be256d7..ec5b98f201346 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -16,12 +16,13 @@ np.import_array() def _minibatch_update_dense( - np.ndarray[floating, ndim=2, mode='c'] X, # IN + np.ndarray[floating, ndim=2, mode="c"] X, # IN floating[::1] sample_weight, # IN floating[:, ::1] 
centers_old, # IN floating[:, ::1] centers_new, # OUT floating[::1] weight_sums, # INOUT - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Update of the centers for dense MiniBatchKMeans. Parameters @@ -45,6 +46,9 @@ def _minibatch_update_dense( labels : ndarray of shape (n_samples,), dtype=int labels assignment. + + n_threads : int + The number of threads to be used by openmp. """ cdef: int n_samples = X.shape[0] @@ -53,10 +57,10 @@ def _minibatch_update_dense( int *indices - with nogil, parallel(): + with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters): + for i in prange(n_clusters, schedule="static"): update_center_dense(i, &X[0, 0], sample_weight, centers_old, centers_new, weight_sums, labels, indices) @@ -123,7 +127,8 @@ def _minibatch_update_sparse( floating[:, ::1] centers_old, # IN floating[:, ::1] centers_new, # OUT floating[::1] weight_sums, # INOUT - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Update of the centers for sparse MiniBatchKMeans. Parameters @@ -144,9 +149,12 @@ def _minibatch_update_sparse( weight_sums : ndarray of shape (n_clusters,), dtype=floating Current sums of the accumulated weights for each center. - + labels : ndarray of shape (n_samples,), dtype=int labels assignment. + + n_threads : int + The number of threads to be used by openmp. """ cdef: floating[::1] X_data = X.data @@ -158,10 +166,10 @@ def _minibatch_update_sparse( int *indices - with nogil, parallel(): + with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters): + for i in prange(n_clusters, schedule="static"): update_center_sparse(i, X_data, X_indices, X_indptr, sample_weight, centers_old, centers_new, weight_sums, labels, indices) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7675ed0d92235..016e2a4343ce0 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1141,7 +1141,7 @@ def _more_tags(self): def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, weight_sums, random_state, random_reassign=False, - reassignment_ratio=0.01, verbose=False): + reassignment_ratio=0.01, verbose=False, n_threads=1): """Incremental update of the centers for the Minibatch K-Means algorithm. Parameters @@ -1184,6 +1184,9 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, verbose : bool, default=False Controls the verbosity. + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. 
+ Returns ------- inertia : float @@ -1191,15 +1194,16 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, """ # Perform label assignment to nearest centers labels, inertia = _labels_inertia(X, sample_weight, - x_squared_norms, centers) + x_squared_norms, centers, + n_threads=n_threads) # Update centers according to the labels if sp.issparse(X): - _minibatch_update_sparse( - X, sample_weight, centers, centers_new, weight_sums, labels) + _minibatch_update_sparse(X, sample_weight, centers, centers_new, + weight_sums, labels, n_threads) else: - _minibatch_update_dense( - X, sample_weight, centers, centers_new, weight_sums, labels) + _minibatch_update_dense(X, sample_weight, centers, centers_new, + weight_sums, labels, n_threads) # Reassign clusters that have very low weight if random_reassign and reassignment_ratio > 0: @@ -1466,7 +1470,8 @@ def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, print('Computing label assignment and total inertia') slices = gen_batches(X.shape[0], self.batch_size) results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - centers) for s in slices] + centers, n_threads=self._n_threads) + for s in slices] labels, inertia = zip(*results) return np.hstack(labels), np.sum(inertia) @@ -1648,7 +1653,8 @@ def fit(self, X, y=None, sample_weight=None): random_state=random_state, random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose) + verbose=self.verbose, + n_threads=self._n_threads) if self._tol > 0.0: centers_squared_diff = np.sum((centers_new - centers)**2) @@ -1747,11 +1753,13 @@ def partial_fit(self, X, y=None, sample_weight=None): random_state=self._random_state, random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose) + verbose=self.verbose, + n_threads=self._n_threads) if self.compute_labels: self.labels_, self.inertia_ = _labels_inertia( - X, sample_weight, x_squared_norms, self.cluster_centers_) + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) return self From fcc2718e507e22397504516428513e999f5bf1c0 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 15:49:52 +0200 Subject: [PATCH 24/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index ec5b98f201346..49af1c7426d0a 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -43,7 +43,7 @@ def _minibatch_update_dense( weight_sums : ndarray of shape (n_clusters,), dtype=floating Current sums of the accumulated weights for each center. - + labels : ndarray of shape (n_samples,), dtype=int labels assignment. 
@@ -56,14 +56,14 @@ def _minibatch_update_dense( int i int *indices - + with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) for i in prange(n_clusters, schedule="static"): update_center_dense(i, &X[0, 0], sample_weight, centers_old, centers_new, weight_sums, labels, indices) - + free(indices) @@ -165,7 +165,7 @@ def _minibatch_update_sparse( int i int *indices - + with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) @@ -173,7 +173,7 @@ def _minibatch_update_sparse( update_center_sparse(i, X_data, X_indices, X_indptr, sample_weight, centers_old, centers_new, weight_sums, labels, indices) - + free(indices) From 73f1bc2ee84dbfa541210e9b4ae449731080e3fc Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 17:13:21 +0200 Subject: [PATCH 25/72] wip --- sklearn/cluster/_kmeans.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 016e2a4343ce0..7cff7e91ae153 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1601,13 +1601,10 @@ def fit(self, X, y=None, sample_weight=None): X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # Preform one iteration of KMeans to make the centers being the - # mean of their cluster. - labels, inertia, cluster_centers, _ = _kmeans_single_lloyd( - X=X_valid, x_squared_norms=x_squared_norms_valid, - sample_weight=sample_weight_valid, - centers_init=cluster_centers, max_iter=1, tol=0, - n_threads=self._n_threads) + # Compute inertia on a validation set. + _, inertia = _labels_inertia( + X_valid, sample_weight_valid, x_squared_norms_valid, + cluster_centers, n_threads=self._n_threads) if self.verbose: print(f"Inertia for init {init_idx + 1}/{self._n_init}: " From 7325c89a586eb57afabd2c25d2d88688783628b8 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 18:17:07 +0200 Subject: [PATCH 26/72] wip --- sklearn/cluster/tests/test_k_means.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 4be91c1d52bf5..4d55f512ab45e 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -347,6 +347,17 @@ def test_fit_transform(estimator): assert_allclose(X1, X2) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_sample_weight_unchanged(estimator): + # Check that sample_weight is not modified in place by KMeans (#17204) + X = np.array([[1], [2], [4]]) + sample_weight = np.array([0.5, 0.2, 0.3]) + estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) + + # internally, sample_weight is rescale to sum up to n_samples = 3 + assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) + + @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_verbose(estimator): # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. 
@@ -893,13 +904,3 @@ def test_n_jobs_deprecated(n_jobs): with pytest.warns(FutureWarning, match=depr_msg): kmeans.fit(X) - - -def test_sample_weight_unchanged(): - # Check that sample_weight is not modified in place by KMeans (#17204) - X = np.array([[1], [2], [4]]) - sample_weight = np.array([0.5, 0.2, 0.3]) - KMeans(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) - - # internally, sample_weight is rescale to sum up to n_samples = 3 - assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) From a82456632219c11fa027a3a2299d0495c47c915f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 18 Jun 2020 13:11:48 +0200 Subject: [PATCH 27/72] wip --- sklearn/cluster/_kmeans.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7cff7e91ae153..ea097caa61a35 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1381,10 +1381,10 @@ class MiniBatchKMeans(KMeans): ... batch_size=6, ... max_iter=10).fit(X) >>> kmeans.cluster_centers_ - array([[3.95918367, 2.40816327], - [1.12195122, 1.3902439 ]]) + array([[2.32394366, 1.16901408], + [3.4 , 4.36 ]]) >>> kmeans.predict([[0, 0], [4, 4]]) - array([1, 0], dtype=int32) + array([0, 1], dtype=int32) """ @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, From a4edafb8b046482ecab939cb30d80cff7b27b349 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 13:54:31 +0200 Subject: [PATCH 28/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 172 +++++++++++++------------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 4d55f512ab45e..55074307a3ba5 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -57,6 +57,92 @@ def _check_fitted_model(km): assert km.inertia_ > 0.0 +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_kmeans_results(array_constr, algo, dtype): + # Checks that KMeans works as intended on toy dataset by comparing with + # expected results computed by hand. 
+ X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) + sample_weight = [3, 1, 1, 3] + init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.375 + expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) + expected_n_iter = 2 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X, sample_weight=sample_weight) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +def test_relocate_empty_clusters(array_constr): + # test for the _relocate_empty_clusters_(dense/sparse) helpers + + # Synthetic dataset with 3 obvious clusters of different sizes + X = np.array( + [-10., -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) + X = array_constr(X) + sample_weight = np.ones(10) + + # centers all initialized to the first point of X + centers_old = np.array([-10., -10, -10]).reshape(-1, 1) + + # With this initialization, all points will be assigned to the first center + # At this point a center in centers_new is the weighted sum of the points + # it contains if it's not empty, otherwise it is the same as before. + centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) + weight_in_clusters = np.array([10., 0, 0]) + labels = np.zeros(10, dtype=np.int32) + + if array_constr is np.array: + _relocate_empty_clusters_dense(X, sample_weight, centers_old, + centers_new, weight_in_clusters, labels) + else: + _relocate_empty_clusters_sparse(X.data, X.indices, X.indptr, + sample_weight, centers_old, + centers_new, weight_in_clusters, + labels) + + # The relocation scheme will take the 2 points farthest from the center and + # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The + # first center will be updated to contain the other 8 points. 
+ assert_array_equal(weight_in_clusters, [8, 1, 1]) + assert_allclose(centers_new, [[-36], [10], [9.5]]) + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_kmeans_relocated_clusters(array_constr, algo): + # check that empty clusters are relocated as expected + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.25 + expected_centers = [[0.25, 0], [0.75, 1]] + expected_n_iter = 3 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -370,31 +456,6 @@ def test_verbose(estimator): sys.stdout = old_stdout -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_kmeans_results(array_constr, algo, dtype): - # Checks that KMeans works as intended on toy dataset by comparing with - # expected results computed by hand. - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) - sample_weight = [3, 1, 1, 3] - init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.375 - expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) - expected_n_iter = 2 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X, sample_weight=sample_weight) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_allclose(kmeans.inertia_, expected_inertia) - assert_allclose(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @@ -497,30 +558,6 @@ def test_kmeans_elkan_iter_attribute(): assert km.n_iter_ == 1 -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -def test_kmeans_relocated_clusters(array_constr, algo): - # check that empty clusters are relocated as expected - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) - - # second center too far from others points will be empty at first iter - init_centers = np.array([[0.5, 0.5], [3, 3]]) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.25 - expected_centers = [[0.25, 0], [0.75, 1]] - expected_n_iter = 3 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_allclose(kmeans.inertia_, expected_inertia) - assert_allclose(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) def test_kmeans_empty_cluster_relocated(array_constr): @@ -537,43 +574,6 @@ def 
test_kmeans_empty_cluster_relocated(array_constr): assert_allclose(km.cluster_centers_, [[-1], [1]]) -@pytest.mark.parametrize("representation", ["dense", "sparse"]) -def test_relocate_empty_clusters(representation): - # test for the _relocate_empty_clusters_(dense/sparse) helpers - - # Synthetic dataset with 3 obvious clusters of different sizes - X = np.array( - [-10., -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) - if representation == "sparse": - X = sp.csr_matrix(X) - sample_weight = np.ones(10) - - # centers all initialized to the first point of X - centers_old = np.array([-10., -10, -10]).reshape(-1, 1) - - # With this initialization, all points will be assigned to the first center - # At this point a center in centers_new is the weighted sum of the points - # it contains if it's not empty, otherwise it is the same as before. - centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) - weight_in_clusters = np.array([10., 0, 0]) - labels = np.zeros(10, dtype=np.int32) - - if representation == "dense": - _relocate_empty_clusters_dense(X, sample_weight, centers_old, - centers_new, weight_in_clusters, labels) - else: - _relocate_empty_clusters_sparse(X.data, X.indices, X.indptr, - sample_weight, centers_old, - centers_new, weight_in_clusters, - labels) - - # The relocation scheme will take the 2 points farthest from the center and - # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The - # first center will be updated to contain the other 8 points. - assert_array_equal(weight_in_clusters, [8, 1, 1]) - assert_allclose(centers_new, [[-36], [10], [9.5]]) - - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("squared", [True, False]) def test_euclidean_distance(dtype, squared): From 0993a85eefb65b76937543b031abff699316142f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 13:58:00 +0200 Subject: [PATCH 29/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 48 +++++++++++++-------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 55074307a3ba5..9f4e60edd914d 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -82,6 +82,30 @@ def test_kmeans_results(array_constr, algo, dtype): assert kmeans.n_iter_ == expected_n_iter +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_kmeans_relocated_clusters(array_constr, algo): + # check that empty clusters are relocated as expected + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.25 + expected_centers = [[0.25, 0], [0.75, 1]] + expected_n_iter = 3 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) def test_relocate_empty_clusters(array_constr): @@ -119,30 +143,6 @@ def test_relocate_empty_clusters(array_constr): assert_allclose(centers_new, [[-36], [10], [9.5]]) 
-@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -def test_kmeans_relocated_clusters(array_constr, algo): - # check that empty clusters are relocated as expected - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) - - # second center too far from others points will be empty at first iter - init_centers = np.array([[0.5, 0.5], [3, 3]]) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.25 - expected_centers = [[0.25, 0], [0.75, 1]] - expected_n_iter = 3 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_allclose(kmeans.inertia_, expected_inertia) - assert_allclose(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], From b3089615f233cb0df64efd006362305ee8362e36 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 14:05:58 +0200 Subject: [PATCH 30/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 88 +++++++++++++-------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 9f4e60edd914d..e555c2ba9fcf3 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -143,6 +143,50 @@ def test_relocate_empty_clusters(array_constr): assert_allclose(centers_new, [[-36], [10], [9.5]]) +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) +def test_kmeans_elkan_results(distribution, array_constr, tol): + # Check that results are identical between lloyd and elkan algorithms + rnd = np.random.RandomState(0) + if distribution == "normal": + X = rnd.normal(size=(5000, 10)) + else: + X, _ = make_blobs(random_state=rnd) + X[X < 0] = 0 + X = array_constr(X) + + km_full = KMeans(algorithm="full", n_clusters=5, + random_state=0, n_init=1, tol=tol) + km_elkan = KMeans(algorithm="elkan", n_clusters=5, + random_state=0, n_init=1, tol=tol) + + km_full.fit(X) + km_elkan.fit(X) + assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) + assert_array_equal(km_elkan.labels_, km_full.labels_) + assert km_elkan.n_iter_ == km_full.n_iter_ + assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) + + +@pytest.mark.parametrize("algorithm", ["full", "elkan"]) +def test_kmeans_convergence(algorithm): + # Check that KMeans stops when convergence is reached when tol=0. (#16075) + # We can only ensure that if the number of threads is not to large, + # otherwise the roundings errors coming from the unpredictability of + # the order in which chunks are processed make the convergence criterion + # to never be exactly 0. 
+ rnd = np.random.RandomState(0) + X = rnd.normal(size=(5000, 10)) + + with threadpool_limits(limits=1, user_api="openmp"): + km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, + n_init=1, tol=0, max_iter=300).fit(X) + + assert km.n_iter_ < 300 + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -485,50 +529,6 @@ def py_kmeans(X, init): assert_allclose(py_centers, cy_centers) -@pytest.mark.parametrize("distribution", ["normal", "blobs"]) -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) -def test_kmeans_elkan_results(distribution, array_constr, tol): - # Check that results are identical between lloyd and elkan algorithms - rnd = np.random.RandomState(0) - if distribution == "normal": - X = rnd.normal(size=(5000, 10)) - else: - X, _ = make_blobs(random_state=rnd) - X[X < 0] = 0 - X = array_constr(X) - - km_full = KMeans(algorithm="full", n_clusters=5, - random_state=0, n_init=1, tol=tol) - km_elkan = KMeans(algorithm="elkan", n_clusters=5, - random_state=0, n_init=1, tol=tol) - - km_full.fit(X) - km_elkan.fit(X) - assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) - assert_array_equal(km_elkan.labels_, km_full.labels_) - assert km_elkan.n_iter_ == km_full.n_iter_ - assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) - - -@pytest.mark.parametrize("algorithm", ["full", "elkan"]) -def test_kmeans_convergence(algorithm): - # Check that KMeans stops when convergence is reached when tol=0. (#16075) - # We can only ensure that if the number of threads is not to large, - # otherwise the roundings errors coming from the unpredictability of - # the order in which chunks are processed make the convergence criterion - # to never be exactly 0. - rnd = np.random.RandomState(0) - X = rnd.normal(size=(5000, 10)) - - with threadpool_limits(limits=1, user_api="openmp"): - km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, - n_init=1, tol=0, max_iter=300).fit(X) - - assert km.n_iter_ < 300 - - def test_kmeans_copyx(): # Check that copy_x=False returns nearly equal X after de-centering. 
my_X = X.copy() From 121450b3df8cb5887641ccd5c3c20e3bd8ee813f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 15:51:27 +0200 Subject: [PATCH 31/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 114 +++++++++++++------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index e555c2ba9fcf3..031567b9116d3 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -187,6 +187,63 @@ def test_kmeans_convergence(algorithm): assert km.n_iter_ < 300 +def test_minibatch_update_consistency(): + # Check that dense and sparse minibatch update give the same results + rng = np.random.RandomState(42) + + centers_old = centers + rng.normal(size=centers.shape) + centers_old_csr = centers_old.copy() + + centers_new = np.zeros_like(centers_old) + centers_new_csr = np.zeros_like(centers_old_csr) + + weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) + weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) + + x_squared_norms = (X ** 2).sum(axis=1) + x_squared_norms_csr = row_norms(X_csr, squared=True) + + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + + # extract a small minibatch + X_mb = X[:10] + X_mb_csr = X_csr[:10] + x_mb_squared_norms = x_squared_norms[:10] + x_mb_squared_norms_csr = x_squared_norms_csr[:10] + sample_weight_mb = sample_weight[:10] + + # step 1: compute the dense minibatch update + old_inertia = _mini_batch_step( + X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new, + weight_sums, np.random.RandomState(0), random_reassign=False) + assert old_inertia > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels, new_inertia = _labels_inertia( + X_mb, sample_weight_mb, x_mb_squared_norms, centers_new) + assert new_inertia > 0.0 + assert new_inertia < old_inertia + + # step 2: compute the sparse minibatch update + old_inertia_csr = _mini_batch_step( + X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr, + centers_new_csr, weight_sums_csr, np.random.RandomState(0), + random_reassign=False) + assert old_inertia_csr > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels_csr, new_inertia_csr = _labels_inertia( + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr) + assert new_inertia_csr > 0.0 + assert new_inertia_csr < old_inertia_csr + + # step 3: check that sparse and dense updates lead to the same results + assert_array_equal(labels, labels_csr) + assert_allclose(centers_new, centers_new_csr) + assert_allclose(old_inertia, old_inertia_csr) + assert_allclose(new_inertia, new_inertia_csr) + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -635,63 +692,6 @@ def test_k_means_function(): assert inertia > 0.0 -def test_minibatch_update_consistency(): - # Check that dense and sparse minibatch update give the same results - rng = np.random.RandomState(42) - - centers_old = centers + rng.normal(size=centers.shape) - centers_old_csr = centers_old.copy() - - centers_new = np.zeros_like(centers_old) - centers_new_csr = np.zeros_like(centers_old_csr) - - weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) - weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) - - x_squared_norms = (X ** 2).sum(axis=1) - x_squared_norms_csr = row_norms(X_csr, 
squared=True) - - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - - # extract a small minibatch - X_mb = X[:10] - X_mb_csr = X_csr[:10] - x_mb_squared_norms = x_squared_norms[:10] - x_mb_squared_norms_csr = x_squared_norms_csr[:10] - sample_weight_mb = sample_weight[:10] - - # step 1: compute the dense minibatch update - old_inertia = _mini_batch_step( - X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new, - weight_sums, np.random.RandomState(0), random_reassign=False) - assert old_inertia > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels, new_inertia = _labels_inertia( - X_mb, sample_weight_mb, x_mb_squared_norms, centers_new) - assert new_inertia > 0.0 - assert new_inertia < old_inertia - - # step 2: compute the sparse minibatch update - old_inertia_csr = _mini_batch_step( - X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr, - centers_new_csr, weight_sums_csr, np.random.RandomState(0), - random_reassign=False) - assert old_inertia_csr > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels_csr, new_inertia_csr = _labels_inertia( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr) - assert new_inertia_csr > 0.0 - assert new_inertia_csr < old_inertia_csr - - # step 3: check that sparse and dense updates lead to the same results - assert_array_equal(labels, labels_csr) - assert_allclose(centers_new, centers_new_csr) - assert_allclose(old_inertia, old_inertia_csr) - assert_allclose(new_inertia, new_inertia_csr) - - def test_minibatch_kmeans_init_size(): # Check the internal _init_size attribute of MiniBatchKMeans From 6c67dd10a6be4c8ce12af910ec0cca5eac2f7294 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 8 Jul 2020 16:48:38 +0200 Subject: [PATCH 32/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 372 +++++++++++++------------- 1 file changed, 186 insertions(+), 186 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 071a7ab213bdd..a58e648b70692 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -258,6 +258,192 @@ def test_all_init(Estimator, data, init): _check_fitted_model(km) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_fortran_aligned_data(estimator): + # Check that KMeans works with fortran-aligned data. + X_fortran = np.asfortranarray(X) + centers_fortran = np.asfortranarray(centers) + + km_c = estimator(n_clusters=n_clusters, init=centers, n_init=1, + random_state=42).fit(X) + km_f = estimator(n_clusters=n_clusters, init=centers_fortran, n_init=1, + random_state=42).fit(X_fortran) + assert_allclose(km_c.cluster_centers_, km_f.cluster_centers_) + assert_array_equal(km_c.labels_, km_f.labels_) + + +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_verbose(estimator): + # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. + km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + km.fit(X) + finally: + sys.stdout = old_stdout + + +def test_minibatch_sensible_reassign(): + # check that identical initial clusters are reassigned + # also a regression test for when there are more desired reassignments than + # samples. 
+ zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, + random_state=42) + zeroed_X[::2, :] = 0 + + km = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, + init="random").fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + # do the same with batch-size > X.shape[0] (regression test) + km = MiniBatchKMeans(n_clusters=20, batch_size=200, random_state=42, + init="random").fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + # do the same with partial_fit API + km = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") + for i in range(100): + km.partial_fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_minibatch_reassign(data): + # Check the reassignment part of the minibatch step with very high or very + # low reassignment ratio. + perfect_centers = np.empty((n_clusters, n_features)) + for i in range(n_clusters): + perfect_centers[i] = X[true_labels == i].mean(axis=0) + + x_squared_norms = row_norms(data, squared=True) + sample_weight = np.ones(n_samples) + centers_new = np.empty_like(perfect_centers) + + # Give a perfect initialization, but a large reassignment_ratio, as a + # result many centers should be reassigned and the model should no longer + # be good + score_before = - _labels_inertia(data, sample_weight, x_squared_norms, + perfect_centers, 1)[1] + + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1) + + score_after = - _labels_inertia(data, sample_weight, x_squared_norms, + centers_new, 1)[1] + + assert score_before > score_after + + # Give a perfect initialization, with a small reassignment_ratio, + # no center should be reassigned. + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1e-15) + + assert_allclose(centers_new, perfect_centers) + + +def test_minibatch_with_many_reassignments(): + # Test for the case that the number of clusters to reassign is bigger + # than the batch_size. Run the test with 100 clusters and a batch_size of + # 10 because it turned out that these values ensure that the number of + # clusters to reassign is always bigger than the batch_size. + MiniBatchKMeans(n_clusters=100, + batch_size=10, + init_size=n_samples, + random_state=42, + verbose=True).fit(X) + + +def test_minibatch_kmeans_init_size(): + # Check the internal _init_size attribute of MiniBatchKMeans + + # default init size should be 3 * batch_size + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1).fit(X) + assert km._init_size == 15 + + # if 3 * batch size < n_clusters, it should then be 3 * n_clusters + km = MiniBatchKMeans(n_clusters=10, batch_size=1, n_init=1).fit(X) + assert km._init_size == 30 + + # it should not be larger than n_samples + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1, + init_size=n_samples + 1).fit(X) + assert km._init_size == n_samples + + +def test_kmeans_copyx(): + # Check that copy_x=False returns nearly equal X after de-centering. 
+ my_X = X.copy() + km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) + km.fit(my_X) + _check_fitted_model(km) + + # check that my_X is de-centered + assert_allclose(my_X, X) + + +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_score_max_iter(estimator): + # Check that fitting KMeans or MiniBatchKMeans with more iterations gives + # better score + X = np.random.RandomState(0).randn(100, 10) + + km1 = estimator(n_init=1, random_state=42, max_iter=1) + s1 = km1.fit(X).score(X) + km2 = estimator(n_init=1, random_state=42, max_iter=10) + s2 = km2.fit(X).score(X) + assert s2 > s1 + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("init", ["random", "k-means++", "ndarray"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_predict(estimator, init, dtype, array_constr): + # Check the predict method and the equivalence between fit.predict and + # fit_predict. + if sys.platform == "darwin": + pytest.xfail( + "Known failures on MacOS, See " + "https://github.com/scikit-learn/scikit-learn/issues/12644") + + X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0) + + n_init = 1 if init == "ndarray" else 10 + init = X[:10] if init == "ndarray" else init + X = array_constr(X) + + km = estimator(n_clusters=10, init=init, n_init=n_init, + random_state=0).fit(X) + labels = km.labels_ + + # Due to randomness in the order in which chunks of data are processed when + # using more than one thread, there might be different rounding errors for + # the computation of the inertia for each init between 2 runs. This might + # result in a different ranking of the inits, hence a different labeling, + # which should still correspond to the same clustering + + # re-predict labels for training set using predict + pred = km.predict(X) + assert_allclose(v_measure_score(pred, labels), 1) + + # re-predict labels for training set using fit_predict + pred = km.fit_predict(X) + assert_allclose(v_measure_score(pred, labels), 1) + + # predict centroid labels + pred = km.predict(km.cluster_centers_) + assert_allclose(v_measure_score(pred, np.arange(10)), 1) + + @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_result_equal_in_diff_n_threads(estimator): # Check that KMeans/MiniBatchKMeans give the same results in parallel mode @@ -340,20 +526,6 @@ def test_dense_sparse(estimator): assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_fortran_aligned_data(estimator): - # Check that KMeans works with fortran-aligned data. 
- X_fortran = np.asfortranarray(X) - centers_fortran = np.asfortranarray(centers) - - km_c = estimator(n_clusters=n_clusters, init=centers, n_init=1, - random_state=42).fit(X) - km_f = estimator(n_clusters=n_clusters, init=centers_fortran, n_init=1, - random_state=42).fit(X_fortran) - assert_allclose(km_c.cluster_centers_, km_f.cluster_centers_) - assert_array_equal(km_c.labels_, km_f.labels_) - - @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_centers_not_mutated(estimator, dtype): @@ -405,19 +577,6 @@ def test_float_precision(Estimator, data): assert_array_equal(labels[np.float32], labels[np.float64]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_score_max_iter(estimator): - # Check that fitting KMeans or MiniBatchKMeans with more iterations gives - # better score - X = np.random.RandomState(0).randn(100, 10) - - km1 = estimator(n_init=1, random_state=42, max_iter=1) - s1 = km1.fit(X).score(X) - km2 = estimator(n_init=1, random_state=42, max_iter=10) - s2 = km2.fit(X).score(X) - assert s2 > s1 - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @@ -449,48 +608,6 @@ def test_integer_input(estimator, array_constr, dtype, init): assert km.cluster_centers_.dtype == np.float64 -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("init", ["random", "k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_predict(estimator, init, dtype, array_constr): - # Check the predict method and the equivalence between fit.predict and - # fit_predict. - if sys.platform == "darwin": - pytest.xfail( - "Known failures on MacOS, See " - "https://github.com/scikit-learn/scikit-learn/issues/12644") - - X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0) - - n_init = 1 if init == "ndarray" else 10 - init = X[:10] if init == "ndarray" else init - X = array_constr(X) - - km = estimator(n_clusters=10, init=init, n_init=n_init, - random_state=0).fit(X) - labels = km.labels_ - - # Due to randomness in the order in which chunks of data are processed when - # using more than one thread, there might be different rounding errors for - # the computation of the inertia for each init between 2 runs. This might - # result in a different ranking of the inits, hence a different labeling, - # which should still correspond to the same clustering - - # re-predict labels for training set using predict - pred = km.predict(X) - assert_allclose(v_measure_score(pred, labels), 1) - - # re-predict labels for training set using fit_predict - pred = km.fit_predict(X) - assert_allclose(v_measure_score(pred, labels), 1) - - # predict centroid labels - pred = km.predict(km.cluster_centers_) - assert_allclose(v_measure_score(pred, np.arange(10)), 1) - - @pytest.mark.parametrize("init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) @@ -545,18 +662,6 @@ def test_sample_weight_unchanged(estimator): assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_verbose(estimator): - # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. 
- km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - km.fit(X) - finally: - sys.stdout = old_stdout - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @@ -586,17 +691,6 @@ def py_kmeans(X, init): assert_allclose(py_centers, cy_centers) -def test_kmeans_copyx(): - # Check that copy_x=False returns nearly equal X after de-centering. - my_X = X.copy() - km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) - km.fit(my_X) - _check_fitted_model(km) - - # check that my_X is de-centered - assert_allclose(my_X, X) - - @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) def test_kmeans_init_fitted_centers(data): # Check that starting fitting from a local optimum shouldn't change the @@ -692,100 +786,6 @@ def test_k_means_function(): assert inertia > 0.0 -def test_minibatch_kmeans_init_size(): - # Check the internal _init_size attribute of MiniBatchKMeans - - # default init size should be 3 * batch_size - km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1).fit(X) - assert km._init_size == 15 - - # if 3 * batch size < n_clusters, it should then be 3 * n_clusters - km = MiniBatchKMeans(n_clusters=10, batch_size=1, n_init=1).fit(X) - assert km._init_size == 30 - - # it should not be larger than n_samples - km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1, - init_size=n_samples + 1).fit(X) - assert km._init_size == n_samples - - -def test_minibatch_sensible_reassign(): - # check that identical initial clusters are reassigned - # also a regression test for when there are more desired reassignments than - # samples. - zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, - random_state=42) - zeroed_X[::2, :] = 0 - - km = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, - init="random").fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert km.cluster_centers_.any(axis=1).sum() > 10 - - # do the same with batch-size > X.shape[0] (regression test) - km = MiniBatchKMeans(n_clusters=20, batch_size=200, random_state=42, - init="random").fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert km.cluster_centers_.any(axis=1).sum() > 10 - - # do the same with partial_fit API - km = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") - for i in range(100): - km.partial_fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert km.cluster_centers_.any(axis=1).sum() > 10 - - -@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -def test_minibatch_reassign(data): - # Check the reassignment part of the minibatch step with very high or very - # low reassignment ratio. 
- perfect_centers = np.empty((n_clusters, n_features)) - for i in range(n_clusters): - perfect_centers[i] = X[true_labels == i].mean(axis=0) - - x_squared_norms = row_norms(data, squared=True) - sample_weight = np.ones(n_samples) - centers_new = np.empty_like(perfect_centers) - - # Give a perfect initialization, but a large reassignment_ratio, as a - # result many centers should be reassigned and the model should no longer - # be good - score_before = - _labels_inertia(data, sample_weight, x_squared_norms, - perfect_centers, 1)[1] - - _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, - centers_new, np.zeros(n_clusters), - np.random.RandomState(0), random_reassign=True, - reassignment_ratio=1) - - score_after = - _labels_inertia(data, sample_weight, x_squared_norms, - centers_new, 1)[1] - - assert score_before > score_after - - # Give a perfect initialization, with a small reassignment_ratio, - # no center should be reassigned. - _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, - centers_new, np.zeros(n_clusters), - np.random.RandomState(0), random_reassign=True, - reassignment_ratio=1e-15) - - assert_allclose(centers_new, perfect_centers) - - -def test_minibatch_with_many_reassignments(): - # Test for the case that the number of clusters to reassign is bigger - # than the batch_size. Run the test with 100 clusters and a batch_size of - # 10 because it turned out that these values ensure that the number of - # clusters to reassign is always bigger than the batch_size. - MiniBatchKMeans(n_clusters=100, - batch_size=10, - init_size=n_samples, - random_state=42, - verbose=True).fit(X) - - @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) @pytest.mark.parametrize("param, match", [ ({"n_init": 0}, r"n_init should be > 0"), From 6e78c7ed2f9c4c4dae901678728061a28a83f587 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 9 Jul 2020 14:33:35 +0200 Subject: [PATCH 33/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 99 ++++++++++++++------------- 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index a58e648b70692..2366a4f0b4156 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -464,55 +464,6 @@ def _sort_centers(centers): return np.sort(centers, axis=0) -def test_weighted_vs_repeated(): - # Check that a sample weight of N should yield the same result as an N-fold - # repetition of the sample. Valid only if init is precomputed, otherwise - # rng produces different results. Not valid for MinibatchKMeans due to rng - # to extract minibatches. - sample_weight = np.random.RandomState(0).randint(1, 5, size=n_samples) - X_repeat = np.repeat(X, sample_weight, axis=0) - - km = KMeans(init=centers, n_init=1, n_clusters=n_clusters, random_state=0) - - km_weighted = clone(km).fit(X, sample_weight=sample_weight) - repeated_labels = np.repeat(km_weighted.labels_, sample_weight) - km_repeated = clone(km).fit(X_repeat) - - assert_array_equal(km_repeated.labels_, repeated_labels) - assert_allclose(km_weighted.inertia_, km_repeated.inertia_) - assert_allclose(_sort_centers(km_weighted.cluster_centers_), - _sort_centers(km_repeated.cluster_centers_)) - - -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_unit_weights_vs_no_weights(estimator): - # Check that not passing sample weights should be equivalent to passing - # sample weights all equal to one. 
- sample_weight = np.ones(n_samples) - - km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) - km_none = clone(km).fit(X, sample_weight=None) - km_ones = clone(km).fit(X, sample_weight=sample_weight) - - assert_array_equal(km_none.labels_, km_ones.labels_) - assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) - - -@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_scaled_weights(estimator, data): - # Check that scaling all sample weights by a common factor - # shouldn't change the result - sample_weight = np.random.uniform(n_samples) - - km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) - km_orig = clone(km).fit(data, sample_weight=sample_weight) - km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) - - assert_array_equal(km_orig.labels_, km_scaled.labels_) - assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) - - @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_dense_sparse(estimator): # Check that the results are the same for dense and sparse input. @@ -577,6 +528,56 @@ def test_float_precision(Estimator, data): assert_array_equal(labels[np.float32], labels[np.float64]) +def test_weighted_vs_repeated(): + # Check that a sample weight of N should yield the same result as an N-fold + # repetition of the sample. Valid only if init is precomputed, otherwise + # rng produces different results. Not valid for MinibatchKMeans due to rng + # to extract minibatches. + sample_weight = np.random.RandomState(0).randint(1, 5, size=n_samples) + X_repeat = np.repeat(X, sample_weight, axis=0) + + km = KMeans(init=centers, n_init=1, n_clusters=n_clusters, random_state=0) + + km_weighted = clone(km).fit(X, sample_weight=sample_weight) + repeated_labels = np.repeat(km_weighted.labels_, sample_weight) + km_repeated = clone(km).fit(X_repeat) + + assert_array_equal(km_repeated.labels_, repeated_labels) + assert_allclose(km_weighted.inertia_, km_repeated.inertia_) + assert_allclose(_sort_centers(km_weighted.cluster_centers_), + _sort_centers(km_repeated.cluster_centers_)) + + +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_unit_weights_vs_no_weights(estimator, data): + # Check that not passing sample weights should be equivalent to passing + # sample weights all equal to one. 
+ sample_weight = np.ones(n_samples) + + km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km_none = clone(km).fit(data, sample_weight=None) + km_ones = clone(km).fit(data, sample_weight=sample_weight) + + assert_array_equal(km_none.labels_, km_ones.labels_) + assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) + + +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(estimator, data): + # Check that scaling all sample weights by a common factor + # shouldn't change the result + sample_weight = np.random.uniform(n_samples) + + km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km_orig = clone(km).fit(data, sample_weight=sample_weight) + km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) + + assert_array_equal(km_orig.labels_, km_scaled.labels_) + assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) + + @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) From f13441b8797db5793566261b6e153ff9f58eb9b0 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 9 Jul 2020 14:45:18 +0200 Subject: [PATCH 34/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 2366a4f0b4156..b4a6052b8f3e3 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -477,21 +477,6 @@ def test_dense_sparse(estimator): assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) -@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_centers_not_mutated(estimator, dtype): - # Check that KMeans and MiniBatchKMeans won't mutate the user provided - # init centers silently even if input data and init centers have the same - # type. - X_new_type = X.astype(dtype, copy=True) - centers_new_type = centers.astype(dtype, copy=True) - - km = estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) - km.fit(X_new_type) - - assert not np.may_share_memory(km.cluster_centers_, centers) - - @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_float_precision(Estimator, data): @@ -528,6 +513,21 @@ def test_float_precision(Estimator, data): assert_array_equal(labels[np.float32], labels[np.float64]) +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_centers_not_mutated(estimator, dtype): + # Check that KMeans and MiniBatchKMeans won't mutate the user provided + # init centers silently even if input data and init centers have the same + # type. + X_new_type = X.astype(dtype, copy=True) + centers_new_type = centers.astype(dtype, copy=True) + + km = estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) + km.fit(X_new_type) + + assert not np.may_share_memory(km.cluster_centers_, centers) + + def test_weighted_vs_repeated(): # Check that a sample weight of N should yield the same result as an N-fold # repetition of the sample. 
Valid only if init is precomputed, otherwise From f08d3d281c9cee6567d5affc04a2bada6279e7fe Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 10 Jul 2020 01:28:00 +0200 Subject: [PATCH 35/72] fix merge conflicts --- sklearn/cluster/_kmeans.py | 4 ++-- sklearn/cluster/tests/test_k_means.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7cc0a1bed5b26..7662893bb5d1d 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -938,7 +938,7 @@ def fit(self, X, y=None, sample_weight=None): init = self.init if hasattr(init, '__array__'): init = check_array(init, dtype=X.dtype, copy=True, order='C') - self._validate_center_shape(X, self.n_clusters, init) + self._validate_center_shape(X, init) # subtract of mean of x for more accurate distance computations if not sp.issparse(X): @@ -1707,7 +1707,7 @@ def partial_fit(self, X, y=None, sample_weight=None): order='C', accept_large_sparse=False, reset=is_first_call_to_partial_fit) - self.random_state_ = getattr(self, "random_state_", + self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 2f62d1891ac23..7e09d2214a7c0 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -799,7 +799,7 @@ def test_k_means_function(): assert inertia > 0.0 -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) @pytest.mark.parametrize("param, match", [ ({"n_init": 0}, r"n_init should be > 0"), ({"max_iter": 0}, r"max_iter should be > 0"), From b712de691f1bad3330a2d0ce22ecb427cac153ec Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 10 Jul 2020 01:29:12 +0200 Subject: [PATCH 36/72] Estimator --- sklearn/cluster/tests/test_k_means.py | 96 +++++++++++++-------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 7e09d2214a7c0..934165e004619 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -284,10 +284,10 @@ def test_fortran_aligned_data(Estimator): assert_array_equal(km_c.labels_, km_f.labels_) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_verbose(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_verbose(Estimator): # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. 
- km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) + km = Estimator(n_clusters=n_clusters, random_state=42, verbose=1) old_stdout = sys.stdout sys.stdout = StringIO() try: @@ -401,15 +401,15 @@ def test_kmeans_copyx(): assert_allclose(my_X, X) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_score_max_iter(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_score_max_iter(Estimator): # Check that fitting KMeans or MiniBatchKMeans with more iterations gives # better score X = np.random.RandomState(0).randn(100, 10) - km1 = estimator(n_init=1, random_state=42, max_iter=1) + km1 = Estimator(n_init=1, random_state=42, max_iter=1) s1 = km1.fit(X).score(X) - km2 = estimator(n_init=1, random_state=42, max_iter=10) + km2 = Estimator(n_init=1, random_state=42, max_iter=10) s2 = km2.fit(X).score(X) assert s2 > s1 @@ -418,8 +418,8 @@ def test_score_max_iter(estimator): ids=["dense", "sparse"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("init", ["random", "k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_predict(estimator, init, dtype, array_constr): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_predict(Estimator, init, dtype, array_constr): # Check the predict method and the equivalence between fit.predict and # fit_predict. if sys.platform == "darwin": @@ -433,7 +433,7 @@ def test_predict(estimator, init, dtype, array_constr): init = X[:10] if init == "ndarray" else init X = array_constr(X) - km = estimator(n_clusters=10, init=init, n_init=n_init, + km = Estimator(n_clusters=10, init=init, n_init=n_init, random_state=0).fit(X) labels = km.labels_ @@ -456,18 +456,18 @@ def test_predict(estimator, init, dtype, array_constr): assert_allclose(v_measure_score(pred, np.arange(10)), 1) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_result_equal_in_diff_n_threads(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_result_equal_in_diff_n_threads(Estimator): # Check that KMeans/MiniBatchKMeans give the same results in parallel mode # than in sequential mode. rnd = np.random.RandomState(0) X = rnd.normal(size=(50, 10)) with threadpool_limits(limits=1, user_api="openmp"): - result_1 = estimator( + result_1 = Estimator( n_clusters=n_clusters, random_state=0).fit(X).labels_ with threadpool_limits(limits=2, user_api="openmp"): - result_2 = estimator( + result_2 = Estimator( n_clusters=n_clusters, random_state=0).fit(X).labels_ assert_array_equal(result_1, result_2) @@ -476,13 +476,13 @@ def _sort_centers(centers): return np.sort(centers, axis=0) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_dense_sparse(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_dense_sparse(Estimator): # Check that the results are the same for dense and sparse input. 
sample_weight = np.random.RandomState(0).random_sample((n_samples,)) - km_dense = estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_dense = Estimator(n_clusters=n_clusters, random_state=0, n_init=1) km_dense.fit(X, sample_weight=sample_weight) - km_sparse = estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_sparse = Estimator(n_clusters=n_clusters, random_state=0, n_init=1) km_sparse.fit(X_csr, sample_weight=sample_weight) assert_array_equal(km_dense.labels_, km_sparse.labels_) @@ -526,15 +526,15 @@ def test_float_precision(Estimator, data): @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_centers_not_mutated(estimator, dtype): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_centers_not_mutated(Estimator, dtype): # Check that KMeans and MiniBatchKMeans won't mutate the user provided # init centers silently even if input data and init centers have the same # type. X_new_type = X.astype(dtype, copy=True) centers_new_type = centers.astype(dtype, copy=True) - km = estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) + km = Estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) km.fit(X_new_type) assert not np.may_share_memory(km.cluster_centers_, centers) @@ -561,13 +561,13 @@ def test_weighted_vs_repeated(): @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_unit_weights_vs_no_weights(estimator, data): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_unit_weights_vs_no_weights(Estimator, data): # Check that not passing sample weights should be equivalent to passing # sample weights all equal to one. sample_weight = np.ones(n_samples) - km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) km_none = clone(km).fit(data, sample_weight=None) km_ones = clone(km).fit(data, sample_weight=sample_weight) @@ -576,13 +576,13 @@ def test_unit_weights_vs_no_weights(estimator, data): @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_scaled_weights(estimator, data): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(Estimator, data): # Check that scaling all sample weights by a common factor # shouldn't change the result sample_weight = np.random.uniform(n_samples) - km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) km_orig = clone(km).fit(data, sample_weight=sample_weight) km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) @@ -594,8 +594,8 @@ def test_scaled_weights(estimator, data): ids=["dense", "sparse"]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("init", ["k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_integer_input(estimator, array_constr, dtype, init): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_integer_input(Estimator, array_constr, dtype, init): # Check that KMeans and MiniBatchKMeans work with integer input. 
X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]) X = array_constr(X_dense, dtype=dtype) @@ -603,8 +603,8 @@ def test_integer_input(estimator, array_constr, dtype, init): n_init = 1 if init == "ndarray" else 10 init = X_dense[:2] if init == "ndarray" else init - km = estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) - if estimator is MiniBatchKMeans: + km = Estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) + if Estimator is MiniBatchKMeans: km.set_params(batch_size=2) km.fit(X) @@ -616,19 +616,19 @@ def test_integer_input(estimator, array_constr, dtype, init): assert_allclose(v_measure_score(km.labels_, expected_labels), 1) # Same with partial_fit (#14314) - if estimator is MiniBatchKMeans: + if Estimator is MiniBatchKMeans: km = clone(km).partial_fit(X) assert km.cluster_centers_.dtype == np.float64 @pytest.mark.parametrize("init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_predict_dense_sparse(estimator, init): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_predict_dense_sparse(Estimator, init): # check that models trained on sparse input also works for dense input at # predict time and vice versa. n_init = 10 if type(init) is str else 1 - km = estimator(n_clusters=n_clusters, init=init, n_init=n_init, + km = Estimator(n_clusters=n_clusters, init=init, n_init=n_init, random_state=0) km.fit(X_csr) @@ -638,10 +638,10 @@ def test_predict_dense_sparse(estimator, init): assert_array_equal(km.predict(X_csr), km.labels_) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_transform(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_transform(Estimator): # Check the transform method - km = estimator(n_clusters=n_clusters).fit(X) + km = Estimator(n_clusters=n_clusters).fit(X) # Transorfming cluster_centers_ should return the pairwise distances # between centers @@ -656,20 +656,20 @@ def test_transform(estimator): assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_)) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_fit_transform(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_fit_transform(Estimator): # Check equivalence between fit.transform and fit_transform - X1 = estimator(random_state=0, n_init=1).fit(X).transform(X) - X2 = estimator(random_state=0, n_init=1).fit_transform(X) + X1 = Estimator(random_state=0, n_init=1).fit(X).transform(X) + X2 = Estimator(random_state=0, n_init=1).fit_transform(X) assert_allclose(X1, X2) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_sample_weight_unchanged(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_sample_weight_unchanged(Estimator): # Check that sample_weight is not modified in place by KMeans (#17204) X = np.array([[1], [2], [4]]) sample_weight = np.array([0.5, 0.2, 0.3]) - estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) + Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) # internally, sample_weight is rescale to sum up to n_samples = 3 assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) @@ -850,13 +850,13 @@ def test_minibatch_kmeans_wrong_params(param, match): MiniBatchKMeans(**param).fit(X) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_warnings(estimator): 
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_warnings(Estimator): # Check warning messages common to KMeans and MiniBatchKMeans with pytest.warns(RuntimeWarning, match="Explicit initial center position passed: " "performing only one init"): - estimator(init=centers, n_clusters=n_clusters).fit(X) + Estimator(init=centers, n_clusters=n_clusters).fit(X) def test_kmeans_warnings(): From 153a06f79749838f52d162fd6fd02442c7617b28 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 17 Jul 2020 14:52:59 +0200 Subject: [PATCH 37/72] cln --- sklearn/cluster/tests/test_k_means.py | 328 +++++++++++++------------- 1 file changed, 164 insertions(+), 164 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index c2a765735ebd5..b50fd79eabc33 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -44,20 +44,6 @@ X_csr = sp.csr_matrix(X) -def _check_fitted_model(km): - # check that the number of clusters centers and distinct labels match - # the expectation - centers = km.cluster_centers_ - assert centers.shape == (n_clusters, n_features) - - labels = km.labels_ - assert np.unique(labels).shape[0] == n_clusters - - # check that the labels assignment are perfect (up to a permutation) - assert_allclose(v_measure_score(true_labels, labels), 1.0) - assert km.inertia_ > 0.0 - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @@ -245,6 +231,20 @@ def test_minibatch_update_consistency(): assert_allclose(new_inertia, new_inertia_csr) +def _check_fitted_model(km): + # check that the number of clusters centers and distinct labels match + # the expectation + centers = km.cluster_centers_ + assert centers.shape == (n_clusters, n_features) + + labels = km.labels_ + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + assert km.inertia_ > 0.0 + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -473,10 +473,6 @@ def test_predict(Estimator, init, dtype, array_constr): assert_allclose(v_measure_score(pred, np.arange(10)), 1) -def _sort_centers(centers): - return np.sort(centers, axis=0) - - @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_dense_sparse(Estimator): # Check that the results are the same for dense and sparse input. @@ -490,6 +486,80 @@ def test_dense_sparse(Estimator): assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) +@pytest.mark.parametrize("init", ["random", "k-means++", centers], + ids=["random", "k-means++", "ndarray"]) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_predict_dense_sparse(Estimator, init): + # check that models trained on sparse input also works for dense input at + # predict time and vice versa. 
+ n_init = 10 if type(init) is str else 1 + km = Estimator(n_clusters=n_clusters, init=init, n_init=n_init, + random_state=0) + + km.fit(X_csr) + assert_array_equal(km.predict(X), km.labels_) + + km.fit(X) + assert_array_equal(km.predict(X_csr), km.labels_) + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("init", ["k-means++", "ndarray"]) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_integer_input(Estimator, array_constr, dtype, init): + # Check that KMeans and MiniBatchKMeans work with integer input. + X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]) + X = array_constr(X_dense, dtype=dtype) + + n_init = 1 if init == "ndarray" else 10 + init = X_dense[:2] if init == "ndarray" else init + + km = Estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) + if Estimator is MiniBatchKMeans: + km.set_params(batch_size=2) + + km.fit(X) + + # Internally integer input should be converted to float64 + assert km.cluster_centers_.dtype == np.float64 + + expected_labels = [0, 1, 1, 0, 0, 1] + assert_allclose(v_measure_score(km.labels_, expected_labels), 1) + + # Same with partial_fit (#14314) + if Estimator is MiniBatchKMeans: + km = clone(km).partial_fit(X) + assert km.cluster_centers_.dtype == np.float64 + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_transform(Estimator): + # Check the transform method + km = Estimator(n_clusters=n_clusters).fit(X) + + # Transorfming cluster_centers_ should return the pairwise distances + # between centers + Xt = km.transform(km.cluster_centers_) + assert_allclose(Xt, pairwise_distances(km.cluster_centers_)) + # In particular, diagonal must be 0 + assert_array_equal(Xt.diagonal(), np.zeros(n_clusters)) + + # Transorfming X should return the pairwise distances between X and the + # centers + Xt = km.transform(X) + assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_)) + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_fit_transform(Estimator): + # Check equivalence between fit.transform and fit_transform + X1 = Estimator(random_state=0, n_init=1).fit(X).transform(X) + X2 = Estimator(random_state=0, n_init=1).fit_transform(X) + assert_allclose(X1, X2) + + def test_k_means_function(): # test calling the k_means function directly cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, @@ -554,6 +624,17 @@ def test_centers_not_mutated(Estimator, dtype): assert not np.may_share_memory(km.cluster_centers_, centers_new_type) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_kmeans_init_fitted_centers(data): + # Check that starting fitting from a local optimum shouldn't change the + # solution + km1 = KMeans(n_clusters=n_clusters).fit(data) + km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, + n_init=1).fit(data) + + assert_allclose(km1.cluster_centers_, km2.cluster_centers_) + + def test_kmeans_warns_less_centers_than_unique_points(): # Check KMeans when the number of found clusters is smaller than expected X = np.asarray([[0, 0], @@ -572,6 +653,10 @@ def test_kmeans_warns_less_centers_than_unique_points(): assert set(km.labels_) == set(range(3)) +def _sort_centers(centers): + return np.sort(centers, axis=0) + + def test_weighted_vs_repeated(): # Check that a sample weight of N should yield the same result as an N-fold # repetition of the sample. 
Valid only if init is precomputed, otherwise @@ -622,70 +707,69 @@ def test_scaled_weights(Estimator, data): assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("dtype", [np.int32, np.int64]) -@pytest.mark.parametrize("init", ["k-means++", "ndarray"]) -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_integer_input(Estimator, array_constr, dtype, init): - # Check that KMeans and MiniBatchKMeans work with integer input. - X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]) - X = array_constr(X_dense, dtype=dtype) - - n_init = 1 if init == "ndarray" else 10 - init = X_dense[:2] if init == "ndarray" else init - - km = Estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) - if Estimator is MiniBatchKMeans: - km.set_params(batch_size=2) +def test_kmeans_elkan_iter_attribute(): + # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off + # it's right value (#11340). + km = KMeans(algorithm="elkan", max_iter=1).fit(X) + assert km.n_iter_ == 1 - km.fit(X) - # Internally integer input should be converted to float64 - assert km.cluster_centers_.dtype == np.float64 +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +def test_kmeans_empty_cluster_relocated(array_constr): + # check that empty clusters are correctly relocated when using sample + # weights (#13486) + X = array_constr([[-1], [1]]) + sample_weight = [1.9, 0.1] + init = np.array([[-1], [10]]) - expected_labels = [0, 1, 1, 0, 0, 1] - assert_allclose(v_measure_score(km.labels_, expected_labels), 1) + km = KMeans(n_clusters=2, init=init, n_init=1) + km.fit(X, sample_weight=sample_weight) - # Same with partial_fit (#14314) - if Estimator is MiniBatchKMeans: - km = clone(km).partial_fit(X) - assert km.cluster_centers_.dtype == np.float64 + assert len(set(km.labels_)) == 2 + assert_allclose(km.cluster_centers_, [[-1], [1]]) -@pytest.mark.parametrize("init", ["random", "k-means++", centers], - ids=["random", "k-means++", "ndarray"]) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_predict_dense_sparse(Estimator, init): - # check that models trained on sparse input also works for dense input at - # predict time and vice versa. - n_init = 10 if type(init) is str else 1 - km = Estimator(n_clusters=n_clusters, init=init, n_init=n_init, - random_state=0) +def test_result_equal_in_diff_n_threads(Estimator): + # Check that KMeans/MiniBatchKMeans give the same results in parallel mode + # than in sequential mode. 
+ rnd = np.random.RandomState(0) + X = rnd.normal(size=(50, 10)) - km.fit(X_csr) - assert_array_equal(km.predict(X), km.labels_) + with threadpool_limits(limits=1, user_api="openmp"): + result_1 = Estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ + with threadpool_limits(limits=2, user_api="openmp"): + result_2 = Estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ + assert_array_equal(result_1, result_2) - km.fit(X) - assert_array_equal(km.predict(X_csr), km.labels_) +@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) +def test_precompute_distance_deprecated(precompute_distances): + # FIXME: remove in 0.25 + depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " + "will be removed in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + precompute_distances=precompute_distances) -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_transform(Estimator): - # Check the transform method - km = Estimator(n_clusters=n_clusters).fit(X) + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) - # Transorfming cluster_centers_ should return the pairwise distances - # between centers - Xt = km.transform(km.cluster_centers_) - assert_allclose(Xt, pairwise_distances(km.cluster_centers_)) - # In particular, diagonal must be 0 - assert_array_equal(Xt.diagonal(), np.zeros(n_clusters)) - # Transorfming X should return the pairwise distances between X and the - # centers - Xt = km.transform(X) - assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_)) +@pytest.mark.parametrize("n_jobs", [None, 1]) +def test_n_jobs_deprecated(n_jobs): + # FIXME: remove in 0.25 + depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " + "in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + n_jobs=n_jobs) + + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) @pytest.mark.parametrize("attr", ["counts_", "init_size_", "random_state_"]) @@ -709,25 +793,6 @@ def test_warning_elkan_1_cluster(): KMeans(n_clusters=1, algorithm="elkan").fit(X) -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_fit_transform(Estimator): - # Check equivalence between fit.transform and fit_transform - X1 = Estimator(random_state=0, n_init=1).fit(X).transform(X) - X2 = Estimator(random_state=0, n_init=1).fit_transform(X) - assert_allclose(X1, X2) - - -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_sample_weight_unchanged(Estimator): - # Check that sample_weight is not modified in place by KMeans (#17204) - X = np.array([[1], [2], [4]]) - sample_weight = np.array([0.5, 0.2, 0.3]) - Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) - - # internally, sample_weight is rescale to sum up to n_samples = 3 - assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @@ -757,40 +822,6 @@ def py_kmeans(X, init): assert_allclose(py_centers, cy_centers) -@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -def test_kmeans_init_fitted_centers(data): - # Check that starting fitting from a local optimum shouldn't change the - # solution - km1 = 
KMeans(n_clusters=n_clusters).fit(data) - km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, - n_init=1).fit(data) - - assert_allclose(km1.cluster_centers_, km2.cluster_centers_) - - -def test_kmeans_elkan_iter_attribute(): - # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off - # it's right value (#11340). - km = KMeans(algorithm="elkan", max_iter=1).fit(X) - assert km.n_iter_ == 1 - - -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -def test_kmeans_empty_cluster_relocated(array_constr): - # check that empty clusters are correctly relocated when using sample - # weights (#13486) - X = array_constr([[-1], [1]]) - sample_weight = [1.9, 0.1] - init = np.array([[-1], [10]]) - - km = KMeans(n_clusters=2, init=init, n_init=1) - km.fit(X, sample_weight=sample_weight) - - assert len(set(km.labels_)) == 2 - assert_allclose(km.cluster_centers_, [[-1], [1]]) - - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("squared", [True, False]) def test_euclidean_distance(dtype, squared): @@ -839,6 +870,17 @@ def test_inertia(dtype): assert_allclose(inertia_sparse, expected, rtol=1e-6) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_sample_weight_unchanged(Estimator): + # Check that sample_weight is not modified in place by KMeans (#17204) + X = np.array([[1], [2], [4]]) + sample_weight = np.array([0.5, 0.2, 0.3]) + Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) + + # internally, sample_weight is rescale to sum up to n_samples = 3 + assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) + + @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) @pytest.mark.parametrize("param, match", [ ({"n_init": 0}, r"n_init should be > 0"), @@ -888,45 +930,3 @@ def test_minibatch_kmeans_wrong_params(param, match): # are passed for the MiniBatchKMeans specific parameters with pytest.raises(ValueError, match=match): MiniBatchKMeans(**param).fit(X) - - -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_result_equal_in_diff_n_threads(Estimator): - # Check that KMeans/MiniBatchKMeans give the same results in parallel mode - # than in sequential mode. 
- rnd = np.random.RandomState(0) - X = rnd.normal(size=(50, 10)) - - with threadpool_limits(limits=1, user_api="openmp"): - result_1 = Estimator( - n_clusters=n_clusters, random_state=0).fit(X).labels_ - with threadpool_limits(limits=2, user_api="openmp"): - result_2 = Estimator( - n_clusters=n_clusters, random_state=0).fit(X).labels_ - assert_array_equal(result_1, result_2) - - -@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) -def test_precompute_distance_deprecated(precompute_distances): - # FIXME: remove in 0.25 - depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " - "will be removed in 0.25.") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, - precompute_distances=precompute_distances) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - -@pytest.mark.parametrize("n_jobs", [None, 1]) -def test_n_jobs_deprecated(n_jobs): - # FIXME: remove in 0.25 - depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 0.25.") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, - n_jobs=n_jobs) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) From d2d68322b2af6c251503c3aec9d9c9b321e7ba4d Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 17 Jul 2020 15:35:32 +0200 Subject: [PATCH 38/72] cln --- sklearn/cluster/tests/test_k_means.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index b50fd79eabc33..93eb1a9679c33 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -19,8 +19,8 @@ from sklearn.metrics.cluster import v_measure_score from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans -from sklearn.cluster._kmeans import _mini_batch_step from sklearn.cluster._kmeans import _labels_inertia +from sklearn.cluster._kmeans import _mini_batch_step from sklearn.cluster._k_means_common import _relocate_empty_clusters_dense from sklearn.cluster._k_means_common import _relocate_empty_clusters_sparse from sklearn.cluster._k_means_common import _euclidean_dense_dense_wrapper @@ -70,8 +70,8 @@ def test_kmeans_results(array_constr, algo, dtype): @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) + ids=['dense', 'sparse']) +@pytest.mark.parametrize("algo", ['full', 'elkan']) def test_kmeans_relocated_clusters(array_constr, algo): # check that empty clusters are relocated as expected X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) @@ -560,6 +560,17 @@ def test_fit_transform(Estimator): assert_allclose(X1, X2) +def test_n_init(): + # Check that increasing the number of init increases the quality + previous_inertia = np.inf + for n_init in [1, 5, 10]: + # set max_iter=1 to avoid finding the global minimum and get the same + # inertia each time + km = KMeans(n_clusters=n_clusters, init="random", n_init=n_init, + random_state=0, max_iter=1).fit(X) + assert km.inertia_ <= previous_inertia + + def test_k_means_function(): # test calling the k_means function directly cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, @@ -752,7 +763,7 @@ def 
test_precompute_distance_deprecated(precompute_distances): depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " "will be removed in 0.25.") X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, precompute_distances=precompute_distances) with pytest.warns(FutureWarning, match=depr_msg): @@ -765,7 +776,7 @@ def test_n_jobs_deprecated(n_jobs): depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " "in 0.25.") X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, n_jobs=n_jobs) with pytest.warns(FutureWarning, match=depr_msg): @@ -877,7 +888,6 @@ def test_sample_weight_unchanged(Estimator): sample_weight = np.array([0.5, 0.2, 0.3]) Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) - # internally, sample_weight is rescale to sum up to n_samples = 3 assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) From 690f5b9ed37a6bf3731c72edc8ccbf5c685176eb Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 17 Jul 2020 15:40:59 +0200 Subject: [PATCH 39/72] cln --- sklearn/cluster/_kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 8b0bbe257c08a..ad0c5b49eb6b4 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -548,7 +548,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, The resulting assignment. inertia : float - Sum of squared distances of samples to their closest cluster center + Sum of squared distances of samples to their closest cluster center. 
""" n_samples = X.shape[0] n_clusters = centers.shape[0] @@ -950,7 +950,7 @@ def fit(self, X, y=None, sample_weight=None): # The copy was already done above X -= X_mean - if hasattr(self.init, '__array__'): + if hasattr(init, '__array__'): init -= X_mean # precompute squared norms of data points From 158aeed9806d7170f5cc5a464c73a04988392764 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Wed, 22 Jul 2020 12:32:32 +0200 Subject: [PATCH 40/72] cln --- sklearn/cluster/_kmeans.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index ad0c5b49eb6b4..04d323f2258ac 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -32,8 +32,8 @@ from ..exceptions import ConvergenceWarning from ._k_means_common import _inertia_dense from ._k_means_common import _inertia_sparse -from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_minibatch import _minibatch_update_dense +from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_lloyd import lloyd_iter_chunked_dense from ._k_means_lloyd import lloyd_iter_chunked_sparse from ._k_means_elkan import init_bounds_dense @@ -45,8 +45,7 @@ ############################################################################### # Initialization heuristic -def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, - n_local_trials=None): +def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): """Init n_clusters seeds according to k-means++ Parameters @@ -884,9 +883,8 @@ def _init_centroids(self, X, x_squared_norms, init, random_state, n_samples = X.shape[0] if isinstance(init, str) and init == 'k-means++': - centers = _kmeans_plusplus(X, n_clusters, - random_state=random_state, - x_squared_norms=x_squared_norms) + centers = _k_init(X, n_clusters, random_state=random_state, + x_squared_norms=x_squared_norms) elif isinstance(init, str) and init == 'random': seeds = random_state.permutation(n_samples)[:n_clusters] centers = X[seeds] @@ -1350,9 +1348,6 @@ class MiniBatchKMeans(KMeans): defined as the sum of square distances of samples to their cluster center, weighted by the sample weights if provided. - n_iter_ : int - Number of iterations run. - n_iter_ : int Number of batches processed. 
From e11cedb89be63ed77478610386fcff5754eea0d6 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Thu, 23 Jul 2020 13:23:27 +0200 Subject: [PATCH 41/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 43 ++++++++++------------ sklearn/cluster/_kmeans.py | 50 ++++---------------------- 2 files changed, 25 insertions(+), 68 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 49af1c7426d0a..60d10c47c320b 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -7,9 +7,7 @@ cimport numpy as np from cython cimport floating from cython.parallel cimport parallel, prange -from libc.math cimport sqrt from libc.stdlib cimport malloc, free -from libc.string cimport memcpy np.import_array() @@ -53,22 +51,23 @@ def _minibatch_update_dense( cdef: int n_samples = X.shape[0] int n_clusters = centers_old.shape[0] - int i + int cluster_idx int *indices with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters, schedule="static"): - update_center_dense(i, &X[0, 0], sample_weight, centers_old, - centers_new, weight_sums, labels, indices) + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_dense(cluster_idx, &X[0, 0], sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) free(indices) cdef void update_center_dense( - int i, + int cluster_idx, floating *X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers_old, # IN @@ -89,36 +88,34 @@ cdef void update_center_dense( # indices = np.where(labels == i)[0] k = 0 for j in range(n_samples): - if labels[j] == i: + if labels[j] == cluster_idx: indices[k] = j + wsum += sample_weight[j] k += 1 n_indices = k - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - if wsum > 0: - # Remove previous count scaling + # Undo the previous count-based scaling for this cluster center for k in range(n_features): - centers_new[i, k] = centers_old[i, k] * weight_sums[i] + centers_new[cluster_idx, k] = centers_old[cluster_idx, k] * weight_sums[cluster_idx] # Update cluster with new point members for j in range(n_indices): idx = indices[j] for k in range(n_features): - centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] + centers_new[cluster_idx, k] += X[idx * n_features + k] * sample_weight[idx] # Update the count statistics for this center - weight_sums[i] += wsum + weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] + alpha = 1 / weight_sums[cluster_idx] for k in range(n_features): - centers_new[i, k] *= alpha + centers_new[cluster_idx, k] *= alpha else: + # No sample was assigned to this cluster in this batch of data for k in range(n_features): - centers_new[i, k] = centers_old[i, k] + centers_new[cluster_idx, k] = centers_old[cluster_idx, k] def _minibatch_update_sparse( @@ -203,15 +200,12 @@ cdef void update_center_sparse( for j in range(n_samples): if labels[j] == i: indices[k] = j + wsum += sample_weight[j] k += 1 n_indices = k - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - if wsum > 0: - # Remove previous count scaling + # Undo the previous count-based scaling for this cluster center: for k in range(n_features): centers_new[i, k] = centers_old[i, k] * weight_sums[i] @@ -229,5 +223,6 @@ cdef void update_center_sparse( for k in range(n_features): centers_new[i, k] *= alpha else: + # No sample was assigned to this cluster in this batch of 
data for k in range(n_features): centers_new[i, k] = centers_old[i, k] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 04d323f2258ac..b85d2a34a314e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -24,7 +24,6 @@ from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import _deprecate_positional_args from ..utils import check_array -from ..utils import gen_batches from ..utils import check_random_state from ..utils import deprecated from ..utils.validation import check_is_fitted, _check_sample_weight @@ -1476,45 +1475,6 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") - def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, - centers): - """Compute labels and inertia using mini batches. - - This is slightly slower than doing everything at once but preventes - memory errors / segfaults. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data. - - sample_weight : ndarray of shape (n_samples,) - The weights for each observation in X. - - x_squared_norms : ndarray of shape (n_samples,) - Precomputed squared euclidean norm of each data point, to speed up - computations. - - centers : ndarray of shape (n_clusters, n_features) - The cluster centers. - - Returns - ------- - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - inertia : float - Sum of squared distances of points to nearest cluster. - """ - if self.verbose: - print('Computing label assignment and total inertia') - slices = gen_batches(X.shape[0], self.batch_size) - results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - centers, n_threads=self._n_threads) - for s in slices] - labels, inertia = zip(*results) - return np.hstack(labels), np.sum(inertia) - def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, centers_squared_diff, batch_inertia): """Helper function to encapsulate the early stopping logic""" @@ -1708,8 +1668,9 @@ def fit(self, X, y=None, sample_weight=None): self.n_iter_ = i + 1 if self.compute_labels: - self.labels_, self.inertia_ = self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_) + self.labels_, self.inertia_ = _labels_inertia( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) return self @@ -1819,8 +1780,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_)[0] + return _labels_inertia( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads)[0] def _more_tags(self): return { From bbdabf540adf85994ed4b7b9434bc9db0a0020d9 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:06:15 +0200 Subject: [PATCH 42/72] threadpool-limit protection --- sklearn/cluster/_kmeans.py | 131 +++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 57 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index b85d2a34a314e..e140549ed99ff 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -573,6 +573,16 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, return labels, inertia +def _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, + centers, n_threads=None): + """Same as 
_labels_inertia but in a threadpool_limits context.""" + with threadpool_limits(limits=1, user_api="blas"): + labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, + centers, n_threads) + + return labels, inertia + + class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): """K-Means clustering. @@ -1102,8 +1112,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[0] + return _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[0] def score(self, X, y=None, sample_weight=None): """Opposite of the value of X on the K-means objective. @@ -1131,8 +1142,9 @@ def score(self, X, y=None, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return -_labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[1] + return -_labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[1] def _more_tags(self): return { @@ -1600,7 +1612,7 @@ def fit(self, X, y=None, sample_weight=None): random_state=random_state, init_size=self._init_size) # Compute inertia on a validation set. - _, inertia = _labels_inertia( + _, inertia = _labels_inertia_threadpool_limit( X_valid, sample_weight_valid, x_squared_norms_valid, cluster_centers, n_threads=self._n_threads) @@ -1625,50 +1637,52 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - # Perform the iterative optimization until convergence - for i in range(n_iter): - # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint(0, n_samples, - self.batch_size) - - # Randomly choose whether to perform random reassignment: - # the choice is done as a function of the iteration index, and the - # minimum number of counts, in order to force this reassignment to - # happen every once in a while. 
- random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 - - # Perform the actual update step on the minibatch data - batch_inertia = _mini_batch_step( - X=X[minibatch_indices], - x_squared_norms=x_squared_norms[minibatch_indices], - sample_weight=sample_weight[minibatch_indices], - centers=centers, - centers_new=centers_new, - weight_sums=self._counts, - random_state=random_state, - random_reassign=random_reassign, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose, - n_threads=self._n_threads) - - if self._tol > 0.0: - centers_squared_diff = np.sum((centers_new - centers)**2) - else: - centers_squared_diff = 0 - - centers, centers_new = centers_new, centers - - # Monitor convergence and do early stopping if necessary - if self._mini_batch_convergence( - i, n_iter, n_samples, centers_squared_diff, batch_inertia): - break + with threadpool_limits(limits=1, user_api="blas"): + # Perform the iterative optimization until convergence + for i in range(n_iter): + # Sample a minibatch from the full dataset + minibatch_indices = random_state.randint(0, n_samples, + self.batch_size) + + # Randomly choose whether to perform random reassignment: + # the choice is done as a function of the iteration index, and + # the minimum number of counts, in order to force this + # reassignment to happen every once in a while. + random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + + # Perform the actual update step on the minibatch data + batch_inertia = _mini_batch_step( + X=X[minibatch_indices], + x_squared_norms=x_squared_norms[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + centers=centers, + centers_new=centers_new, + weight_sums=self._counts, + random_state=random_state, + random_reassign=random_reassign, + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) + + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers)**2) + else: + centers_squared_diff = 0 + + centers, centers_new = centers_new, centers + + # Monitor convergence and do early stopping if necessary + if self._mini_batch_convergence( + i, n_iter, n_samples, centers_squared_diff, + batch_inertia): + break self.cluster_centers_ = centers self.n_iter_ = i + 1 if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) @@ -1734,20 +1748,21 @@ def partial_fit(self, X, y=None, sample_weight=None): random_reassign = self._random_state.randint( 10 * (1 + self._counts.min())) == 0 - _mini_batch_step(X, - x_squared_norms=x_squared_norms, - sample_weight=sample_weight, - centers=self.cluster_centers_, - centers_new=self.cluster_centers_, - weight_sums=self._counts, - random_state=self._random_state, - random_reassign=random_reassign, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose, - n_threads=self._n_threads) + with threadpool_limits(limits=1, user_api="blas"): + _mini_batch_step(X, + x_squared_norms=x_squared_norms, + sample_weight=sample_weight, + centers=self.cluster_centers_, + centers_new=self.cluster_centers_, + weight_sums=self._counts, + random_state=self._random_state, + random_reassign=random_reassign, + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( 
X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) @@ -1780,9 +1795,11 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia( + labels, _ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, - n_threads=self._n_threads)[0] + n_threads=self._n_threads) + + return labels def _more_tags(self): return { From 53691e454bd55fb01f96bf6e2df2274bbd3c28ce Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:36:45 +0200 Subject: [PATCH 43/72] idx --- sklearn/cluster/_k_means_minibatch.pyx | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 60d10c47c320b..6476336a4078b 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -81,41 +81,41 @@ cdef void update_center_dense( int n_features = centers_old.shape[1] floating alpha, tmp int n_indices - int j, k, idx + int k, sample_idx, feature_idx floating wsum = 0 # indices = np.where(labels == i)[0] k = 0 - for j in range(n_samples): - if labels[j] == cluster_idx: - indices[k] = j - wsum += sample_weight[j] + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] k += 1 n_indices = k if wsum > 0: # Undo the previous count-based scaling for this cluster center - for k in range(n_features): - centers_new[cluster_idx, k] = centers_old[cluster_idx, k] * weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] # Update cluster with new point members - for j in range(n_indices): - idx = indices[j] - for k in range(n_features): - centers_new[cluster_idx, k] += X[idx * n_features + k] * sample_weight[idx] + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] += X[sample_idx * n_features + feature_idx] * sample_weight[sample_idx] # Update the count statistics for this center weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) alpha = 1 / weight_sums[cluster_idx] - for k in range(n_features): - centers_new[cluster_idx, k] *= alpha + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha else: # No sample was assigned to this cluster in this batch of data - for k in range(n_features): - centers_new[cluster_idx, k] = centers_old[cluster_idx, k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] def _minibatch_update_sparse( From 24a267fb8317ba55bf198ffc6dd2ba9c64e583fe Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:49:21 +0200 Subject: [PATCH 44/72] random_reassign --- sklearn/cluster/_kmeans.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index e140549ed99ff..8c6f119c398bc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1648,7 +1648,8 @@ def fit(self, X, y=None, sample_weight=None): # the choice is done as a function of the iteration index, and # the minimum number of counts, in order to force this # reassignment to happen every once in a while. 
- random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + random_reassign = random_state.randint( + 10 * (1 + self._counts.min())) == 0 # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( From 146a93b96ee1a1a31e76451369978d7b752e3c93 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Thu, 23 Jul 2020 13:23:27 +0200 Subject: [PATCH 45/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 43 ++++++++++------------ sklearn/cluster/_kmeans.py | 50 ++++---------------------- 2 files changed, 25 insertions(+), 68 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 49af1c7426d0a..60d10c47c320b 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -7,9 +7,7 @@ cimport numpy as np from cython cimport floating from cython.parallel cimport parallel, prange -from libc.math cimport sqrt from libc.stdlib cimport malloc, free -from libc.string cimport memcpy np.import_array() @@ -53,22 +51,23 @@ def _minibatch_update_dense( cdef: int n_samples = X.shape[0] int n_clusters = centers_old.shape[0] - int i + int cluster_idx int *indices with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters, schedule="static"): - update_center_dense(i, &X[0, 0], sample_weight, centers_old, - centers_new, weight_sums, labels, indices) + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_dense(cluster_idx, &X[0, 0], sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) free(indices) cdef void update_center_dense( - int i, + int cluster_idx, floating *X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers_old, # IN @@ -89,36 +88,34 @@ cdef void update_center_dense( # indices = np.where(labels == i)[0] k = 0 for j in range(n_samples): - if labels[j] == i: + if labels[j] == cluster_idx: indices[k] = j + wsum += sample_weight[j] k += 1 n_indices = k - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - if wsum > 0: - # Remove previous count scaling + # Undo the previous count-based scaling for this cluster center for k in range(n_features): - centers_new[i, k] = centers_old[i, k] * weight_sums[i] + centers_new[cluster_idx, k] = centers_old[cluster_idx, k] * weight_sums[cluster_idx] # Update cluster with new point members for j in range(n_indices): idx = indices[j] for k in range(n_features): - centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] + centers_new[cluster_idx, k] += X[idx * n_features + k] * sample_weight[idx] # Update the count statistics for this center - weight_sums[i] += wsum + weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] + alpha = 1 / weight_sums[cluster_idx] for k in range(n_features): - centers_new[i, k] *= alpha + centers_new[cluster_idx, k] *= alpha else: + # No sample was assigned to this cluster in this batch of data for k in range(n_features): - centers_new[i, k] = centers_old[i, k] + centers_new[cluster_idx, k] = centers_old[cluster_idx, k] def _minibatch_update_sparse( @@ -203,15 +200,12 @@ cdef void update_center_sparse( for j in range(n_samples): if labels[j] == i: indices[k] = j + wsum += sample_weight[j] k += 1 n_indices = k - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - if wsum > 0: - # Remove previous count scaling + # Undo the previous count-based scaling for this cluster center: for k in 
range(n_features): centers_new[i, k] = centers_old[i, k] * weight_sums[i] @@ -229,5 +223,6 @@ cdef void update_center_sparse( for k in range(n_features): centers_new[i, k] *= alpha else: + # No sample was assigned to this cluster in this batch of data for k in range(n_features): centers_new[i, k] = centers_old[i, k] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 04d323f2258ac..b85d2a34a314e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -24,7 +24,6 @@ from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import _deprecate_positional_args from ..utils import check_array -from ..utils import gen_batches from ..utils import check_random_state from ..utils import deprecated from ..utils.validation import check_is_fitted, _check_sample_weight @@ -1476,45 +1475,6 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") - def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, - centers): - """Compute labels and inertia using mini batches. - - This is slightly slower than doing everything at once but preventes - memory errors / segfaults. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data. - - sample_weight : ndarray of shape (n_samples,) - The weights for each observation in X. - - x_squared_norms : ndarray of shape (n_samples,) - Precomputed squared euclidean norm of each data point, to speed up - computations. - - centers : ndarray of shape (n_clusters, n_features) - The cluster centers. - - Returns - ------- - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - inertia : float - Sum of squared distances of points to nearest cluster. - """ - if self.verbose: - print('Computing label assignment and total inertia') - slices = gen_batches(X.shape[0], self.batch_size) - results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - centers, n_threads=self._n_threads) - for s in slices] - labels, inertia = zip(*results) - return np.hstack(labels), np.sum(inertia) - def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, centers_squared_diff, batch_inertia): """Helper function to encapsulate the early stopping logic""" @@ -1708,8 +1668,9 @@ def fit(self, X, y=None, sample_weight=None): self.n_iter_ = i + 1 if self.compute_labels: - self.labels_, self.inertia_ = self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_) + self.labels_, self.inertia_ = _labels_inertia( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) return self @@ -1819,8 +1780,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_)[0] + return _labels_inertia( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads)[0] def _more_tags(self): return { From 412864f256a77d1366f6baa4c88d617cadc8f2d9 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:06:15 +0200 Subject: [PATCH 46/72] threadpool-limit protection --- sklearn/cluster/_kmeans.py | 131 +++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 57 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index b85d2a34a314e..e140549ed99ff 100644 --- a/sklearn/cluster/_kmeans.py +++ 
b/sklearn/cluster/_kmeans.py @@ -573,6 +573,16 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, return labels, inertia +def _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, + centers, n_threads=None): + """Same as _labels_inertia but in a threadpool_limits context.""" + with threadpool_limits(limits=1, user_api="blas"): + labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, + centers, n_threads) + + return labels, inertia + + class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): """K-Means clustering. @@ -1102,8 +1112,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[0] + return _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[0] def score(self, X, y=None, sample_weight=None): """Opposite of the value of X on the K-means objective. @@ -1131,8 +1142,9 @@ def score(self, X, y=None, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return -_labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[1] + return -_labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[1] def _more_tags(self): return { @@ -1600,7 +1612,7 @@ def fit(self, X, y=None, sample_weight=None): random_state=random_state, init_size=self._init_size) # Compute inertia on a validation set. - _, inertia = _labels_inertia( + _, inertia = _labels_inertia_threadpool_limit( X_valid, sample_weight_valid, x_squared_norms_valid, cluster_centers, n_threads=self._n_threads) @@ -1625,50 +1637,52 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - # Perform the iterative optimization until convergence - for i in range(n_iter): - # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint(0, n_samples, - self.batch_size) - - # Randomly choose whether to perform random reassignment: - # the choice is done as a function of the iteration index, and the - # minimum number of counts, in order to force this reassignment to - # happen every once in a while. 
- random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 - - # Perform the actual update step on the minibatch data - batch_inertia = _mini_batch_step( - X=X[minibatch_indices], - x_squared_norms=x_squared_norms[minibatch_indices], - sample_weight=sample_weight[minibatch_indices], - centers=centers, - centers_new=centers_new, - weight_sums=self._counts, - random_state=random_state, - random_reassign=random_reassign, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose, - n_threads=self._n_threads) - - if self._tol > 0.0: - centers_squared_diff = np.sum((centers_new - centers)**2) - else: - centers_squared_diff = 0 - - centers, centers_new = centers_new, centers - - # Monitor convergence and do early stopping if necessary - if self._mini_batch_convergence( - i, n_iter, n_samples, centers_squared_diff, batch_inertia): - break + with threadpool_limits(limits=1, user_api="blas"): + # Perform the iterative optimization until convergence + for i in range(n_iter): + # Sample a minibatch from the full dataset + minibatch_indices = random_state.randint(0, n_samples, + self.batch_size) + + # Randomly choose whether to perform random reassignment: + # the choice is done as a function of the iteration index, and + # the minimum number of counts, in order to force this + # reassignment to happen every once in a while. + random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + + # Perform the actual update step on the minibatch data + batch_inertia = _mini_batch_step( + X=X[minibatch_indices], + x_squared_norms=x_squared_norms[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + centers=centers, + centers_new=centers_new, + weight_sums=self._counts, + random_state=random_state, + random_reassign=random_reassign, + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) + + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers)**2) + else: + centers_squared_diff = 0 + + centers, centers_new = centers_new, centers + + # Monitor convergence and do early stopping if necessary + if self._mini_batch_convergence( + i, n_iter, n_samples, centers_squared_diff, + batch_inertia): + break self.cluster_centers_ = centers self.n_iter_ = i + 1 if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) @@ -1734,20 +1748,21 @@ def partial_fit(self, X, y=None, sample_weight=None): random_reassign = self._random_state.randint( 10 * (1 + self._counts.min())) == 0 - _mini_batch_step(X, - x_squared_norms=x_squared_norms, - sample_weight=sample_weight, - centers=self.cluster_centers_, - centers_new=self.cluster_centers_, - weight_sums=self._counts, - random_state=self._random_state, - random_reassign=random_reassign, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose, - n_threads=self._n_threads) + with threadpool_limits(limits=1, user_api="blas"): + _mini_batch_step(X, + x_squared_norms=x_squared_norms, + sample_weight=sample_weight, + centers=self.cluster_centers_, + centers_new=self.cluster_centers_, + weight_sums=self._counts, + random_state=self._random_state, + random_reassign=random_reassign, + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( 
X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) @@ -1780,9 +1795,11 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia( + labels, _ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, - n_threads=self._n_threads)[0] + n_threads=self._n_threads) + + return labels def _more_tags(self): return { From 421a0410993cbce4a4e2c8d80813c5af05b90d59 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:36:45 +0200 Subject: [PATCH 47/72] idx --- sklearn/cluster/_k_means_minibatch.pyx | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 60d10c47c320b..6476336a4078b 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -81,41 +81,41 @@ cdef void update_center_dense( int n_features = centers_old.shape[1] floating alpha, tmp int n_indices - int j, k, idx + int k, sample_idx, feature_idx floating wsum = 0 # indices = np.where(labels == i)[0] k = 0 - for j in range(n_samples): - if labels[j] == cluster_idx: - indices[k] = j - wsum += sample_weight[j] + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] k += 1 n_indices = k if wsum > 0: # Undo the previous count-based scaling for this cluster center - for k in range(n_features): - centers_new[cluster_idx, k] = centers_old[cluster_idx, k] * weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] # Update cluster with new point members - for j in range(n_indices): - idx = indices[j] - for k in range(n_features): - centers_new[cluster_idx, k] += X[idx * n_features + k] * sample_weight[idx] + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] += X[sample_idx * n_features + feature_idx] * sample_weight[sample_idx] # Update the count statistics for this center weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) alpha = 1 / weight_sums[cluster_idx] - for k in range(n_features): - centers_new[cluster_idx, k] *= alpha + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha else: # No sample was assigned to this cluster in this batch of data - for k in range(n_features): - centers_new[cluster_idx, k] = centers_old[cluster_idx, k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] def _minibatch_update_sparse( From a6862f8817d5fd7a08a05cfdf8c971b71350184c Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:49:21 +0200 Subject: [PATCH 48/72] random_reassign --- sklearn/cluster/_kmeans.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index e140549ed99ff..8c6f119c398bc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1648,7 +1648,8 @@ def fit(self, X, y=None, sample_weight=None): # the choice is done as a function of the iteration index, and # the minimum number of counts, in order to force this # reassignment to happen every once in a while. 
- random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + random_reassign = random_state.randint( + 10 * (1 + self._counts.min())) == 0 # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( From 355627dd166a2316b1d07fcb64a7381c36b868a5 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 28 Jul 2020 12:09:49 +0200 Subject: [PATCH 49/72] wip --- sklearn/cluster/_kmeans.py | 19 ++++++++++++++++--- sklearn/cluster/tests/test_k_means.py | 7 ++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 8c6f119c398bc..9c6e88ef3201e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1419,7 +1419,7 @@ class MiniBatchKMeans(KMeans): def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, batch_size=100, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, - init_size=None, n_init=3, reassignment_ratio=0.01): + init_size=None, n_init=3, reassignment_ratio=0.01, mode=0): super().__init__( n_clusters=n_clusters, init=init, max_iter=max_iter, @@ -1430,6 +1430,7 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, self.compute_labels = compute_labels self.init_size = init_size self.reassignment_ratio = reassignment_ratio + self.mode = mode @deprecated("The attribute 'counts_' is deprecated in 0.24" # type: ignore " and will be removed in 0.26.") @@ -1648,8 +1649,15 @@ def fit(self, X, y=None, sample_weight=None): # the choice is done as a function of the iteration index, and # the minimum number of counts, in order to force this # reassignment to happen every once in a while. - random_reassign = random_state.randint( - 10 * (1 + self._counts.min())) == 0 + + if self.mode == 0: + random_reassign = random_state.randint( + 10 * (1 + self._counts.min())) == 0 + elif self.mode == 1: + random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + elif self.mode == 2: + random_reassign = ((i >= 10) * + random_state.choice([0, 1], p=[0.1, 0.9])) # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( @@ -1672,6 +1680,11 @@ def fit(self, X, y=None, sample_weight=None): centers, centers_new = centers_new, centers + _, inertiaa = _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, centers, + n_threads=self._n_threads) + print(f"{inertiaa},") + # Monitor convergence and do early stopping if necessary if self._mini_batch_convergence( i, n_iter, n_samples, centers_squared_diff, diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 93eb1a9679c33..0dbbed97ccfac 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -708,12 +708,13 @@ def test_unit_weights_vs_no_weights(Estimator, data): def test_scaled_weights(Estimator, data): # Check that scaling all sample weights by a common factor # shouldn't change the result - sample_weight = np.random.uniform(n_samples) + data = np.random.random_sample((100000, 10)) + sample_weight = np.random.RandomState(0).uniform(n_samples) - km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1, max_no_improvement=None, init='random') km_orig = clone(km).fit(data, sample_weight=sample_weight) km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) - + assert False assert_array_equal(km_orig.labels_, km_scaled.labels_) assert_allclose(km_orig.cluster_centers_, 
km_scaled.cluster_centers_) From d5a8935c90939ed34650c49907eb0f532ae3bf80 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Wed, 29 Jul 2020 13:13:36 +0200 Subject: [PATCH 50/72] wip --- sklearn/cluster/_kmeans.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 9c6e88ef3201e..fcc7b524e7498 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1638,6 +1638,8 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) + n_samples_seen_since_last_reassign = 0 + with threadpool_limits(limits=1, user_api="blas"): # Perform the iterative optimization until convergence for i in range(n_iter): @@ -1649,7 +1651,7 @@ def fit(self, X, y=None, sample_weight=None): # the choice is done as a function of the iteration index, and # the minimum number of counts, in order to force this # reassignment to happen every once in a while. - + if self.mode == 0: random_reassign = random_state.randint( 10 * (1 + self._counts.min())) == 0 @@ -1657,7 +1659,16 @@ def fit(self, X, y=None, sample_weight=None): random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 elif self.mode == 2: random_reassign = ((i >= 10) * - random_state.choice([0, 1], p=[0.1, 0.9])) + random_state.choice([0, 1], p=[0.9, 0.1])) + elif self.mode == 3: + random_reassign = (i >= 10) * True + elif self.mode == 4: + random_reassign = True + elif isinstance(self.mode, tuple): + n_samples_seen_since_last_reassign += self.batch_size + random_reassign = n_samples_seen_since_last_reassign >= (self.mode[0] * self.n_clusters) + if random_reassign: + n_samples_seen_since_last_reassign = 0 # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( From 4d29bc312140109f267a8d75777f063382933bb5 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 15:37:06 +0200 Subject: [PATCH 51/72] wip --- sklearn/cluster/_kmeans.py | 54 +++++++++++--------------------------- 1 file changed, 16 insertions(+), 38 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index fcc7b524e7498..4b92b9b123c5e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1551,6 +1551,18 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, self._no_improvement = no_improvement return False + def _random_reassign(self): + """Check if a random reassignment needs to be done. + + Do random reassignments each time 10 * n_clusters samples have been + processed. + """ + self._n_since_last_reassign += self.batch_size + if self._n_since_last_reassign >= (10 * self.n_clusters): + self._n_since_last_reassign = 0 + return True + return False + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. 
@@ -1638,7 +1650,7 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - n_samples_seen_since_last_reassign = 0 + self._n_since_last_reassign = 0 with threadpool_limits(limits=1, user_api="blas"): # Perform the iterative optimization until convergence @@ -1647,29 +1659,6 @@ def fit(self, X, y=None, sample_weight=None): minibatch_indices = random_state.randint(0, n_samples, self.batch_size) - # Randomly choose whether to perform random reassignment: - # the choice is done as a function of the iteration index, and - # the minimum number of counts, in order to force this - # reassignment to happen every once in a while. - - if self.mode == 0: - random_reassign = random_state.randint( - 10 * (1 + self._counts.min())) == 0 - elif self.mode == 1: - random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 - elif self.mode == 2: - random_reassign = ((i >= 10) * - random_state.choice([0, 1], p=[0.9, 0.1])) - elif self.mode == 3: - random_reassign = (i >= 10) * True - elif self.mode == 4: - random_reassign = True - elif isinstance(self.mode, tuple): - n_samples_seen_since_last_reassign += self.batch_size - random_reassign = n_samples_seen_since_last_reassign >= (self.mode[0] * self.n_clusters) - if random_reassign: - n_samples_seen_since_last_reassign = 0 - # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( X=X[minibatch_indices], @@ -1679,7 +1668,7 @@ def fit(self, X, y=None, sample_weight=None): centers_new=centers_new, weight_sums=self._counts, random_state=random_state, - random_reassign=random_reassign, + random_reassign=self._random_reassign(), reassignment_ratio=self.reassignment_ratio, verbose=self.verbose, n_threads=self._n_threads) @@ -1691,11 +1680,6 @@ def fit(self, X, y=None, sample_weight=None): centers, centers_new = centers_new, centers - _, inertiaa = _labels_inertia_threadpool_limit( - X, sample_weight, x_squared_norms, centers, - n_threads=self._n_threads) - print(f"{inertiaa},") - # Monitor convergence and do early stopping if necessary if self._mini_batch_convergence( i, n_iter, n_samples, centers_squared_diff, @@ -1765,13 +1749,7 @@ def partial_fit(self, X, y=None, sample_weight=None): # Initialize counts self._counts = np.zeros(self.n_clusters, dtype=X.dtype) - random_reassign = False - else: - # The lower the minimum count is, the more we do random - # reassignment, however, we don't want to do random - # reassignment too often, to allow for building up counts - random_reassign = self._random_state.randint( - 10 * (1 + self._counts.min())) == 0 + self._n_since_last_reassign = 0 with threadpool_limits(limits=1, user_api="blas"): _mini_batch_step(X, @@ -1781,7 +1759,7 @@ def partial_fit(self, X, y=None, sample_weight=None): centers_new=self.cluster_centers_, weight_sums=self._counts, random_state=self._random_state, - random_reassign=random_reassign, + random_reassign=self._random_reassign(), reassignment_ratio=self.reassignment_ratio, verbose=self.verbose, n_threads=self._n_threads) From de3180e4ea4b3aba2b633831a3e7ddf6a07209da Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 16:10:05 +0200 Subject: [PATCH 52/72] wip --- sklearn/cluster/_kmeans.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 2703dfc8fc4ea..5fd5d3c566c85 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1577,9 +1577,12 @@ def 
_random_reassign(self): Do random reassignments each time 10 * n_clusters samples have been processed. + + If there are empty clusters we always want to reassign. """ self._n_since_last_reassign += self.batch_size - if self._n_since_last_reassign >= (10 * self.n_clusters): + if ((self._counts == 0).any() or + self._n_since_last_reassign >= (10 * self.n_clusters)): self._n_since_last_reassign = 0 return True return False From a3b55b781e0f29f8bbf7d45bbc5f5a30534f29fe Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 16:11:11 +0200 Subject: [PATCH 53/72] ellipsis --- sklearn/cluster/_kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 5fd5d3c566c85..5e244f2340fba 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1431,8 +1431,8 @@ class MiniBatchKMeans(KMeans): ... batch_size=6, ... max_iter=10).fit(X) >>> kmeans.cluster_centers_ - array([[2.32394366, 1.16901408], - [3.4 , 4.36 ]]) + array([[2.3..., 1.1...], + [3.4..., 4.3...]]) >>> kmeans.predict([[0, 0], [4, 4]]) array([0, 1], dtype=int32) """ From 4d06cc282a5c8d57aaaf14b9236e77fa5ed76cef Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 16:19:33 +0200 Subject: [PATCH 54/72] idx --- sklearn/cluster/_k_means_minibatch.pyx | 46 +++++++++++++------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 6476336a4078b..942c058b07a92 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -159,23 +159,23 @@ def _minibatch_update_sparse( int[::1] X_indptr = X.indptr int n_samples = X.shape[0] int n_clusters = centers_old.shape[0] - int i + int cluster_idx int *indices with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters, schedule="static"): - update_center_sparse(i, X_data, X_indices, X_indptr, sample_weight, - centers_old, centers_new, weight_sums, labels, - indices) + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_sparse(cluster_idx, X_data, X_indices, X_indptr, + sample_weight, centers_old, centers_new, + weight_sums, labels, indices) free(indices) cdef void update_center_sparse( - int i, + int cluster_idx, floating[::1] X_data, # IN int[::1] X_indices, # IN int[::1] X_indptr, # IN @@ -191,38 +191,38 @@ cdef void update_center_sparse( int n_features = centers_old.shape[1] floating alpha, tmp int n_indices - int j, k, idx + int k, sample_idx, feature_idx floating wsum = 0 # indices = np.where(labels == i)[0] k = 0 - for j in range(n_samples): - if labels[j] == i: - indices[k] = j - wsum += sample_weight[j] + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] k += 1 n_indices = k if wsum > 0: # Undo the previous count-based scaling for this cluster center: - for k in range(n_features): - centers_new[i, k] = centers_old[i, k] * weight_sums[i] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] # Update cluster with new point members - for j in range(n_indices): - idx = indices[j] - for k in range(X_indptr[idx], X_indptr[idx + 1]): - centers_new[i, X_indices[k]] += X_data[k] * sample_weight[idx] + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in 
range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): + centers_new[cluster_idx, X_indices[feature_idx]] += X_data[feature_idx] * sample_weight[sample_idx] # Update the count statistics for this center - weight_sums[i] += wsum + weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] - for k in range(n_features): - centers_new[i, k] *= alpha + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha else: # No sample was assigned to this cluster in this batch of data - for k in range(n_features): - centers_new[i, k] = centers_old[i, k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] From d354434753c9f1cc5037eeab0902df997c216bae Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 16:34:54 +0200 Subject: [PATCH 55/72] wip --- sklearn/cluster/_kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 1ed631bc8e9ba..fa38f1762471c 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1431,8 +1431,8 @@ class MiniBatchKMeans(KMeans): ... batch_size=6, ... max_iter=10).fit(X) >>> kmeans.cluster_centers_ - array([[2.3..., 1.1...], - [3.4..., 4.3...]]) + array([[1.19..., 1.22...], + [4.03..., 2.46...]]) >>> kmeans.predict([[0, 0], [4, 4]]) array([0, 1], dtype=int32) """ From c6a0456bd0c0bd31693ae370bb05ed05c05460bc Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 17:43:56 +0200 Subject: [PATCH 56/72] avoid calling openmp_effective_n_threads again --- sklearn/cluster/_kmeans.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index fa38f1762471c..fb1020f59a851 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -537,7 +537,7 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, def _labels_inertia(X, sample_weight, x_squared_norms, centers, - n_threads=None): + n_threads=1): """E step of the K-means EM algorithm. Compute the labels and the inertia of the given samples and centers. @@ -558,7 +558,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, centers : ndarray of shape (n_clusters, n_features) The cluster centers. - n_threads : int, default=None + n_threads : int, default=1 The number of OpenMP threads to use for the computation. Parallelism is sample-wise on the main cython loop which assigns each sample to its closest center. 
@@ -574,8 +574,6 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_samples = X.shape[0] n_clusters = centers.shape[0] - n_threads = _openmp_effective_n_threads(n_threads) - labels = np.full(n_samples, -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) center_shift = np.zeros_like(weight_in_clusters) @@ -597,7 +595,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, def _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, - centers, n_threads=None): + centers, n_threads=1): """Same as _labels_inertia but in a threadpool_limits context.""" with threadpool_limits(limits=1, user_api="blas"): labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, From 9c9303738ebc23743d2b34181bb91f0b372e7288 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 4 Aug 2020 18:03:30 +0200 Subject: [PATCH 57/72] cln --- sklearn/cluster/tests/test_k_means.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index ad2d592667b05..6fc2f05d20071 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -793,7 +793,6 @@ def test_unit_weights_vs_no_weights(Estimator, data): def test_scaled_weights(Estimator, data): # Check that scaling all sample weights by a common factor # shouldn't change the result - data = np.random.random_sample((100000, 10)) sample_weight = np.random.RandomState(0).uniform(n_samples) km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) From d263d308e67721ea606661c9e3149bfab17d5e23 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 23 Oct 2020 17:08:35 +0200 Subject: [PATCH 58/72] fix merging mistake --- sklearn/cluster/tests/test_k_means.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index f4b2497bb0bcd..6fc2f05d20071 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -551,6 +551,7 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr): # re-predict labels for training set using fit_predict pred = km.fit_predict(X) + assert_allclose(v_measure_score(pred, labels), 1) # predict centroid labels pred = km.predict(km.cluster_centers_) From b14492aafe4e82f7ea789dcc9e0ca080f7e6f9ed Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 3 Nov 2020 15:10:27 +0100 Subject: [PATCH 59/72] merge master --- sklearn/cluster/_k_means_elkan.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index eaa37fc513291..4fa9f61d54646 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -18,7 +18,7 @@ from libc.stdlib cimport calloc, free from libc.string cimport memset, memcpy from ..utils.extmath import row_norms -from ._k_means_fast import CHUNK_SIZE +from ._k_means_common import CHUNK_SIZE from ._k_means_common cimport _relocate_empty_clusters_dense from ._k_means_common cimport _relocate_empty_clusters_sparse from ._k_means_common cimport _euclidean_dense_dense From a3e1b11e80c58b3c5ea51ce8ff20268af7e3fda3 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 3 Nov 2020 15:21:09 +0100 Subject: [PATCH 60/72] change batch_size default --- sklearn/cluster/_kmeans.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 
d980c839cafdc..504dbacfbf231 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1342,8 +1342,12 @@ class MiniBatchKMeans(KMeans): Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. - batch_size : int, default=100 + batch_size : int, default=1024 Size of the mini batches. + For faster compuations, you can set the ``batch_size`` greater than + 256 * number of cores to enable parallelism on all cores. + + .. versionchanged:: XXX verbose : int, default=0 Verbosity mode. From 10695c65a781f13a85a521c345dbde721b81c936 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 3 Nov 2020 17:24:00 +0100 Subject: [PATCH 61/72] actually change batch size --- sklearn/cluster/_kmeans.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 504dbacfbf231..a7fc462851310 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1472,7 +1472,7 @@ class MiniBatchKMeans(KMeans): """ @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, - batch_size=100, verbose=0, compute_labels=True, + batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01, mode=0): @@ -1518,6 +1518,7 @@ def _check_params(self, X): if self.batch_size <= 0: raise ValueError( f"batch_size should be > 0, got {self.batch_size} instead.") + self._batch_size = min(self.batch_size, X.shape[0]) # init_size if self.init_size is not None and self.init_size <= 0: @@ -1525,7 +1526,7 @@ def _check_params(self, X): f"init_size should be > 0, got {self.init_size} instead.") self._init_size = self.init_size if self._init_size is None: - self._init_size = 3 * self.batch_size + self._init_size = 3 * self._batch_size if self._init_size < self.n_clusters: self._init_size = 3 * self.n_clusters elif self._init_size < self.n_clusters: @@ -1548,7 +1549,7 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, """Helper function to encapsulate the early stopping logic""" # Normalize inertia to be able to compare values when # batch_size changes - batch_inertia /= self.batch_size + batch_inertia /= self._batch_size # Ignore first iteration because it's inertia from initialization. if iteration_idx == 0: @@ -1564,7 +1565,7 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, if ewa_inertia is None: ewa_inertia = batch_inertia else: - alpha = self.batch_size * 2.0 / (n_samples + 1) + alpha = self._batch_size * 2.0 / (n_samples + 1) alpha = min(alpha, 1) ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha @@ -1614,7 +1615,7 @@ def _random_reassign(self): If there are empty clusters we always want to reassign. 
""" - self._n_since_last_reassign += self.batch_size + self._n_since_last_reassign += self._batch_size if ((self._counts == 0).any() or self._n_since_last_reassign >= (10 * self.n_clusters)): self._n_since_last_reassign = 0 @@ -1661,7 +1662,7 @@ def fit(self, X, y=None, sample_weight=None): init = check_array(init, dtype=X.dtype, copy=True, order='C') self._validate_center_shape(X, init) - self._check_mkl_vcomp(X, self.batch_size) + self._check_mkl_vcomp(X, self._batch_size) # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) @@ -1711,7 +1712,7 @@ def fit(self, X, y=None, sample_weight=None): # Initialize number of samples seen since last reassignment self._n_since_last_reassign = 0 - n_batches = int(np.ceil(float(n_samples) / self.batch_size)) + n_batches = int(np.ceil(float(n_samples) / self._batch_size)) n_iter = int(self.max_iter * n_batches) with threadpool_limits(limits=1, user_api="blas"): @@ -1719,7 +1720,7 @@ def fit(self, X, y=None, sample_weight=None): for i in range(n_iter): # Sample a minibatch from the full dataset minibatch_indices = random_state.randint(0, n_samples, - self.batch_size) + self._batch_size) # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( From e4e15f5c9727d7316914a35a375d7be0156e8080 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 13 Nov 2020 12:53:04 +0100 Subject: [PATCH 62/72] reassignment_ratio docstring --- sklearn/cluster/_kmeans.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 61798c71b4638..48d5c9328f9fc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1492,11 +1492,12 @@ class MiniBatchKMeans(KMeans): best of the ``n_init`` initializations as measured by inertia. reassignment_ratio : float, default=0.01 - Control the fraction of the maximum number of counts for a - center to be reassigned. A higher value means that low count - centers are more easily reassigned, which means that the - model will take longer to converge, but should converge in a - better clustering. + Control the fraction of the maximum number of counts for a center to + be reassigned. A higher value means that low count centers are more + easily reassigned, which means that the model will take longer to + converge, but should converge in a better clustering. A too high value + may however cause convergence issues, especially with a small batch + size. Attributes ---------- From 5f4f065dc7765ef83396ced797c0af253462a915 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 28 Jan 2021 16:01:58 +0100 Subject: [PATCH 63/72] cln --- sklearn/cluster/_kmeans.py | 89 ++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 487e67c8d35b1..6ea0b5fd6b421 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1440,7 +1440,8 @@ class MiniBatchKMeans(KMeans): For faster compuations, you can set the ``batch_size`` greater than 256 * number of cores to enable parallelism on all cores. - .. versionchanged:: XXX + .. versionchanged:: 1.0 + `batch_size` default changed from 100 to 1024. verbose : int, default=0 Verbosity mode. @@ -1510,7 +1511,12 @@ class MiniBatchKMeans(KMeans): center, weighted by the sample weights if provided. n_iter_ : int - Number of batches processed. + Number of iterations over the full dataset. 
+ + n_steps_ : int + Number of minibatches processed. + + .. versionadded:: 1.0 counts_ : ndarray of shape (n_clusters,) Weigth sum of each cluster. @@ -1570,7 +1576,7 @@ class MiniBatchKMeans(KMeans): def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, - init_size=None, n_init=3, reassignment_ratio=0.01, mode=0): + init_size=None, n_init=3, reassignment_ratio=0.01): super().__init__( n_clusters=n_clusters, init=init, max_iter=max_iter, @@ -1581,7 +1587,6 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, self.compute_labels = compute_labels self.init_size = init_size self.reassignment_ratio = reassignment_ratio - self.mode = mode @deprecated("The attribute 'counts_' is deprecated in 0.24" # type: ignore " and will be removed in 1.1 (renaming of 0.26).") @@ -1640,67 +1645,63 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") - def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, + def _mini_batch_convergence(self, step, n_steps, n_samples, centers_squared_diff, batch_inertia): """Helper function to encapsulate the early stopping logic""" # Normalize inertia to be able to compare values when # batch_size changes batch_inertia /= self._batch_size + # count steps starting from 1 for user friendly verbose mode. + step = step + 1 + # Ignore first iteration because it's inertia from initialization. - if iteration_idx == 0: + if step == 1: if self.verbose: - print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " - f"mean batch inertia: {batch_inertia}") + print(f"Minibatch step {step}/{n_steps}: mean batch " + f"inertia: {batch_inertia}") return False # Compute an Exponentially Weighted Average of the inertia to # monitor the convergence while discarding minibatch-local stochastic # variability: https://en.wikipedia.org/wiki/Moving_average - ewa_inertia = self._ewa_inertia - if ewa_inertia is None: - ewa_inertia = batch_inertia + if self._ewa_inertia is None: + self._ewa_inertia = batch_inertia else: alpha = self._batch_size * 2.0 / (n_samples + 1) alpha = min(alpha, 1) - ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha + self._ewa_inertia = ( + self._ewa_inertia * (1 - alpha) + batch_inertia * alpha) # Log progress to be able to monitor convergence if self.verbose: - print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " - f"mean batch inertia: {batch_inertia}, ewa inertia: " - f"{ewa_inertia}") + print(f"Minibatch step {step}/{n_steps}: mean batch inertia: " + f"{batch_inertia}, ewa inertia: {self._ewa_inertia}") # Early stopping based on absolute tolerance on squared change of - # centers position (using EWA smoothing) + # centers position if self._tol > 0.0 and centers_squared_diff <= self._tol: if self.verbose: - print(f"Converged (small centers change) at iteration " - f"{iteration_idx + 1}/{n_iter}") + print(f"Converged (small centers change) at step " + f"{step}/{n_steps}") return True # Early stopping heuristic due to lack of improvement on smoothed # inertia - ewa_inertia_min = self._ewa_inertia_min - no_improvement = self._no_improvement - if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: - no_improvement = 0 - ewa_inertia_min = ewa_inertia + if (self._ewa_inertia_min is None or + self._ewa_inertia < self._ewa_inertia_min): + self._no_improvement = 0 + self._ewa_inertia_min = self._ewa_inertia else: - no_improvement += 1 + 
self._no_improvement += 1 if (self.max_no_improvement is not None - and no_improvement >= self.max_no_improvement): + and self._no_improvement >= self.max_no_improvement): if self.verbose: - print(f"Converged (lack of improvement in inertia) at " - f"iteration {iteration_idx}/{n_iter}") + print(f"Converged (lack of improvement in inertia) at step " + f"{step}/{n_steps}") return True - # update the convergence context to maintain state across successive - # calls: - self._ewa_inertia = ewa_inertia - self._ewa_inertia_min = ewa_inertia_min - self._no_improvement = no_improvement return False def _random_reassign(self): @@ -1770,7 +1771,7 @@ def fit(self, X, y=None, sample_weight=None): sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] - # perform several inits with random sub-sets + # perform several inits with random subsets best_inertia = None for init_idx in range(self._n_init): if self.verbose: @@ -1808,12 +1809,12 @@ def fit(self, X, y=None, sample_weight=None): # Initialize number of samples seen since last reassignment self._n_since_last_reassign = 0 - n_batches = int(np.ceil(float(n_samples) / self._batch_size)) - n_iter = int(self.max_iter * n_batches) + n_steps_per_epoch = int(np.ceil(n_samples / self._batch_size)) + n_steps = self.max_iter * n_steps_per_epoch with threadpool_limits(limits=1, user_api="blas"): # Perform the iterative optimization until convergence - for i in range(n_iter): + for i in range(n_steps): # Sample a minibatch from the full dataset minibatch_indices = random_state.randint(0, n_samples, self._batch_size) @@ -1841,13 +1842,14 @@ def fit(self, X, y=None, sample_weight=None): # Monitor convergence and do early stopping if necessary if self._mini_batch_convergence( - i, n_iter, n_samples, centers_squared_diff, + i, n_steps, n_samples, centers_squared_diff, batch_inertia): break self.cluster_centers_ = centers - self.n_iter_ = i + 1 + self.n_steps_ = i + 1 + self.n_iter_ = (i + 1) // n_steps_per_epoch if self.compute_labels: self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( @@ -1876,22 +1878,23 @@ def partial_fit(self, X, y=None, sample_weight=None): ------- self """ - is_first_call_to_partial_fit = not hasattr(self, 'cluster_centers_') + has_centers = hasattr(self, 'cluster_centers_') X = self._validate_data(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', accept_large_sparse=False, - reset=is_first_call_to_partial_fit) + reset=not has_centers) self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self.n_steps_ = getattr(self, "n_steps_", 0) # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) - if is_first_call_to_partial_fit: - # this is the first call to partial_fit on this object + if not has_centers: + # this instance has not been fitted yet (fit or partial_fit) self._check_params(X) # Validate init array @@ -1931,6 +1934,8 @@ def partial_fit(self, X, y=None, sample_weight=None): X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) + self.n_steps_ += 1 + return self def predict(self, X, sample_weight=None): From ab4310a579e6ce7895f900e903037f684f35e1e2 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 28 Jan 2021 18:03:51 +0100 Subject: [PATCH 64/72] make n_iter_ count number of started epochs --- sklearn/cluster/_kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 6ea0b5fd6b421..59a4fc2afd529 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1849,7 +1849,7 @@ def fit(self, X, y=None, sample_weight=None): self.cluster_centers_ = centers self.n_steps_ = i + 1 - self.n_iter_ = (i + 1) // n_steps_per_epoch + self.n_iter_ = np.ceil((i + 1) / n_steps_per_epoch) if self.compute_labels: self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( From 5aafed3666da087f84a61ecac6355448f8da51bd Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 5 Feb 2021 12:34:25 +0100 Subject: [PATCH 65/72] improve tests and docs --- sklearn/cluster/_kmeans.py | 7 +++--- sklearn/cluster/tests/test_k_means.py | 31 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 59a4fc2afd529..79d222036394b 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1352,6 +1352,8 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, ------- inertia : float Sum of squared distances of samples to their closest cluster center. + The inertia is computed after finding the labels and before updating + the centers. """ # Perform label assignment to nearest centers labels, inertia = _labels_inertia(X, sample_weight, @@ -1809,8 +1811,7 @@ def fit(self, X, y=None, sample_weight=None): # Initialize number of samples seen since last reassignment self._n_since_last_reassign = 0 - n_steps_per_epoch = int(np.ceil(n_samples / self._batch_size)) - n_steps = self.max_iter * n_steps_per_epoch + n_steps = (self.max_iter * n_samples) // self._batch_size with threadpool_limits(limits=1, user_api="blas"): # Perform the iterative optimization until convergence @@ -1849,7 +1850,7 @@ def fit(self, X, y=None, sample_weight=None): self.cluster_centers_ = centers self.n_steps_ = i + 1 - self.n_iter_ = np.ceil((i + 1) / n_steps_per_epoch) + self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples)) if self.compute_labels: self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index a8b462cb825a9..c0bf79efc85ec 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -467,6 +467,37 @@ def test_minibatch_kmeans_init_size(): assert km._init_size == n_samples +@pytest.mark.parametrize("tol, max_no_improvement", [(1e-4, None), (0, 10)]) +def test_minibatch_declared_convergence(tol, max_no_improvement): + # Check that convergence based on small center change is achievable. + X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) + + km = MiniBatchKMeans(n_clusters=3, init=centers, batch_size=20, tol=tol, + random_state=0, max_iter=10, + max_no_improvement=max_no_improvement) + + km.fit(X) + assert 1 < km.n_iter_ < 10 + + +def test_minibatch_iter_steps(): + # Check consistency of n_iter_ and n_steps_ attributes. 
+ batch_size = 30 + n_samples = X.shape[0] + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, + random_state=0).fit(X) + + # n_iter_ is the number of started epochs + assert km.n_iter_ == np.ceil((km.n_steps_ * batch_size) / n_samples) + + # without stopping condition, max_iter should be reached + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, random_state=0, + tol=0, max_no_improvement=None, max_iter=10).fit(X) + + assert km.n_iter_ == 10 + assert km.n_steps_ == (10 * n_samples) // batch_size + + def test_kmeans_copyx(): # Check that copy_x=False returns nearly equal X after de-centering. my_X = X.copy() From 0a71a92eef2e58a72c7405f8d7dee3357b8781d3 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 15:21:43 +0100 Subject: [PATCH 66/72] don't move kmpp --- sklearn/cluster/_kmeans.py | 187 +++++++++++++++++++------------------ 1 file changed, 94 insertions(+), 93 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 79d222036394b..c77fd14f3faaa 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -47,99 +47,6 @@ ############################################################################### # Initialization heuristic -def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, - random_state=None, n_local_trials=None): - """Init n_clusters seeds according to k-means++ - - .. versionadded:: 0.24 - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data to pick seeds from. - - n_clusters : int - The number of centroids to initialize - - x_squared_norms : array-like of shape (n_samples,), default=None - Squared Euclidean norm of each data point. - - random_state : int or RandomState instance, default=None - Determines random number generation for centroid initialization. Pass - an int for reproducible output across multiple function calls. - See :term:`Glossary `. - - n_local_trials : int, default=None - The number of seeding trials for each center (except the first), - of which the one reducing inertia the most is greedily chosen. - Set to None to make the number of trials depend logarithmically - on the number of seeds (2+log(k)). - - Returns - ------- - centers : ndarray of shape (n_clusters, n_features) - The inital centers for k-means. - - indices : ndarray of shape (n_clusters,) - The index location of the chosen centers in the data array X. For a - given index and center, X[index] = center. - - Notes - ----- - Selects initial cluster centers for k-mean clustering in a smart way - to speed up convergence. see: Arthur, D. and Vassilvitskii, S. - "k-means++: the advantages of careful seeding". ACM-SIAM symposium - on Discrete algorithms. 2007 - - Examples - -------- - - >>> from sklearn.cluster import kmeans_plusplus - >>> import numpy as np - >>> X = np.array([[1, 2], [1, 4], [1, 0], - ... 
[10, 2], [10, 4], [10, 0]]) - >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) - >>> centers - array([[10, 4], - [ 1, 0]]) - >>> indices - array([4, 2]) - """ - - # Check data - check_array(X, accept_sparse='csr', - dtype=[np.float64, np.float32]) - - if X.shape[0] < n_clusters: - raise ValueError(f"n_samples={X.shape[0]} should be >= " - f"n_clusters={n_clusters}.") - - # Check parameters - if x_squared_norms is None: - x_squared_norms = row_norms(X, squared=True) - else: - x_squared_norms = check_array(x_squared_norms, - dtype=X.dtype, - ensure_2d=False) - - if x_squared_norms.shape[0] != X.shape[0]: - raise ValueError( - f"The length of x_squared_norms {x_squared_norms.shape[0]} should " - f"be equal to the length of n_samples {X.shape[0]}.") - - if n_local_trials is not None and n_local_trials < 1: - raise ValueError( - f"n_local_trials is set to {n_local_trials} but should be an " - f"integer value greater than zero.") - - random_state = check_random_state(random_state) - - # Call private k-means++ - centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms, - random_state, n_local_trials) - - return centers, indices - def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): @@ -1979,3 +1886,97 @@ def _more_tags(self): 'zero sample_weight is not equivalent to removing samples', } } + + +def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, + random_state=None, n_local_trials=None): + """Init n_clusters seeds according to k-means++ + + .. versionadded:: 0.24 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to pick seeds from. + + n_clusters : int + The number of centroids to initialize + + x_squared_norms : array-like of shape (n_samples,), default=None + Squared Euclidean norm of each data point. + + random_state : int or RandomState instance, default=None + Determines random number generation for centroid initialization. Pass + an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_local_trials : int, default=None + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)). + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + The inital centers for k-means. + + indices : ndarray of shape (n_clusters,) + The index location of the chosen centers in the data array X. For a + given index and center, X[index] = center. + + Notes + ----- + Selects initial cluster centers for k-mean clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 2007 + + Examples + -------- + + >>> from sklearn.cluster import kmeans_plusplus + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... 
[10, 2], [10, 4], [10, 0]]) + >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) + >>> centers + array([[10, 4], + [ 1, 0]]) + >>> indices + array([4, 2]) + """ + + # Check data + check_array(X, accept_sparse='csr', + dtype=[np.float64, np.float32]) + + if X.shape[0] < n_clusters: + raise ValueError(f"n_samples={X.shape[0]} should be >= " + f"n_clusters={n_clusters}.") + + # Check parameters + if x_squared_norms is None: + x_squared_norms = row_norms(X, squared=True) + else: + x_squared_norms = check_array(x_squared_norms, + dtype=X.dtype, + ensure_2d=False) + + if x_squared_norms.shape[0] != X.shape[0]: + raise ValueError( + f"The length of x_squared_norms {x_squared_norms.shape[0]} should " + f"be equal to the length of n_samples {X.shape[0]}.") + + if n_local_trials is not None and n_local_trials < 1: + raise ValueError( + f"n_local_trials is set to {n_local_trials} but should be an " + f"integer value greater than zero.") + + random_state = check_random_state(random_state) + + # Call private k-means++ + centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms, + random_state, n_local_trials) + + return centers, indices From eacb6cc582b99949d5214b0ff0e3c525c5bbb8ac Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 17:40:36 +0100 Subject: [PATCH 67/72] address comments --- sklearn/cluster/_k_means_minibatch.pyx | 8 ++++---- sklearn/cluster/_kmeans.py | 10 +++++++--- sklearn/cluster/tests/test_k_means.py | 15 ++++++++++++--- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 942c058b07a92..1f52625279aef 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -74,7 +74,7 @@ cdef void update_center_dense( floating[:, ::1] centers_new, # OUT floating[::1] weight_sums, # INOUT int[::1] labels, # IN - int *indices) nogil: # OUT + int *indices) nogil: # TMP """Update of a single center for dense MinibatchKMeans""" cdef: int n_samples = sample_weight.shape[0] @@ -85,7 +85,7 @@ cdef void update_center_dense( floating wsum = 0 - # indices = np.where(labels == i)[0] + # indices = np.where(labels == cluster_idx)[0] k = 0 for sample_idx in range(n_samples): if labels[sample_idx] == cluster_idx: @@ -184,7 +184,7 @@ cdef void update_center_sparse( floating[:, ::1] centers_new, # OUT floating[::1] weight_sums, # INOUT int[::1] labels, # IN - int *indices) nogil: # OUT + int *indices) nogil: # TMP """Update of a single center for sparse MinibatchKMeans""" cdef: int n_samples = sample_weight.shape[0] @@ -195,7 +195,7 @@ cdef void update_center_sparse( floating wsum = 0 - # indices = np.where(labels == i)[0] + # indices = np.where(labels == cluster_idx)[0] k = 0 for sample_idx in range(n_samples): if labels[sample_idx] == cluster_idx: diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index c77fd14f3faaa..7df9c9a2d33b8 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1415,9 +1415,11 @@ class MiniBatchKMeans(KMeans): inertia_ : float The value of the inertia criterion associated with the chosen - partition (if compute_labels is set to True). The inertia is - defined as the sum of square distances of samples to their cluster - center, weighted by the sample weights if provided. + partition if compute_labels is set to True. If compute_labels is set to + False, it's an approximation of the inertia based on an exponentially + weighted average of the batch inertiae. 
+ The inertia is defined as the sum of square distances of samples to + their cluster center, weighted by the sample weights if provided. n_iter_ : int Number of iterations over the full dataset. @@ -1763,6 +1765,8 @@ def fit(self, X, y=None, sample_weight=None): self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) + else: + self.inertia_ = self._ewa_inertia * n_samples return self diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index c0bf79efc85ec..fccadd68e821c 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -468,17 +468,24 @@ def test_minibatch_kmeans_init_size(): @pytest.mark.parametrize("tol, max_no_improvement", [(1e-4, None), (0, 10)]) -def test_minibatch_declared_convergence(tol, max_no_improvement): - # Check that convergence based on small center change is achievable. +def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): + # Check convergence detection based on ewa batch inertia or on + # small center change. X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) km = MiniBatchKMeans(n_clusters=3, init=centers, batch_size=20, tol=tol, - random_state=0, max_iter=10, + random_state=0, max_iter=10, verbose=1, max_no_improvement=max_no_improvement) km.fit(X) assert 1 < km.n_iter_ < 10 + captured = capsys.readouterr() + if max_no_improvement is None: + assert "Converged (small centers change)" in captured.out + if tol == 0: + assert "Converged (lack of improvement in inertia)" in captured.out + def test_minibatch_iter_steps(): # Check consistency of n_iter_ and n_steps_ attributes. @@ -489,6 +496,7 @@ def test_minibatch_iter_steps(): # n_iter_ is the number of started epochs assert km.n_iter_ == np.ceil((km.n_steps_ * batch_size) / n_samples) + assert isinstance(km.n_iter_, int) # without stopping condition, max_iter should be reached km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, random_state=0, @@ -496,6 +504,7 @@ def test_minibatch_iter_steps(): assert km.n_iter_ == 10 assert km.n_steps_ == (10 * n_samples) // batch_size + assert isinstance(km.n_steps_, int) def test_kmeans_copyx(): From c71b5b5beaeac716028339516ac9de7d5386cb86 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 18:10:44 +0100 Subject: [PATCH 68/72] add what's new entry --- doc/whats_new/v1.0.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a00523ec2223b..a1c68d5132e6a 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -56,9 +56,27 @@ Changelog in multicore settings. :pr:`19052` by :user:`Yusuke Nagasaka `. +- |Efficiency| :class:`cluster.MiniBatchKMeans` is now faster in multicore + settings. :pr:`17622` by :user:`Jérémie du Boisberranger `. + - |Fix| Fixes incorrect multiple data-conversion warnings when clustering boolean data. :pr:`19046` by :user:`Surya Prakash `. +- |Fix| Fixed a bug in :class:`cluster.MiniBatchKMeans` where the sample + weights were partially ignored when the input is sparse. :pr:`17622` by + :user:`Jérémie du Boisberranger `. + +- |Fix| Improved convergence detection based on center change in + :class:`cluster.MiniBatchKMeans` which was almost never achievable. + :pr:`17622` by :user:`Jérémie du Boisberranger `. 
+ +- |API| the default value for the `batch_size` parameter of + :class:`MiniBatchKMeans` was changed from 100 to 1024 due to efficiency + reasons. The `n_iter_` attribute of :class:`MiniBatchKMeans` now reports the + number of started epochs and the `n_steps_` attribute reports the number of + mini batches processed. :pr:`17622` + by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.ensemble` ....................... From 5d4e3d9faafc307e0ed58ddde50f480d6eb224a3 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 18:13:20 +0100 Subject: [PATCH 69/72] remove warning in test --- sklearn/cluster/tests/test_k_means.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index fccadd68e821c..3cb9e395ab743 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -474,7 +474,7 @@ def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) km = MiniBatchKMeans(n_clusters=3, init=centers, batch_size=20, tol=tol, - random_state=0, max_iter=10, verbose=1, + random_state=0, max_iter=10, n_init=1, verbose=1, max_no_improvement=max_no_improvement) km.fit(X) @@ -485,7 +485,7 @@ def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): assert "Converged (small centers change)" in captured.out if tol == 0: assert "Converged (lack of improvement in inertia)" in captured.out - + def test_minibatch_iter_steps(): # Check consistency of n_iter_ and n_steps_ attributes. From ebefe18b1d74353e0ee10ae7d7edf777bcde6430 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 18:20:49 +0100 Subject: [PATCH 70/72] lint --- sklearn/cluster/tests/test_k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 3cb9e395ab743..a56c2d8e55d8e 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -485,7 +485,7 @@ def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): assert "Converged (small centers change)" in captured.out if tol == 0: assert "Converged (lack of improvement in inertia)" in captured.out - + def test_minibatch_iter_steps(): # Check consistency of n_iter_ and n_steps_ attributes. From be0c9487141901d895fcb3093016a2918555990a Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 7 Apr 2021 19:09:30 +0200 Subject: [PATCH 71/72] adress comments --- sklearn/cluster/_k_means_minibatch.pyx | 6 +++--- sklearn/cluster/_kmeans.py | 17 +++++++++++------ sklearn/cluster/tests/test_k_means.py | 4 ++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 1f52625279aef..ab5aee35ea075 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -2,7 +2,7 @@ # TODO: We still need to use ndarrays instead of typed memoryviews when using # fused types and when the array may be read-only (for instance when it's -# provided by the user). This is fixed in cython > 0.3. +# provided by the user). This will be fixed in cython >= 0.3. 
cimport numpy as np from cython cimport floating @@ -79,7 +79,7 @@ cdef void update_center_dense( cdef: int n_samples = sample_weight.shape[0] int n_features = centers_old.shape[1] - floating alpha, tmp + floating alpha int n_indices int k, sample_idx, feature_idx @@ -189,7 +189,7 @@ cdef void update_center_sparse( cdef: int n_samples = sample_weight.shape[0] int n_features = centers_old.shape[1] - floating alpha, tmp + floating alpha int n_indices int k, sample_idx, feature_idx diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 1ccff8fe8e454..e9f952b58cb1b 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1309,7 +1309,7 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, ---------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) - The original data array. In sparse, must be in CSR format. + The original data array. If sparse, must be in CSR format. x_squared_norms : ndarray of shape (n_samples,) Squared euclidean norm of each data point. @@ -1356,6 +1356,8 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, the centers. """ # Perform label assignment to nearest centers + # For better efficiency, it's better to run _mini_batch_step in a + # threadpool_limit context then using _labels_inertia_threadpool_limit here labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=n_threads) @@ -1493,8 +1495,8 @@ class MiniBatchKMeans(KMeans): Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count centers are more easily reassigned, which means that the model will take longer to - converge, but should converge in a better clustering. A too high value - may however cause convergence issues, especially with a small batch + converge, but should converge in a better clustering. However, too high + a value may cause convergence issues, especially with a small batch size. Attributes @@ -1503,7 +1505,7 @@ class MiniBatchKMeans(KMeans): cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers. - labels_ : ndarray of shape (n_samples) + labels_ : ndarray of shape (n_samples,) Labels of each point (if compute_labels is set to True). inertia_ : float @@ -1869,8 +1871,11 @@ def partial_fit(self, X, y=None, sample_weight=None): Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) - Coordinates of the data points to cluster. It must be noted that - X will be copied if it is not C-contiguous. + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. y : Ignored Not used, present here for API consistency by convention. 
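As a usage illustration of the incremental API whose docstring is updated just above, a minimal sketch (it assumes a scikit-learn build containing this branch; the data and parameters are made up for the example):

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans

    rng = np.random.RandomState(0)
    X = rng.random_sample((10000, 5))

    km = MiniBatchKMeans(n_clusters=8, random_state=0)
    # Stream the data one mini-batch at a time, e.g. when it does not fit in memory.
    for batch in np.array_split(X, 100):
        km.partial_fit(batch)

    labels = km.predict(X)
    # With this branch, n_steps_ counts the partial_fit calls (100 here).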
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index a56c2d8e55d8e..248b2e1ddd498 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -999,9 +999,9 @@ def test_inertia(dtype): expected = np.sum(distances * sample_weight) inertia_dense = _inertia_dense( - X_dense, sample_weight, centers, labels, 1) + X_dense, sample_weight, centers, labels, n_threads=1) inertia_sparse = _inertia_sparse( - X_sparse, sample_weight, centers, labels, 1) + X_sparse, sample_weight, centers, labels, n_threads=1) assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6) assert_allclose(inertia_dense, expected, rtol=1e-6) From 5ff60c8024e2b2542047e725879b003702a5d7b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 7 Apr 2021 19:39:02 +0200 Subject: [PATCH 72/72] Update sklearn/cluster/_kmeans.py Co-authored-by: Julien Jerphanion --- sklearn/cluster/_kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index e9f952b58cb1b..44c2837a8802a 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1357,7 +1357,7 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, """ # Perform label assignment to nearest centers # For better efficiency, it's better to run _mini_batch_step in a - # threadpool_limit context then using _labels_inertia_threadpool_limit here + # threadpool_limit context than using _labels_inertia_threadpool_limit here labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=n_threads)
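The convergence monitoring that several of the patches above rework (batch-inertia normalization in patch 61, the EWA bookkeeping in patch 63) boils down to a small update rule. As a sketch, with invented names and stripped of the surrounding state handling:

    def update_ewa_inertia(ewa_inertia, batch_inertia, batch_size, n_samples):
        # Smoothing factor chosen so that old batches are forgotten after
        # roughly two passes over the full dataset.
        alpha = min(batch_size * 2.0 / (n_samples + 1), 1.0)
        if ewa_inertia is None:
            # First monitored mini-batch: start the average at the batch inertia.
            return batch_inertia
        return ewa_inertia * (1 - alpha) + batch_inertia * alpha

Fitting then stops early either when the squared change of the centers falls below the tolerance, or when this smoothed inertia stops improving for max_no_improvement consecutive steps.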
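Similarly, the relation between the new n_steps_ attribute and n_iter_ (patches 63 to 65) is pure bookkeeping; with made-up numbers:

    import numpy as np

    n_samples, batch_size, max_iter = 150, 30, 10

    # Upper bound on the number of mini-batch steps performed by fit
    # (see the change introduced in patch 65).
    n_steps = (max_iter * n_samples) // batch_size                    # 50
    # If no early-stopping criterion triggers, the fitted attributes are:
    n_steps_ = n_steps                                                # 50 mini-batches processed
    n_iter_ = int(np.ceil((n_steps_ * batch_size) / n_samples))       # 10 started epochs

    assert n_iter_ == max_iter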