From e19921450fe0fe0c77dcc9baf9b8e9fc8923e9e6 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 24 Feb 2020 18:15:16 +0100 Subject: [PATCH 01/72] refactoring --- sklearn/cluster/_kmeans.py | 595 ++++++++++++-------------- sklearn/cluster/tests/test_k_means.py | 6 +- 2 files changed, 287 insertions(+), 314 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7e4df5908137b..c36acf122445e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -22,7 +22,6 @@ from ..utils.extmath import row_norms, stable_cumsum from ..utils.sparsefuncs_fast import assign_rows_csr from ..utils.sparsefuncs import mean_variance_axis -from ..utils.validation import _num_samples from ..utils import check_array from ..utils import gen_batches from ..utils import check_random_state @@ -43,8 +42,8 @@ ############################################################################### # Initialization heuristic - -def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): +def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, + n_local_trials=None): """Init n_clusters seeds according to k-means++ Parameters @@ -83,8 +82,6 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): centers = np.empty((n_clusters, n_features), dtype=X.dtype) - assert x_squared_norms is not None, 'x_squared_norms None in _k_init' - # Set the number of local seeding trials if none is given if n_local_trials is None: # This is what Arthur/Vassilvitskii tried, but did not report @@ -143,30 +140,6 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): ############################################################################### # K-means batch estimation by EM (expectation maximization) -def _validate_center_shape(X, n_centers, centers): - """Check if centers is compatible with X and n_centers""" - if len(centers) != n_centers: - raise ValueError('The shape of the initial centers (%s) ' - 'does not match the number of clusters %i' - % (centers.shape, n_centers)) - if centers.shape[1] != X.shape[1]: - raise ValueError( - "The number of features of the initial centers %s " - "does not match the number of features of the data %s." - % (centers.shape[1], X.shape[1])) - - -def _tolerance(X, tol): - """Return a tolerance which is independent of the dataset""" - if tol == 0: - return 0 - if sp.issparse(X): - variances = mean_variance_axis(X, axis=0)[1] - else: - variances = np.var(X, axis=0) - return np.mean(variances) * tol - - def _check_normalize_sample_weight(sample_weight, X): """Set sample_weight if None, and check for correct dtype""" @@ -322,8 +295,8 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++', return est.cluster_centers_, est.labels_, est.inertia_ -def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, - init='k-means++', verbose=False, x_squared_norms=None, +def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, + verbose=False, x_squared_norms=None, random_state=None, tol=1e-4, n_threads=1): """A single run of k-means lloyd, assumes preparation completed prior. @@ -335,29 +308,12 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, sample_weight : array-like of shape (n_samples,) The weights for each observation in X. - n_clusters : int - The number of clusters to form as well as the number of - centroids to generate. + centers_init : ndarray of shape (n_clusters, n_features) + The initial centers. 
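For readers skimming the diff, the D^2 sampling that `_kmeans_plusplus` (renamed from `_k_init` above) implements can be summarised with a short NumPy sketch. This is only an illustrative reference, not the patched implementation: the function name is invented for the example, and it omits the `n_local_trials` greedy candidates and the precomputed `x_squared_norms` that the real helper uses.

import numpy as np

def kmeans_plusplus_sketch(X, n_clusters, rng):
    # First center: uniform draw from the data.
    n_samples, n_features = X.shape
    centers = np.empty((n_clusters, n_features), dtype=X.dtype)
    centers[0] = X[rng.randint(n_samples)]
    # Squared distance of every sample to its closest chosen center so far.
    closest_dist_sq = ((X - centers[0]) ** 2).sum(axis=1)
    for c in range(1, n_clusters):
        # Draw the next center with probability proportional to D(x)^2.
        probs = closest_dist_sq / closest_dist_sq.sum()
        candidate = rng.choice(n_samples, p=probs)
        centers[c] = X[candidate]
        closest_dist_sq = np.minimum(
            closest_dist_sq, ((X - centers[c]) ** 2).sum(axis=1))
    return centers

centers = kmeans_plusplus_sketch(np.random.RandomState(0).rand(100, 2),
                                 n_clusters=3,
                                 rng=np.random.RandomState(42))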
max_iter : int, default=300 Maximum number of iterations of the k-means algorithm to run. - init : {'k-means++', 'random', ndarray, callable}, default='k-means++' - Method for initialization: - - 'k-means++' : selects initial cluster centers for k-mean - clustering in a smart way to speed up convergence. See section - Notes in k_init for more details. - - 'random': choose `n_clusters` observations (rows) at random from data - for the initial centroids. - - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - - If a callable is passed, it should take arguments X, n_clusters and a - random state and return an initialization. - verbose : bool, default=False Verbosity mode @@ -398,17 +354,12 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, Number of iterations run. """ random_state = check_random_state(random_state) - sample_weight = _check_normalize_sample_weight(sample_weight, X) - - # init - centers = _init_centroids(X, n_clusters, init, random_state=random_state, - x_squared_norms=x_squared_norms) - - if verbose: - print('Initialization complete') n_samples = X.shape[0] + n_clusters = centers_init.shape[0] + # Buffers to avoid new allocations at each iteration. + centers = centers_init centers_new = np.zeros_like(centers) weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) labels = np.full(n_samples, -1, dtype=np.int32) @@ -444,18 +395,17 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, if verbose: inertia = _inertia(X, sample_weight, centers, labels) - print("Iteration {0}, inertia {1}" .format(i, inertia)) + print(f"Iteration {i}, inertia {inertia}") + + centers, centers_new = centers_new, centers center_shift_tot = (center_shift**2).sum() if center_shift_tot <= tol: if verbose: - print("Converged at iteration {0}: " - "center shift {1} within tolerance {2}" - .format(i, center_shift_tot, tol)) + print(f"Converged at iteration {i}: center shift " + f"{center_shift_tot} within tolerance {tol}.") break - centers, centers_new = centers_new, centers - if center_shift_tot > 0: # rerun E-step so that predicted labels match cluster centers elkan_iter(X, sample_weight, centers, centers, weight_in_clusters, @@ -468,8 +418,8 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, return labels, inertia, centers, i + 1 -def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, - init='k-means++', verbose=False, x_squared_norms=None, +def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, + verbose=False, x_squared_norms=None, random_state=None, tol=1e-4, n_threads=1): """A single run of k-means lloyd, assumes preparation completed prior. @@ -481,29 +431,12 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. - n_clusters : int - The number of clusters to form as well as the number of - centroids to generate. + centers_init : ndarray of shape (n_clusters, n_features) + The initial centers. max_iter : int, default=300 Maximum number of iterations of the k-means algorithm to run. - init : {'k-means++', 'random', ndarray, callable}, default='k-means++' - Method for initialization: - - 'k-means++' : selects initial cluster centers for k-mean - clustering in a smart way to speed up convergence. See section - Notes in k_init for more details. - - 'random': choose `n_clusters` observations (rows) at random from data - for the initial centroids. 
- - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - - If a callable is passed, it should take arguments X, n_clusters and a - random state and return an initialization. - verbose : bool, default=False Verbosity mode @@ -544,15 +477,11 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, Number of iterations run. """ random_state = check_random_state(random_state) - sample_weight = _check_normalize_sample_weight(sample_weight, X) - # init - centers = _init_centroids(X, n_clusters, init, random_state=random_state, - x_squared_norms=x_squared_norms) - - if verbose: - print("Initialization complete") + n_clusters = centers_init.shape[0] + # Buffers to avoid new allocations at each iteration. + centers = centers_init centers_new = np.zeros_like(centers) labels = np.full(X.shape[0], -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) @@ -571,18 +500,17 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, if verbose: inertia = _inertia(X, sample_weight, centers, labels) - print("Iteration {0}, inertia {1}" .format(i, inertia)) + print(f"Iteration {i}, inertia {inertia}.") + + centers, centers_new = centers_new, centers center_shift_tot = (center_shift**2).sum() if center_shift_tot <= tol: if verbose: - print("Converged at iteration {0}: " - "center shift {1} within tolerance {2}" - .format(i, center_shift_tot, tol)) + print(f"Converged at iteration {i}: center shift " + f"{center_shift_tot} within tolerance {tol}.") break - centers, centers_new = centers_new, centers - if center_shift_tot > 0: # rerun E-step so that predicted labels match cluster centers lloyd_iter(X, sample_weight, x_squared_norms, centers, centers, @@ -594,28 +522,29 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, return labels, inertia, centers, i + 1 -def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): +def _labels_inertia(X, sample_weight, x_squared_norms, centers, + n_threads=None): """E step of the K-means EM algorithm. Compute the labels and the inertia of the given samples and centers. Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples to assign to the labels. If sparse matrix, must be in - CSR format. + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples to assign to the labels. If sparse matrix, must + be in CSR format. - sample_weight : array-like of shape (n_samples,) + sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. x_squared_norms : ndarray of shape (n_samples,) Precomputed squared euclidean norm of each data point, to speed up computations. - centers : ndarray, shape (n_clusters, n_features) + centers : ndarray of shape (n_clusters, n_features) The cluster centers. - n_threads : int, default=1 + n_threads : int, default=None The number of OpenMP threads to use for the computation. Parallelism is sample-wise on the main cython loop which assigns each sample to its closest center. @@ -626,12 +555,13 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): The resulting assignment inertia : float - Sum of squared distances of samples to their closest cluster center. 
+ Sum of squared distances of samples to their closest cluster center """ n_samples = X.shape[0] n_clusters = centers.shape[0] - sample_weight = _check_normalize_sample_weight(sample_weight, X) + n_threads = _openmp_effective_n_threads(n_threads) + labels = np.full(n_samples, -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) center_shift = np.zeros_like(weight_in_clusters) @@ -652,88 +582,6 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): return labels, inertia -def _init_centroids(X, n_clusters=8, init="k-means++", random_state=None, - x_squared_norms=None, init_size=None): - """Compute the initial centroids - - Parameters - ---------- - - X : {ndarray, spare matrix} of shape (n_samples, n_features) - The input samples. - - n_clusters : int, default=8 - number of centroids. - - init : {'k-means++', 'random', ndarray, callable}, default="k-means++" - Method for initialization. - - random_state : int, RandomState instance, default=None - Determines random number generation for centroid initialization. Use - an int to make the randomness deterministic. - See :term:`Glossary `. - - x_squared_norms : ndarray of shape (n_samples,), default=None - Squared euclidean norm of each data point. Pass it if you have it at - hands already to avoid it being recomputed here. Default: None - - init_size : int, default=None - Number of samples to randomly sample for speeding up the - initialization (sometimes at the expense of accuracy): the - only algorithm is initialized by running a batch KMeans on a - random subset of the data. This needs to be larger than k. - - Returns - ------- - centers : array of shape(k, n_features) - """ - random_state = check_random_state(random_state) - n_samples = X.shape[0] - - if x_squared_norms is None: - x_squared_norms = row_norms(X, squared=True) - - if init_size is not None and init_size < n_samples: - if init_size < n_clusters: - warnings.warn( - "init_size=%d should be larger than k=%d. " - "Setting it to 3*k" % (init_size, n_clusters), - RuntimeWarning, stacklevel=2) - init_size = 3 * n_clusters - init_indices = random_state.randint(0, n_samples, init_size) - X = X[init_indices] - x_squared_norms = x_squared_norms[init_indices] - n_samples = X.shape[0] - elif n_samples < n_clusters: - raise ValueError( - "n_samples={} should be larger than n_clusters={}" - .format(n_samples, n_clusters)) - - if isinstance(init, str) and init == 'k-means++': - centers = _k_init(X, n_clusters, random_state=random_state, - x_squared_norms=x_squared_norms) - elif isinstance(init, str) and init == 'random': - seeds = random_state.permutation(n_samples)[:n_clusters] - centers = X[seeds] - elif hasattr(init, '__array__'): - # ensure that the centers have the same dtype as X - # this is a requirement of fused types of cython - centers = np.array(init, dtype=X.dtype) - elif callable(init): - centers = init(X, n_clusters, random_state=random_state) - centers = np.asarray(centers, dtype=X.dtype) - else: - raise ValueError("the init parameter for the k-means should " - "be 'k-means++' or 'random' or an ndarray, " - "'%s' (type '%s') was passed." % (init, type(init))) - - if sp.issparse(centers): - centers = centers.toarray() - - _validate_center_shape(X, n_clusters, centers) - return centers - - class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): """K-Means clustering. 
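The refactored `_labels_inertia` above dispatches to the Cython iteration helpers with `update_centers=False`. As a plain-NumPy reference of the quantity it computes (dense input only, function name invented for the example), assuming `sample_weight` has already been rescaled to sum to `n_samples` as `_check_normalize_sample_weight` does:

import numpy as np

def labels_inertia_sketch(X, sample_weight, centers):
    # Squared Euclidean distances from every sample to every center.
    d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    labels = d2.argmin(axis=1).astype(np.int32)
    # Inertia: weighted sum of squared distances to the closest center.
    inertia = (sample_weight * d2[np.arange(X.shape[0]), labels]).sum()
    return labels, inertia

# Same toy data as the rewritten tests further down in the diff.
X = np.array([[0., 0.], [0.5, 0.], [0.5, 1.], [1., 1.]])
weights = np.array([3., 1., 1., 3.])
weights *= len(weights) / weights.sum()      # -> [1.5, 0.5, 0.5, 1.5]
print(labels_inertia_sketch(X, weights, np.array([[0., 0.], [1., 1.]])))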
@@ -913,18 +761,162 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10, self.n_jobs = n_jobs self.algorithm = algorithm + def _check_params(self, X): + if self.precompute_distances != 'deprecated': + warnings.warn("'precompute_distances' was deprecated in version " + "0.23 and will be removed in 0.25. It has no " + "effect", FutureWarning) + + if self.n_jobs != 'deprecated': + warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" + " removed in 0.25.", FutureWarning) + self._n_threads = self.n_jobs + else: + self._n_threads = None + self._n_threads = _openmp_effective_n_threads(self._n_threads) + + if self.n_init <= 0: + raise ValueError(f"Invalid number of initializations. n_init=" + f"{self.n_init} must be bigger than zero.") + self._n_init = self.n_init + + if self.max_iter <= 0: + raise ValueError(f"Number of iterations should be a positive " + f"number, got {self.max_iter} instead.") + + if X.shape[0] < self.n_clusters: + raise ValueError(f"n_samples={X.shape[0]} should be >= " + f"n_clusters={self.n_clusters}.") + + if self.tol < 0: + raise ValueError(f"tol={self.tol} should be >= 0.") + self._tol = self._normalize_tolerance(X, self.tol) + + if self.algorithm not in ("auto", "full", "elkan"): + raise ValueError(f"Algorithm must be 'auto', 'full' or 'elkan', " + f"got {self.algorithm}.") + + self._algorithm = self.algorithm + if self._algorithm == "elkan" and self.n_clusters == 1: + warnings.warn("algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'full' instead.", RuntimeWarning) + self._algorithm = "full" + if self._algorithm == "auto": + self._algorithm = "full" if self.n_clusters == 1 else "elkan" + + if hasattr(self.init, '__array__'): + self._validate_center_shape(X, self.init) + if self._n_init != 1: + warnings.warn( + f"Explicit initial center position passed: performing only" + f"one init in {self.__class__.__name__} instead of " + f"n_init={self._n_init}.", RuntimeWarning, stacklevel=2) + self._n_init = 1 + + def _validate_center_shape(self, X, centers): + """Check if centers is compatible with X and n_clusters""" + if centers.shape[0] != self.n_clusters: + raise ValueError( + f"The shape of the initial centers {centers.shape} does not " + f"match the number of clusters {self.n_clusters}.") + if centers.shape[1] != X.shape[1]: + raise ValueError( + f"The shape of the initial centers {centers.shape} does not " + f"match the number of features of the data {X.shape[1]}.") + def _check_test_data(self, X): X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', accept_large_sparse=False) n_samples, n_features = X.shape expected_n_features = self.cluster_centers_.shape[1] if not n_features == expected_n_features: - raise ValueError("Incorrect number of features. " - "Got %d features, expected %d" % ( - n_features, expected_n_features)) + raise ValueError( + f"Incorrect number of features. Got {n_features} features, " + f"expected {expected_n_features}.") return X + def _normalize_tolerance(self, X, tol): + """Return a tolerance which is independent of the dataset""" + if tol == 0: + return 0 + if sp.issparse(X): + variances = mean_variance_axis(X, axis=0)[1] + else: + variances = np.var(X, axis=0) + return np.mean(variances) * tol + + def _init_centroids(self, X, x_squared_norms, init, random_state, + init_size=None): + """Compute the initial centroids + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples. 
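The `_normalize_tolerance` method above (the old module-level `_tolerance`) turns the user-facing relative `tol` into an absolute threshold on the squared center shift by scaling it with the mean per-feature variance, so convergence does not depend on the scale of the data. A tiny worked example with purely illustrative numbers:

import numpy as np

X = np.array([[0., 0.], [2., 0.], [0., 4.], [2., 4.]])
tol = 1e-4
variances = np.var(X, axis=0)              # [1., 4.]
effective_tol = np.mean(variances) * tol   # 2.5 * 1e-4 = 2.5e-4

Multiplying X by 10 multiplies both the squared center shifts and `effective_tol` by 100, so the stopping decision is unchanged.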
+ + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. Pass it if you have it + at hands already to avoid it being recomputed here. + + init : {'k-means++', 'random', ndarray, callable} + Method for initialization. + + random_state : RandomState instance + Determines random number generation for centroid initialization. + See :term:`Glossary `. + + init_size : int, default=None + Number of samples to randomly sample for speeding up the + initialization (sometimes at the expense of accuracy): the only + algorithm is initialized by running a batch KMeans on a random + subset of the data. This needs to be larger than k. + TODO: Reword because does not mean anything + + Returns + ------- + centers : ndarray of shape(n_clusters, n_features) + """ + n_samples = X.shape[0] + n_clusters = self.n_clusters + + if init_size is not None and init_size < n_samples: + if init_size < n_clusters: + warnings.warn( + f"init_size={init_size} should be larger than " + f"n_clusters={n_clusters}. Setting it to 3*n_clusters", + RuntimeWarning, stacklevel=2) + init_size = 3 * n_clusters + init_indices = random_state.randint(0, n_samples, init_size) + X = X[init_indices] + x_squared_norms = x_squared_norms[init_indices] + n_samples = X.shape[0] + + if isinstance(init, str) and init == 'k-means++': + centers = _kmeans_plusplus(X, n_clusters, + random_state=random_state, + x_squared_norms=x_squared_norms) + elif isinstance(init, str) and init == 'random': + seeds = random_state.permutation(n_samples)[:n_clusters] + centers = X[seeds] + elif hasattr(init, '__array__'): + centers = init + elif callable(init): + centers = init(X, n_clusters, random_state=random_state) + centers = check_array( + centers, dtype=X.dtype, copy=False, order='C') + self._validate_center_shape(X, centers) + else: + raise ValueError( + f"the init parameter for {self.__class__.__name__} should be " + f"'k-means++', 'random', a ndarray or a callable. '{init}'" + f" (type '{type(self.init)}') was passed.") + + if sp.issparse(centers): + centers = centers.toarray() + + return centers + def fit(self, X, y=None, sample_weight=None): """Compute k-means clustering. @@ -949,96 +941,57 @@ def fit(self, X, y=None, sample_weight=None): self Fitted estimator. """ - random_state = check_random_state(self.random_state) - - if self.precompute_distances != 'deprecated': - warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 0.25. It has no " - "effect", FutureWarning) - - if self.n_jobs != 'deprecated': - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 0.25.", FutureWarning) - self._n_threads = self.n_jobs - else: - self._n_threads = None - self._n_threads = _openmp_effective_n_threads(self._n_threads) - - n_init = self.n_init - if n_init <= 0: - raise ValueError("Invalid number of initializations." - " n_init=%d must be bigger than zero." 
% n_init) - - if self.max_iter <= 0: - raise ValueError( - 'Number of iterations should be a positive number,' - ' got %d instead' % self.max_iter - ) - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', copy=self.copy_x, accept_large_sparse=False) - # verify that the number of samples given is larger than k - if _num_samples(X) < self.n_clusters: - raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( - _num_samples(X), self.n_clusters)) - tol = _tolerance(X, self.tol) + sample_weight = _check_normalize_sample_weight(sample_weight, X) + + random_state = check_random_state(self.random_state) # Validate init array init = self.init if hasattr(init, '__array__'): - init = check_array(init, dtype=X.dtype.type, copy=True, order='C') - _validate_center_shape(X, self.n_clusters, init) + init = check_array(init, dtype=X.dtype, copy=True, order='C') - if n_init != 1: - warnings.warn( - 'Explicit initial center position passed: ' - 'performing only one init in k-means instead of n_init=%d' - % n_init, RuntimeWarning, stacklevel=2) - n_init = 1 + self._check_params(X) - # subtract of mean of x for more accurate distance computations + # subtract mean of X for more accurate distance computations if not sp.issparse(X): X_mean = X.mean(axis=0) # The copy was already done above X -= X_mean - if hasattr(init, '__array__'): + if hasattr(self.init, '__array__'): init -= X_mean # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) - best_labels, best_inertia, best_centers = None, None, None - - algorithm = self.algorithm - if algorithm == "elkan" and self.n_clusters == 1: - warnings.warn("algorithm='elkan' doesn't make sense for a single " - "cluster. Using 'full' instead.", RuntimeWarning) - algorithm = "full" - - if algorithm == "auto": - algorithm = "full" if self.n_clusters == 1 else "elkan" - - if algorithm == "full": + if self._algorithm == "full": kmeans_single = _kmeans_single_lloyd - elif algorithm == "elkan": - kmeans_single = _kmeans_single_elkan else: - raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got" - " {}".format(str(algorithm))) + kmeans_single = _kmeans_single_elkan # seeds for the initializations of the kmeans runs. - seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + seeds = random_state.randint(np.iinfo(np.int32).max, size=self._n_init) + + best_labels, best_inertia, best_centers = None, None, None # limit number of threads in second level of nested parallelism # (i.e. BLAS) to avoid oversubsciption. with threadpool_limits(limits=1, user_api="blas"): for seed in seeds: + # Initialize centers + centers_init = self._init_centroids( + X, x_squared_norms=x_squared_norms, init=init, + random_state=random_state) + if self.verbose: + print("Initialization complete") + # run a k-means once labels, inertia, centers, n_iter_ = kmeans_single( - X, sample_weight, self.n_clusters, max_iter=self.max_iter, - init=init, verbose=self.verbose, tol=tol, + X, sample_weight, centers_init, max_iter=self.max_iter, + verbose=self.verbose, tol=self._tol, x_squared_norms=x_squared_norms, random_state=seed, n_threads=self._n_threads) # determine if these results are the best so far @@ -1114,10 +1067,6 @@ def fit_transform(self, X, y=None, sample_weight=None): X_new : array of shape (n_samples, n_clusters) X transformed in the new space. """ - # Currently, this just skips a copy of the data if it is not in - # np.array or CSR format already. 
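As a public-API illustration of the `n_init` loop just above (each seed gives one full run and only the lowest-inertia solution is kept), the following hypothetical snippet compares one init against ten inits on the same data; with a fixed `random_state` the ten-init inertia can never be larger.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(loc=c, scale=0.5, size=(30, 2)) for c in (0, 4, 8)])
km_1 = KMeans(n_clusters=3, n_init=1, random_state=0).fit(X)
km_10 = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
print(km_1.inertia_, km_10.inertia_)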
- # XXX This skips _check_test_data, which may change the dtype; - # we should refactor the input validation. return self.fit(X, sample_weight=sample_weight)._transform(X) def transform(self, X): @@ -1171,6 +1120,7 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) + sample_weight = _check_normalize_sample_weight(sample_weight, X) return _labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_, self._n_threads)[0] @@ -1199,9 +1149,10 @@ def score(self, X, y=None, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) + sample_weight = _check_normalize_sample_weight(sample_weight, X) return -_labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_)[1] + self.cluster_centers_, self._n_threads)[1] def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, @@ -1569,6 +1520,31 @@ def __init__(self, n_clusters=8, init='k-means++', max_iter=100, self.init_size = init_size self.reassignment_ratio = reassignment_ratio + def _check_params(self, X): + super()._check_params(X) + + if self.max_no_improvement is not None and self.max_no_improvement < 0: + raise ValueError( + f"max_no_improvement should be >= 0, got " + f"{self.max_no_improvement} instead.") + + if self.batch_size <= 0: + raise ValueError( + f"batch_size should be > 0, got {self.batch_size} instead.") + + if self.init_size is not None and self.init_size <= 0: + raise ValueError( + f"init_size should be > 0, got {self.init_size} instead.") + self._init_size = self.init_size + if self._init_size is None: + self._init_size = 3 * self.batch_size + self._init_size = min(self._init_size, X.shape[0]) + + if self.reassignment_ratio < 0: + raise ValueError( + f"reassignment_ratio should be >= 0, got " + f"{self.reassignment_ratio} instead.") + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. @@ -1590,38 +1566,31 @@ def fit(self, X, y=None, sample_weight=None): ------- self """ - random_state = check_random_state(self.random_state) - X = check_array(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + # TODO accept_large_sparse ??? 
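Since the following hunks rework `MiniBatchKMeans`, it may help to recall what `_mini_batch_step` fundamentally does for dense input. The sketch below is only a plain-NumPy approximation with an invented name; it ignores sparse input, sample-weight normalisation details, and the random reassignment of low-count centers. Each center remains the running weighted mean of every sample ever assigned to it, tracked through per-center weight counts.

import numpy as np

def minibatch_update_sketch(X_batch, w_batch, centers, counts):
    # Assign the batch to the current centers.
    d2 = ((X_batch[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    labels = d2.argmin(axis=1)
    for j in range(centers.shape[0]):
        mask = labels == j
        if not np.any(mask):
            continue
        # Turn the stored mean back into a weighted sum, add the new samples,
        # then renormalise by the updated total weight of the center.
        centers[j] *= counts[j]
        centers[j] += (w_batch[mask, None] * X_batch[mask]).sum(axis=0)
        counts[j] += w_batch[mask].sum()
        centers[j] /= counts[j]
    return labels

centers = np.array([[0., 0.], [1., 1.]])
counts = np.zeros(2)
batch = np.array([[0., 0.], [0.2, 0.], [1., 1.], [0.8, 1.]])
minibatch_update_sketch(batch, np.ones(4), centers, counts)
print(centers)   # [[0.1, 0.], [0.9, 1.]]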
+ X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], + order='C') n_samples, n_features = X.shape - if n_samples < self.n_clusters: - raise ValueError("n_samples=%d should be >= n_clusters=%d" - % (n_samples, self.n_clusters)) sample_weight = _check_normalize_sample_weight(sample_weight, X) - n_init = self.n_init - if hasattr(self.init, '__array__'): - self.init = np.ascontiguousarray(self.init, dtype=X.dtype) - if n_init != 1: - warnings.warn( - 'Explicit initial center position passed: ' - 'performing only one init in MiniBatchKMeans instead of ' - 'n_init=%d' - % self.n_init, RuntimeWarning, stacklevel=2) - n_init = 1 + random_state = check_random_state(self.random_state) - x_squared_norms = row_norms(X, squared=True) + # Validate init array + init = self.init + if hasattr(init, '__array__'): + init = check_array(init, dtype=X.dtype, copy=True, order='C') + + self._check_params(X) - if self.tol > 0.0: - tol = _tolerance(X, self.tol) + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + if self._tol > 0.0: # using tol-based early stopping needs the allocation of a # dedicated before which can be expensive for high dim data: # hence we allocate it outside of the main loop old_center_buffer = np.zeros(n_features, dtype=X.dtype) else: - tol = 0.0 # no need for the center buffer if tol-based early stopping is # disabled old_center_buffer = np.zeros(0, dtype=X.dtype) @@ -1630,24 +1599,18 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - init_size = self.init_size - if init_size is None: - init_size = 3 * self.batch_size - if init_size > n_samples: - init_size = n_samples - self.init_size_ = init_size - - validation_indices = random_state.randint(0, n_samples, init_size) + validation_indices = random_state.randint(0, n_samples, + self._init_size) X_valid = X[validation_indices] sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] # perform several inits with random sub-sets best_inertia = None - for init_idx in range(n_init): + for init_idx in range(self._n_init): if self.verbose: print("Init %d/%d with method: %s" - % (init_idx + 1, n_init, self.init)) + % (init_idx + 1, self._n_init, self.init)) weight_sums = np.zeros(self.n_clusters, dtype=sample_weight.dtype) # TODO: once the `k_means` function works with sparse input we @@ -1655,11 +1618,9 @@ def fit(self, X, y=None, sample_weight=None): # Initialize the centers using only a fraction of the data as we # expect n_samples to be very large when using MiniBatchKMeans - cluster_centers = _init_centroids( - X, self.n_clusters, self.init, - random_state=random_state, - x_squared_norms=x_squared_norms, - init_size=init_size) + cluster_centers = self._init_centroids( + X, x_squared_norms=x_squared_norms, init=self.init, + random_state=random_state, init_size=self._init_size) # Compute the label assignment on the init dataset _mini_batch_step( @@ -1675,7 +1636,7 @@ def fit(self, X, y=None, sample_weight=None): cluster_centers) if self.verbose: print("Inertia for init %d/%d: %f" - % (init_idx + 1, n_init, inertia)) + % (init_idx + 1, self._n_init, inertia)) if best_inertia is None or inertia < best_inertia: self.cluster_centers_ = cluster_centers self.counts_ = weight_sums @@ -1696,7 +1657,7 @@ def fit(self, X, y=None, sample_weight=None): X[minibatch_indices], sample_weight[minibatch_indices], 
x_squared_norms[minibatch_indices], self.cluster_centers_, self.counts_, - old_center_buffer, tol > 0.0, distances=distances, + old_center_buffer, self._tol > 0.0, distances=distances, # Here we randomly choose whether to perform # random reassignment: the choice is done as a function # of the iteration index, and the minimum number of @@ -1710,7 +1671,7 @@ def fit(self, X, y=None, sample_weight=None): # Monitor convergence and do early stopping if necessary if _mini_batch_convergence( - self, iteration_idx, n_iter, tol, n_samples, + self, iteration_idx, n_iter, self._tol, n_samples, centers_squared_diff, batch_inertia, convergence_context, verbose=self.verbose): break @@ -1719,11 +1680,13 @@ def fit(self, X, y=None, sample_weight=None): if self.compute_labels: self.labels_, self.inertia_ = \ - self._labels_inertia_minibatch(X, sample_weight) + self._labels_inertia_minibatch( + X, sample_weight, x_squared_norms, self.cluster_centers_) return self - def _labels_inertia_minibatch(self, X, sample_weight): + def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, + centers): """Compute labels and inertia using mini batches. This is slightly slower than doing everything at once but preventes @@ -1731,15 +1694,22 @@ def _labels_inertia_minibatch(self, X, sample_weight): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Input data. - sample_weight : array-like, shape (n_samples,) + sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. + x_squared_norms : ndarray of shape (n_samples,) + Precomputed squared euclidean norm of each data point, to speed up + computations. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers. + Returns ------- - labels : array, shape (n_samples,) + labels : ndarray, shape (n_samples,) Cluster labels for each point. 
inertia : float @@ -1747,11 +1717,9 @@ def _labels_inertia_minibatch(self, X, sample_weight): """ if self.verbose: print('Computing label assignment and total inertia') - sample_weight = _check_normalize_sample_weight(sample_weight, X) - x_squared_norms = row_norms(X, squared=True) slices = gen_batches(X.shape[0], self.batch_size) results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - self.cluster_centers_) for s in slices] + centers) for s in slices] labels, inertia = zip(*results) return np.hstack(labels), np.sum(inertia) @@ -1788,16 +1756,17 @@ def partial_fit(self, X, y=None, sample_weight=None): sample_weight = _check_normalize_sample_weight(sample_weight, X) x_squared_norms = row_norms(X, squared=True) - self.random_state_ = getattr(self, "random_state_", + self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) if (not hasattr(self, 'counts_') or not hasattr(self, 'cluster_centers_')): # this is the first call partial_fit on this object: # initialize the cluster centers - self.cluster_centers_ = _init_centroids( - X, self.n_clusters, self.init, - random_state=self.random_state_, - x_squared_norms=x_squared_norms, init_size=self.init_size) + self.cluster_centers_ = self._init_centroids( + X, x_squared_norms=x_squared_norms, init=self.init, + random_state=self._random_state, init_size=self.init_size) + # TODO: should be self._init_size + # Should check params before self.counts_ = np.zeros(self.n_clusters, dtype=sample_weight.dtype) @@ -1807,7 +1776,7 @@ def partial_fit(self, X, y=None, sample_weight=None): # The lower the minimum count is, the more we do random # reassignment, however, we don't want to do random # reassignment too often, to allow for building up counts - random_reassign = self.random_state_.randint( + random_reassign = self._random_state.randint( 10 * (1 + self.counts_.min())) == 0 distances = np.zeros(X.shape[0], dtype=X.dtype) @@ -1822,7 +1791,7 @@ def partial_fit(self, X, y=None, sample_weight=None): self.cluster_centers_, self.counts_, np.zeros(0, dtype=X.dtype), 0, random_reassign=random_reassign, distances=distances, - random_state=self.random_state_, + random_state=self._random_state, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) @@ -1856,4 +1825,8 @@ def predict(self, X, sample_weight=None): check_is_fitted(self) X = self._check_test_data(X) - return self._labels_inertia_minibatch(X, sample_weight)[0] + x_squared_norms = row_norms(X, squared=True) + sample_weight = _check_normalize_sample_weight(sample_weight, X) + + return self._labels_inertia_minibatch( + X, sample_weight, x_squared_norms, self.cluster_centers_)[0] diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 2bcbc3faa517f..bf23d669da654 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -209,7 +209,7 @@ def test_labels_assignment_and_inertia(): assert (mindist >= 0.0).all() assert (labels_gold != -1).all() - sample_weight = None + sample_weight = np.ones(X.shape[0], dtype=X.dtype) # perform label assignment using the dense array input x_squared_norms = (X ** 2).sum(axis=1) @@ -599,7 +599,7 @@ def test_minibatch_default_init_size(): mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, batch_size=10, random_state=42, n_init=1).fit(X) - assert mb_k_means.init_size_ == 3 * mb_k_means.batch_size + assert mb_k_means._init_size == 3 * mb_k_means.batch_size _check_fitted_model(mb_k_means) @@ -614,7 +614,7 @@ def 
test_minibatch_set_init_size(): init_size=666, random_state=42, n_init=1).fit(X) assert mb_k_means.init_size == 666 - assert mb_k_means.init_size_ == n_samples + assert mb_k_means._init_size == n_samples _check_fitted_model(mb_k_means) From 7f85bcaab20e8883256cca998c6b1afb544333c9 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 27 Feb 2020 10:25:29 +0100 Subject: [PATCH 02/72] wip --- sklearn/cluster/_kmeans.py | 93 +- sklearn/cluster/tests/test_k_means.py | 1470 +++++++++--------------- sklearn/cluster/tests/test_k_means2.py | 190 +++ 3 files changed, 787 insertions(+), 966 deletions(-) create mode 100644 sklearn/cluster/tests/test_k_means2.py diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index c36acf122445e..ad9e7eab1ea2c 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -776,25 +776,25 @@ def _check_params(self, X): self._n_threads = _openmp_effective_n_threads(self._n_threads) if self.n_init <= 0: - raise ValueError(f"Invalid number of initializations. n_init=" - f"{self.n_init} must be bigger than zero.") + raise ValueError( + f"n_init should be > 0, got {self.n_init} instead.") self._n_init = self.n_init if self.max_iter <= 0: - raise ValueError(f"Number of iterations should be a positive " - f"number, got {self.max_iter} instead.") + raise ValueError( + f"max_iter should be > 0, got {self.max_iter} instead.") if X.shape[0] < self.n_clusters: raise ValueError(f"n_samples={X.shape[0]} should be >= " f"n_clusters={self.n_clusters}.") if self.tol < 0: - raise ValueError(f"tol={self.tol} should be >= 0.") + raise ValueError(f"tol should be >= 0, got {self.tol} instead.") self._tol = self._normalize_tolerance(X, self.tol) if self.algorithm not in ("auto", "full", "elkan"): raise ValueError(f"Algorithm must be 'auto', 'full' or 'elkan', " - f"got {self.algorithm}.") + f"got {self.algorithm} instead.") self._algorithm = self.algorithm if self._algorithm == "elkan" and self.n_clusters == 1: @@ -804,12 +804,19 @@ def _check_params(self, X): if self._algorithm == "auto": self._algorithm = "full" if self.n_clusters == 1 else "elkan" + if not (hasattr(self.init, '__array__') or callable(self.init) + or (isinstance(self.init, str) + and self.init in ["k-means++", "random"])): + raise ValueError( + f"init should be either 'k-means++', 'random', a ndarray or a " + f"callable, got '{self.init}' instead.") + if hasattr(self.init, '__array__'): self._validate_center_shape(X, self.init) if self._n_init != 1: warnings.warn( f"Explicit initial center position passed: performing only" - f"one init in {self.__class__.__name__} instead of " + f" one init in {self.__class__.__name__} instead of " f"n_init={self._n_init}.", RuntimeWarning, stacklevel=2) self._n_init = 1 @@ -881,12 +888,6 @@ def _init_centroids(self, X, x_squared_norms, init, random_state, n_clusters = self.n_clusters if init_size is not None and init_size < n_samples: - if init_size < n_clusters: - warnings.warn( - f"init_size={init_size} should be larger than " - f"n_clusters={n_clusters}. 
Setting it to 3*n_clusters", - RuntimeWarning, stacklevel=2) - init_size = 3 * n_clusters init_indices = random_state.randint(0, n_samples, init_size) X = X[init_indices] x_squared_norms = x_squared_norms[init_indices] @@ -906,11 +907,6 @@ def _init_centroids(self, X, x_squared_norms, init, random_state, centers = check_array( centers, dtype=X.dtype, copy=False, order='C') self._validate_center_shape(X, centers) - else: - raise ValueError( - f"the init parameter for {self.__class__.__name__} should be " - f"'k-means++', 'random', a ndarray or a callable. '{init}'" - f" (type '{type(self.init)}') was passed.") if sp.issparse(centers): centers = centers.toarray() @@ -1377,20 +1373,22 @@ class MiniBatchKMeans(KMeans): The number of clusters to form as well as the number of centroids to generate. - init : {'k-means++', 'random'} or ndarray of shape \ - (n_clusters, n_features), default='k-means++' - Method for initialization + init : {'k-means++', 'random', ndarray, callable}, default='k-means++' + Method for initialization: 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section Notes in k_init for more details. - 'random': choose k observations (rows) at random from data for - the initial centroids. + 'random': choose `n_clusters` observations (rows) at random from data + for the initial centroids. If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. + If a callable is passed, it should take arguments X, n_clusters and a + random state and return an initialization. + max_iter : int, default=100 Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. @@ -1454,7 +1452,7 @@ class MiniBatchKMeans(KMeans): cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers - labels_ : int + labels_ : ndarray of shape (n_samples) Labels of each point (if compute_labels is set to True). inertia_ : float @@ -1538,6 +1536,14 @@ def _check_params(self, X): self._init_size = self.init_size if self._init_size is None: self._init_size = 3 * self.batch_size + if self._init_size < self.n_clusters: + self._init_size = 3 * self.n_clusters + elif self._init_size < self.n_clusters: + warnings.warn( + f"init_size={self._init_size} should be larger than " + f"n_clusters={self.n_clusters}. Setting it to 3*n_clusters", + RuntimeWarning, stacklevel=2) + self._init_size = 3 * self.n_clusters self._init_size = min(self._init_size, X.shape[0]) if self.reassignment_ratio < 0: @@ -1550,7 +1556,7 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. @@ -1558,9 +1564,9 @@ def fit(self, X, y=None, sample_weight=None): y : Ignored Not used, present here for API consistency by convention. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. 
Returns ------- @@ -1609,8 +1615,8 @@ def fit(self, X, y=None, sample_weight=None): best_inertia = None for init_idx in range(self._n_init): if self.verbose: - print("Init %d/%d with method: %s" - % (init_idx + 1, self._n_init, self.init)) + print(f"Init {init_idx + 1}/{self._n_init} with method {init}") + weight_sums = np.zeros(self.n_clusters, dtype=sample_weight.dtype) # TODO: once the `k_means` function works with sparse input we @@ -1619,15 +1625,14 @@ def fit(self, X, y=None, sample_weight=None): # Initialize the centers using only a fraction of the data as we # expect n_samples to be very large when using MiniBatchKMeans cluster_centers = self._init_centroids( - X, x_squared_norms=x_squared_norms, init=self.init, + X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) # Compute the label assignment on the init dataset _mini_batch_step( - X_valid, sample_weight_valid, - x_squared_norms[validation_indices], cluster_centers, - weight_sums, old_center_buffer, False, distances=None, - verbose=self.verbose) + X_valid, sample_weight_valid, x_squared_norms_valid, + cluster_centers, weight_sums, old_center_buffer, False, + distances=None, verbose=self.verbose) # Keep only the best cluster centers across independent inits on # the common validation set @@ -1635,8 +1640,8 @@ def fit(self, X, y=None, sample_weight=None): x_squared_norms_valid, cluster_centers) if self.verbose: - print("Inertia for init %d/%d: %f" - % (init_idx + 1, self._n_init, inertia)) + print(f"Inertia for init {init_idx + 1}/{self._n_init}: " + f"{inertia}") if best_inertia is None or inertia < best_inertia: self.cluster_centers_ = cluster_centers self.counts_ = weight_sums @@ -1709,7 +1714,7 @@ def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, Returns ------- - labels : ndarray, shape (n_samples,) + labels : ndarray of shape (n_samples,) Cluster labels for each point. inertia : float @@ -1735,9 +1740,9 @@ def partial_fit(self, X, y=None, sample_weight=None): y : Ignored Not used, present here for API consistency by convention. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. Returns ------- @@ -1784,8 +1789,8 @@ def partial_fit(self, X, y=None, sample_weight=None): # of features. if X.shape[1] != self.cluster_centers_.shape[1]: raise ValueError( - "Number of features %d does not match previous " - "data %d." % (X.shape[1], self.cluster_centers_.shape[1])) + f"Number of features {X.shape[1]} does not match previous " + f"data {self.cluster_centers_.shape[1]}.") _mini_batch_step(X, sample_weight, x_squared_norms, self.cluster_centers_, self.counts_, @@ -1813,13 +1818,13 @@ def predict(self, X, sample_weight=None): X : {array-like, sparse matrix} of shape (n_samples, n_features) New data to predict. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. Returns ------- - labels : array, shape [n_samples,] + labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. 
""" check_is_fitted(self) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index bf23d669da654..fd48c7b73842c 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -8,23 +8,21 @@ import pytest from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_raise_message from sklearn.utils.validation import _num_samples from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning from sklearn.utils.extmath import row_norms +from sklearn.metrics import pairwise_distances from sklearn.metrics import pairwise_distances_argmin from sklearn.metrics.cluster import v_measure_score from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _labels_inertia from sklearn.cluster._kmeans import _mini_batch_step +from sklearn.cluster._kmeans import _check_normalize_sample_weight from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense from sklearn.cluster._k_means_fast import _relocate_empty_clusters_sparse from sklearn.cluster._k_means_fast import _euclidean_dense_dense_wrapper @@ -33,7 +31,6 @@ from sklearn.cluster._k_means_fast import _inertia_sparse from sklearn.datasets import make_blobs from io import StringIO -from sklearn.metrics.cluster import homogeneity_score # non centered, sparse centers to check the @@ -49,12 +46,27 @@ X_csr = sp.csr_matrix(X) -@pytest.mark.parametrize("representation", ["dense", "sparse"]) +def _check_fitted_model(km): + # check that the number of clusters centers and distinct labels match + # the expectation + centers = km.cluster_centers_ + assert centers.shape == (n_clusters, n_features) + + labels = km.labels_ + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + assert km.inertia_ > 0.0 + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_kmeans_results(representation, algo, dtype): - # cheks that kmeans works as intended - array_constr = {'dense': np.array, 'sparse': sp.csr_matrix}[representation] +def test_kmeans_results(array_constr, algo, dtype): + # Checks that KMeans works as intended on toy dataset by comparing with + # expected results computed by hand. 
X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) @@ -68,954 +80,445 @@ def test_kmeans_results(representation, algo, dtype): kmeans.fit(X, sample_weight=sample_weight) assert_array_equal(kmeans.labels_, expected_labels) - assert_almost_equal(kmeans.inertia_, expected_inertia) - assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - -@pytest.mark.parametrize("array_constr", - [np.array, sp.csr_matrix], - ids=['dense', 'sparse']) -@pytest.mark.parametrize("algo", ['full', 'elkan']) -def test_relocated_clusters(array_constr, algo): - # check that empty clusters are relocated as expected - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) - - # second center too far from others points will be empty at first iter - init_centers = np.array([[0.5, 0.5], [3, 3]]) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.25 - expected_centers = [[0.25, 0], [0.75, 1]] - expected_n_iter = 3 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_almost_equal(kmeans.inertia_, expected_inertia) - assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) assert kmeans.n_iter_ == expected_n_iter -@pytest.mark.parametrize("representation", ["dense", "sparse"]) -def test_relocate_empty_clusters(representation): - # test for the _relocate_empty_clusters_(dense/sparse) helpers - - # Synthetic dataset with 3 obvious clusters of different sizes - X = np.array( - [-10., -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) - if representation == "sparse": - X = sp.csr_matrix(X) - sample_weight = np.full(shape=10, fill_value=1.) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_k_means_1_iteration(array_constr, algo): + # check the results after a single iteration (E-step M-step E-step) by + # comparing against a pure python implementation. + X = np.random.RandomState(0).uniform(size=(100, 5)) + init_centers = X[:5] + X = array_constr(X) - # centers all initialized to the first point of X - centers_old = np.array([-10., -10, -10]).reshape(-1, 1) + def py_kmeans(X, init): + new_centers = init.copy() + labels = pairwise_distances_argmin(X, init) + for label in range(init.shape[0]): + new_centers[label] = X[labels == label].mean(axis=0) + labels = pairwise_distances_argmin(X, new_centers) + return labels, new_centers - # With this initialization, all points will be assigned to the first center - # At this point a center in centers_new is the weighted sum of the points - # it contains if it's not empty, otherwise it is the same as before. 
- centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) - weight_in_clusters = np.array([10., 0, 0]) - labels = np.zeros(10, dtype=np.int32) + py_labels, py_centers = py_kmeans(X, init_centers) - if representation == "dense": - _relocate_empty_clusters_dense(X, sample_weight, centers_old, - centers_new, weight_in_clusters, labels) - else: - _relocate_empty_clusters_sparse(X.data, X.indices, X.indptr, - sample_weight, centers_old, - centers_new, weight_in_clusters, - labels) + cy_kmeans = KMeans(n_clusters=5, n_init=1, init=init_centers, + algorithm=algo, max_iter=1).fit(X) + cy_labels = cy_kmeans.labels_ + cy_centers = cy_kmeans.cluster_centers_ - # The relocation scheme will take the 2 points farthest from the center and - # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The - # first center will be updated to contain the other 8 points. - assert_array_equal(weight_in_clusters, [8, 1, 1]) - assert_allclose(centers_new, [[-36], [10], [9.5]]) + assert_array_equal(py_labels, cy_labels) + assert_allclose(py_centers, cy_centers) -@pytest.mark.parametrize('distribution', ['normal', 'blobs']) -@pytest.mark.parametrize('tol', [1e-2, 1e-4, 1e-8]) -def test_elkan_results(distribution, tol): - # check that results are identical between lloyd and elkan algorithms +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) +def test_elkan_results(distribution, array_constr, tol): + # Check that results are identical between lloyd and elkan algorithms rnd = np.random.RandomState(0) - if distribution == 'normal': + if distribution == "normal": X = rnd.normal(size=(5000, 10)) else: X, _ = make_blobs(random_state=rnd) + X[X < 0] = 0 + X = array_constr(X) - km_full = KMeans(algorithm='full', n_clusters=5, + km_full = KMeans(algorithm="full", n_clusters=5, random_state=0, n_init=1, tol=tol) - km_elkan = KMeans(algorithm='elkan', n_clusters=5, + km_elkan = KMeans(algorithm="elkan", n_clusters=5, random_state=0, n_init=1, tol=tol) km_full.fit(X) km_elkan.fit(X) assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) assert_array_equal(km_elkan.labels_, km_full.labels_) - assert km_elkan.n_iter_ == km_full.n_iter_ assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) -@pytest.mark.parametrize('algorithm', ['full', 'elkan']) +@pytest.mark.parametrize("algorithm", ["full", "elkan"]) def test_kmeans_convergence(algorithm): # Check that KMeans stops when convergence is reached when tol=0. (#16075) + # We can only ensure that if the number of threads is not to large, + # otherwise the roundings errors coming from the unpredictability of + # the order in which chunks are processed make the convergence criterion + # to never be exactly 0. 
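The comment above about rounding errors comes down to floating point addition not being associative: when the reduction order depends on thread scheduling, the accumulated center shift can differ slightly from run to run and never reach exactly zero. A one-line reminder:

print((0.1 + 0.2) + 0.3 == 0.1 + (0.2 + 0.3))   # False: 0.6000000000000001 vs 0.6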
rnd = np.random.RandomState(0) X = rnd.normal(size=(5000, 10)) - km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, n_init=1, - tol=0, max_iter=300).fit(X) + with threadpool_limits(limits=1, user_api="openmp"): + km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, + n_init=1, tol=0, max_iter=300).fit(X) assert km.n_iter_ < 300 -@pytest.mark.parametrize('distribution', ['normal', 'blobs']) -def test_elkan_results_sparse(distribution): - # check that results are identical between lloyd and elkan algorithms - # with sparse input - rnd = np.random.RandomState(0) - if distribution == 'normal': - X = sp.random(100, 100, density=0.1, format='csr', random_state=rnd) - X.data = rnd.randn(len(X.data)) - else: - X, _ = make_blobs(n_samples=100, n_features=100, random_state=rnd) - X = sp.csr_matrix(X) - - km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1) - km_elkan = KMeans(algorithm='elkan', n_clusters=5, - random_state=0, n_init=1) - - km_full.fit(X) - km_elkan.fit(X) - assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) - assert_allclose(km_elkan.labels_, km_full.labels_) - - -def test_labels_assignment_and_inertia(): - # pure numpy implementation as easily auditable reference gold - # implementation - rng = np.random.RandomState(42) - noisy_centers = centers + rng.normal(size=centers.shape) - labels_gold = np.full(n_samples, -1, dtype=np.int) - mindist = np.empty(n_samples) - mindist.fill(np.infty) - for center_id in range(n_clusters): - dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1) - labels_gold[dist < mindist] = center_id - mindist = np.minimum(dist, mindist) - inertia_gold = mindist.sum() - assert (mindist >= 0.0).all() - assert (labels_gold != -1).all() - - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - - # perform label assignment using the dense array input - x_squared_norms = (X ** 2).sum(axis=1) - labels_array, inertia_array = _labels_inertia( - X, sample_weight, x_squared_norms, noisy_centers) - assert_array_almost_equal(inertia_array, inertia_gold) - assert_array_equal(labels_array, labels_gold) - - # perform label assignment using the sparse CSR input - x_squared_norms_from_csr = row_norms(X_csr, squared=True) - labels_csr, inertia_csr = _labels_inertia( - X_csr, sample_weight, x_squared_norms_from_csr, noisy_centers) - assert_array_almost_equal(inertia_csr, inertia_gold) - assert_array_equal(labels_csr, labels_gold) - - -def test_minibatch_update_consistency(): - # Check that dense and sparse minibatch update give the same results - rng = np.random.RandomState(42) - old_centers = centers + rng.normal(size=centers.shape) - - new_centers = old_centers.copy() - new_centers_csr = old_centers.copy() - - weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) - weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double) - - x_squared_norms = (X ** 2).sum(axis=1) - x_squared_norms_csr = row_norms(X_csr, squared=True) - - buffer = np.zeros(centers.shape[1], dtype=np.double) - buffer_csr = np.zeros(centers.shape[1], dtype=np.double) - - # extract a small minibatch - X_mb = X[:10] - X_mb_csr = X_csr[:10] - x_mb_squared_norms = x_squared_norms[:10] - x_mb_squared_norms_csr = x_squared_norms_csr[:10] - - sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) - - # step 1: compute the dense minibatch update - old_inertia, incremental_diff = _mini_batch_step( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, - buffer, 1, None, random_reassign=False) - assert old_inertia > 0.0 - - 
# compute the new inertia on the same batch to check that it decreased - labels, new_inertia = _labels_inertia( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) - assert new_inertia > 0.0 - assert new_inertia < old_inertia - - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers - old_centers) ** 2) - assert_almost_equal(incremental_diff, effective_diff) - - # step 2: compute the sparse minibatch update - old_inertia_csr, incremental_diff_csr = _mini_batch_step( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, - weight_sums_csr, buffer_csr, 1, None, random_reassign=False) - assert old_inertia_csr > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels_csr, new_inertia_csr = _labels_inertia( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) - assert new_inertia_csr > 0.0 - assert new_inertia_csr < old_inertia_csr - - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers_csr - old_centers) ** 2) - assert_almost_equal(incremental_diff_csr, effective_diff) - - # step 3: check that sparse and dense updates lead to the same results - assert_array_equal(labels, labels_csr) - assert_array_almost_equal(new_centers, new_centers_csr) - assert_almost_equal(incremental_diff, incremental_diff_csr) - assert_almost_equal(old_inertia, old_inertia_csr) - assert_almost_equal(new_inertia, new_inertia_csr) - - -def _check_fitted_model(km): - # check that the number of clusters centers and distinct labels match - # the expectation - centers = km.cluster_centers_ - assert centers.shape == (n_clusters, n_features) - - labels = km.labels_ - assert np.unique(labels).shape[0] == n_clusters - - # check that the labels assignment are perfect (up to a permutation) - assert v_measure_score(true_labels, labels) == 1.0 - assert km.inertia_ > 0.0 - - # check error on dataset being too small - assert_raise_message(ValueError, "n_samples=1 should be >= n_clusters=%d" - % km.n_clusters, km.fit, [[0., 1.]]) - - -def test_k_means_new_centers(): - # Explore the part of the code where a new center is reassigned - X = np.array([[0, 0, 1, 1], - [0, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 0], - [0, 1, 0, 0]]) - labels = [0, 1, 2, 1, 1, 2] - bad_centers = np.array([[+0, 1, 0, 0], - [.2, 0, .2, .2], - [+0, 0, 0, 0]]) - - km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10, - random_state=1) - for this_X in (X, sp.coo_matrix(X)): - km.fit(this_X) - this_labels = km.labels_ - # Reorder the labels so that the first instance is in cluster 0, - # the second in cluster 1, ... - this_labels = np.unique(this_labels, return_index=True)[1][this_labels] - np.testing.assert_array_equal(this_labels, labels) - - -@pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse']) -@pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()]) -def test_k_means_init(data, init): - km = KMeans(init=init, n_clusters=n_clusters, random_state=42, n_init=1) - km.fit(data) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +@pytest.mark.parametrize("init", ["random", "k-means++", centers, + lambda X, k, random_state: centers], + ids=["random", "k-means++", "ndarray", "callable"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_all_init(estimator, data, init): + # Check KMeans and MiniBatchKMeans with all possible init. 
+ km = estimator(init=init, n_clusters=n_clusters, random_state=42, + n_init=10).fit(data) _check_fitted_model(km) -def test_k_means_n_init(): - rnd = np.random.RandomState(0) - X = rnd.normal(size=(40, 2)) - - # two regression tests on bad n_init argument - # previous bug: n_init <= 0 threw non-informative TypeError (#3858) - with pytest.raises(ValueError, match="n_init"): - KMeans(n_init=0).fit(X) - with pytest.raises(ValueError, match="n_init"): - KMeans(n_init=-1).fit(X) - - -@pytest.mark.parametrize('Class', [KMeans, MiniBatchKMeans]) -def test_k_means_explicit_init_shape(Class): - # test for sensible errors when giving explicit init - # with wrong number of features or clusters +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_result_of_kmeans_equal_in_diff_n_threads(estimator): + # Check that KMeans gives the same results in parallel mode than in + # sequential mode. rnd = np.random.RandomState(0) - X = rnd.normal(size=(40, 3)) + X = rnd.normal(size=(50, 10)) - # mismatch of number of features - km = Class(n_init=1, init=X[:, :2], n_clusters=len(X)) - msg = "does not match the number of features of the data" - with pytest.raises(ValueError, match=msg): - km.fit(X) - # for callable init - km = Class(n_init=1, - init=lambda X_, k, random_state: X_[:, :2], - n_clusters=len(X)) - with pytest.raises(ValueError, match=msg): - km.fit(X) - # mismatch of number of clusters - msg = "does not match the number of clusters" - km = Class(n_init=1, init=X[:2, :], n_clusters=3) - with pytest.raises(ValueError, match=msg): - km.fit(X) - # for callable init - km = Class(n_init=1, - init=lambda X_, k, random_state: X_[:2, :], - n_clusters=3) - with pytest.raises(ValueError, match=msg): - km.fit(X) + with threadpool_limits(limits=1, user_api="openmp"): + result_1 = estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ + with threadpool_limits(limits=2, user_api="openmp"): + result_2 = estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ + assert_array_equal(result_1, result_2) -def test_k_means_fortran_aligned_data(): - # Check the KMeans will work well, even if X is a fortran-aligned data. - X = np.asfortranarray([[0, 0], [0, 1], [0, 1]]) - centers = np.array([[0, 0], [0, 1]]) - labels = np.array([0, 1, 1]) - km = KMeans(n_init=1, init=centers, random_state=42, n_clusters=2) - km.fit(X) - assert_array_almost_equal(km.cluster_centers_, centers) - assert_array_equal(km.labels_, labels) - - -@pytest.mark.parametrize('algo', ['full', 'elkan']) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) -@pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix]) -@pytest.mark.parametrize('seed, max_iter, tol', [ - (0, 2, 1e-7), # strict non-convergence - (1, 2, 1e-1), # loose non-convergence - (3, 300, 1e-7), # strict convergence - (4, 300, 1e-1), # loose convergence -]) -def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol): - # check that fit.predict gives same result as fit_predict - # There's a very small chance of failure with elkan on unstructured dataset - # because predict method uses fast euclidean distances computation which - # may cause small numerical instabilities. - # NB: This test is largely redundant with respect to test_predict and - # test_predict_equal_labels. This test has the added effect of - # testing idempotence of the fittng procesdure which appears to - # be where it fails on some MacOS setups. 
- if sys.platform == "darwin": - pytest.xfail( - "Known failures on MacOS, See " - "https://github.com/scikit-learn/scikit-learn/issues/12644") +def test_check_normalize_sample_weight(): + # Check the check sample weight helper. sample weights should sum to + # n_samples + sample_weight = None + checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) + assert _num_samples(X) == _num_samples(checked_sample_weight) + assert_almost_equal(checked_sample_weight.sum(), _num_samples(X)) + assert X.dtype == checked_sample_weight.dtype - rng = np.random.RandomState(seed) - X = make_blobs(n_samples=1000, n_features=10, centers=10, - random_state=rng)[0].astype(dtype, copy=False) - X = constructor(X) +def _sort_centers(centers): + return np.sort(centers, axis=0) - kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed, - tol=tol, max_iter=max_iter) - labels_1 = kmeans.fit(X).predict(X) - labels_2 = kmeans.fit_predict(X) +@pytest.mark.parametrize("init", ["k-means++", centers], + ids=["k-means++", "ndarray"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_weighted_vs_repeated(estimator, init): + # Check that a sample weight of N should yield the same result as an N-fold + # repetition of the sample + sample_weight = np.random.RandomState(0).randint(1, 5, size=n_samples) + X_repeat = np.repeat(X, sample_weight, axis=0) - # Due to randomness in the order in which chunks of data are processed when - # using more than one thread, the absolute values of the labels can be - # different between the 2 strategies but they should correspond to the same - # clustering. - assert v_measure_score(labels_1, labels_2) == 1 + km = estimator(init=init, n_clusters=n_clusters, random_state=0) + if estimator is MiniBatchKMeans: + km.set_params(batch_size=10) + km_weighted = clone(km).fit(X, sample_weight=sample_weight) + repeated_labels = np.repeat(km_weighted.labels_, sample_weight) + km_repeated = clone(km).fit(X_repeat) -def test_mb_kmeans_verbose(): - mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters, - random_state=42, verbose=1) - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - mb_k_means.fit(X) - finally: - sys.stdout = old_stdout + # We can't expect labels to be equal because k-means++ will lead to + # a different initialization on duplicated X. 
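+    # v_measure_score is invariant to label permutations, so it can compare
+    # the two clusterings even when the raw label values differ.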
+ assert_allclose(v_measure_score(km_repeated.labels_, repeated_labels), 1) + # TODO: FIXME + if estimator is not MiniBatchKMeans: + assert_allclose(_sort_centers(km_weighted.cluster_centers_), + _sort_centers(km_repeated.cluster_centers_)) -def test_minibatch_init_with_large_k(): - mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20) - # Check that a warning is raised, as the number clusters is larger - # than the init_size - assert_warns(RuntimeWarning, mb_k_means.fit, X) - - -def test_minibatch_k_means_init_multiple_runs_with_explicit_centers(): - mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, - random_state=42, n_init=10) - assert_warns(RuntimeWarning, mb_k_means.fit, X) - - -@pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse']) -@pytest.mark.parametrize('init', ["random", 'k-means++', centers.copy()]) -def test_minibatch_k_means_init(data, init): - mb_k_means = MiniBatchKMeans(init=init, n_clusters=n_clusters, - random_state=42, n_init=10) - mb_k_means.fit(data) - _check_fitted_model(mb_k_means) - - -def test_minibatch_sensible_reassign_fit(): - # check if identical initial clusters are reassigned - # also a regression test for when there are more desired reassignments than - # samples. - zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, - cluster_std=1., random_state=42) - zeroed_X[::2, :] = 0 - mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, - init="random") - mb_k_means.fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - # do the same with batch-size > X.shape[0] (regression test) - mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201, - random_state=42, init="random") - mb_k_means.fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - -def test_minibatch_sensible_reassign_partial_fit(): - zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5, - cluster_std=1., random_state=42) - zeroed_X[::2, :] = 0 - mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") - for i in range(100): - mb_k_means.partial_fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - -def test_minibatch_reassign(): - # Give a perfect initialization, but a large reassignment_ratio, - # as a result all the centers should be reassigned and the model - # should no longer be good - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - random_state=42) - mb_k_means.fit(this_X) - - score_before = mb_k_means.score(this_X) - try: - old_stdout = sys.stdout - sys.stdout = StringIO() - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means.counts_, - np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, - reassignment_ratio=1, verbose=True) - finally: - sys.stdout = old_stdout - assert score_before > mb_k_means.score(this_X) - - # Give a perfect initialization, with a small reassignment_ratio, - # no center should be reassigned - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - init=centers.copy(), - random_state=42, n_init=1) - 
mb_k_means.fit(this_X) - clusters_before = mb_k_means.cluster_centers_ - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means.counts_, - np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, - reassignment_ratio=1e-15) - assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) - - -def test_minibatch_with_many_reassignments(): - # Test for the case that the number of clusters to reassign is bigger - # than the batch_size - n_samples = 550 - rnd = np.random.RandomState(42) - X = rnd.uniform(size=(n_samples, 10)) - # Check that the fit works if n_clusters is bigger than the batch_size. - # Run the test with 550 clusters and 550 samples, because it turned out - # that this values ensure that the number of clusters to reassign - # is always bigger than the batch_size - n_clusters = 550 - MiniBatchKMeans(n_clusters=n_clusters, - batch_size=100, - init_size=n_samples, - random_state=42).fit(X) - - -def test_sparse_mb_k_means_callable_init(): - - def test_init(X, k, random_state): - return centers - - # Small test to check that giving the wrong number of centers - # raises a meaningful error - msg = "does not match the number of clusters" - with pytest.raises(ValueError, match=msg): - MiniBatchKMeans(init=test_init, random_state=42).fit(X_csr) - - # Now check that the fit actually works - mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init, - random_state=42).fit(X_csr) - _check_fitted_model(mb_k_means) - - -def test_mini_batch_k_means_random_init_partial_fit(): - km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42) - # use the partial_fit API for online learning - for X_minibatch in np.array_split(X, 10): - km.partial_fit(X_minibatch) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_unit_weights_vs_no_weights(estimator): + # Check that not passing sample weights should be equivalent to passing + # sample weights all equal to one. 
+ sample_weight = np.ones(n_samples) - # compute the labeling on the complete dataset - labels = km.predict(X) - assert v_measure_score(true_labels, labels) == 1.0 + km = estimator(n_clusters=n_clusters, random_state=42) + km_none = clone(km).fit(X, sample_weight=None) + km_ones = clone(km).fit(X, sample_weight=sample_weight) + assert_array_equal(km_none.labels_, km_ones.labels_) + assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) -def test_minibatch_default_init_size(): - mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, - batch_size=10, random_state=42, - n_init=1).fit(X) - assert mb_k_means._init_size == 3 * mb_k_means.batch_size - _check_fitted_model(mb_k_means) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(estimator): + # Check that scaling all sample weights by a common factor + # shouldn't change the result + sample_weight = np.random.uniform(n_samples) -def test_minibatch_tol(): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10, - random_state=42, tol=.01).fit(X) - _check_fitted_model(mb_k_means) + km = estimator(n_clusters=n_clusters, random_state=42) + km_orig = clone(km).fit(X, sample_weight=sample_weight) + km_scaled = clone(km).fit(X, sample_weight=0.5 * sample_weight) + assert_array_equal(km_orig.labels_, km_scaled.labels_) + assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) -def test_minibatch_set_init_size(): - mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, - init_size=666, random_state=42, - n_init=1).fit(X) - assert mb_k_means.init_size == 666 - assert mb_k_means._init_size == n_samples - _check_fitted_model(mb_k_means) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_fortran_aligned_data(estimator): + # Check that KMeans works with fortran-aligned data. + X_fortran = np.asfortranarray(X) + centers_fortran = np.asfortranarray(centers) -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_k_means_invalid_init(Estimator): - km = Estimator(init="invalid", n_init=1, n_clusters=n_clusters) - with pytest.raises(ValueError): - km.fit(X) + km_c = estimator(n_clusters=n_clusters, init=centers, n_init=1, + random_state=42).fit(X) + km_f = estimator(n_clusters=n_clusters, init=centers_fortran, n_init=1, + random_state=42).fit(X_fortran) + assert_allclose(km_c.cluster_centers_, km_f.cluster_centers_) + assert_array_equal(km_c.labels_, km_f.labels_) def test_k_means_copyx(): - # Check if copy_x=False returns nearly equal X after de-centering. + # Check that copy_x=False returns nearly equal X after de-centering. my_X = X.copy() km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) km.fit(my_X) _check_fitted_model(km) - # check if my_X is centered - assert_array_almost_equal(my_X, X) + # check that my_X is de-centered + assert_allclose(my_X, X) -def test_k_means_non_collapsed(): - # Check k_means with a bad initialization does not yield a singleton - # Starting with bad centers that are quickly ignored should not - # result in a repositioning of the centers to the center of mass that - # would lead to collapsed centers which in turns make the clustering - # dependent of the numerical unstabilities. 
-    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
-    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
-    km = KMeans(init=array_init, n_clusters=3, random_state=42, n_init=1)
-    km.fit(my_X)
-
-    # centers must not been collapsed
-    assert len(np.unique(km.labels_)) == 3
-
-    centers = km.cluster_centers_
-    assert np.linalg.norm(centers[0] - centers[1]) >= 0.1
-    assert np.linalg.norm(centers[0] - centers[2]) >= 0.1
-    assert np.linalg.norm(centers[1] - centers[2]) >= 0.1
-
-
-@pytest.mark.parametrize('algo', ['full', 'elkan'])
-def test_score(algo):
-    # Check that fitting k-means with multiple inits gives better score
-    km1 = KMeans(n_clusters=n_clusters, max_iter=1, random_state=42, n_init=1,
-                 algorithm=algo)
-    s1 = km1.fit(X).score(X)
-    km2 = KMeans(n_clusters=n_clusters, max_iter=10, random_state=42, n_init=1,
-                 algorithm=algo)
-    s2 = km2.fit(X).score(X)
-    assert s2 > s1
+@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_centers_not_mutated(estimator, dtype):
+    # Check that KMeans and MiniBatchKMeans won't mutate the user provided
+    # init centers silently even if input data and init centers have the same
+    # type.
+    X_new_type = X.astype(dtype, copy=True)
+    centers_new_type = centers.astype(dtype, copy=True)
+
+    km = estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1)
+    km.fit(X_new_type)
 
-@pytest.mark.parametrize('Estimator', [KMeans, MiniBatchKMeans])
-@pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse'])
-@pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()])
-def test_predict(Estimator, data, init):
-    k_means = Estimator(n_clusters=n_clusters, init=init,
-                        n_init=10, random_state=0).fit(data)
+    assert not np.may_share_memory(km.cluster_centers_, centers)
 
-    # sanity check: re-predict labeling for training set samples
-    assert_array_equal(k_means.predict(data), k_means.labels_)
-    # sanity check: predict centroid labels
-    pred = k_means.predict(k_means.cluster_centers_)
-    assert_array_equal(pred, np.arange(n_clusters))
-
-    # re-predict labels for training set using fit_predict
-    pred = k_means.fit_predict(data)
-    assert_array_equal(pred, k_means.labels_)
 
+@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"])
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_float_precision(estimator, data):
+    km = estimator(n_init=1, random_state=0)
+    inertia = {}
+    Xt = {}
+    centers = {}
+    labels = {}
 
-@pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()])
-def test_predict_minibatch_dense_sparse(init):
-    # check that models trained on sparse input also works for dense input at
-    # predict time
-    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init=init,
-                                 n_init=10, random_state=0).fit(X_csr)
-
-    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)
-
-
-def test_int_input():
-    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
-    for dtype in [np.int32, np.int64]:
-        X_int = np.array(X_list, dtype=dtype)
-        X_int_csr = sp.csr_matrix(X_int)
-        init_int = X_int[:2]
-
-        fitted_models = [
-            KMeans(n_clusters=2).fit(X_int),
-            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
-            # mini batch kmeans is very unstable on such a small dataset hence
-            # we use many inits
-            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int),
-            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(
-                X_int_csr),
-            MiniBatchKMeans(n_clusters=2,
batch_size=2, - init=init_int, n_init=1).fit(X_int), - MiniBatchKMeans(n_clusters=2, batch_size=2, - init=init_int, n_init=1).fit(X_int_csr), - ] - - for km in fitted_models: - assert km.cluster_centers_.dtype == np.float64 - - expected_labels = [0, 1, 1, 0, 0, 1] - scores = np.array([v_measure_score(expected_labels, km.labels_) - for km in fitted_models]) - assert_array_almost_equal(scores, np.ones(scores.shape[0])) - - -def test_transform(): - km = KMeans(n_clusters=n_clusters) - km.fit(X) - X_new = km.transform(km.cluster_centers_) - - for c in range(n_clusters): - assert X_new[c, c] == 0 - for c2 in range(n_clusters): - if c != c2: - assert X_new[c, c2] > 0 - - -def test_fit_transform(): - X1 = KMeans(n_clusters=3, random_state=51).fit(X).transform(X) - X2 = KMeans(n_clusters=3, random_state=51).fit_transform(X) - assert_array_almost_equal(X1, X2) - - -@pytest.mark.parametrize('algo', ['full', 'elkan']) -def test_predict_equal_labels(algo): - km = KMeans(random_state=13, n_init=1, max_iter=1, - algorithm=algo) - km.fit(X) - assert_array_equal(km.predict(X), km.labels_) - + for dtype in [np.float64, np.float32]: + X = data.astype(dtype) + km.fit(X) -def test_full_vs_elkan(): - km1 = KMeans(algorithm='full', random_state=13).fit(X) - km2 = KMeans(algorithm='elkan', random_state=13).fit(X) + inertia[dtype] = km.inertia_ + Xt[dtype] = km.transform(X) + centers[dtype] = km.cluster_centers_ + labels[dtype] = km.labels_ - assert homogeneity_score(km1.predict(X), km2.predict(X)) == 1.0 + # dtype of cluster centers has to be the dtype of the input data + assert km.cluster_centers_.dtype == dtype + # same with partial_fit + if estimator is MiniBatchKMeans: + km.partial_fit(X[0:3]) + assert km.cluster_centers_.dtype == dtype -def test_n_init(): - # Check that increasing the number of init increases the quality - n_runs = 5 - n_init_range = [1, 5, 10] - inertia = np.zeros((len(n_init_range), n_runs)) - for i, n_init in enumerate(n_init_range): - for j in range(n_runs): - km = KMeans(n_clusters=n_clusters, init="random", n_init=n_init, - random_state=j).fit(X) - inertia[i, j] = km.inertia_ + # compare arrays with low precision since the difference between + # 32 and 64 bit sometimes makes a difference up to the 4th decimal + # place + assert_allclose(inertia[np.float32], inertia[np.float64], rtol=1e-5) + assert_allclose(Xt[np.float32], Xt[np.float64], rtol=1e-5) + assert_allclose(centers[np.float32], centers[np.float64], rtol=1e-5) + assert_array_equal(labels[np.float32], labels[np.float64]) - inertia = inertia.mean(axis=1) - failure_msg = ("Inertia %r should be decreasing" - " when n_init is increasing.") % list(inertia) - for i in range(len(n_init_range) - 1): - assert inertia[i] >= inertia[i + 1], failure_msg +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_score_multiple_inits(estimator): + # Check that fitting KMeans or MiniBatchKMeans with multiple inits gives + # better score + X = np.random.RandomState(0).randn(100, 10) -def test_k_means_function(): - # test calling the k_means function directly - # catch output - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, - sample_weight=None, - verbose=True) - finally: - sys.stdout = old_stdout - centers = cluster_centers - assert centers.shape == (n_clusters, n_features) + km1 = estimator(max_iter=10, random_state=42, n_init=1) + s1 = km1.fit(X).score(X) + km2 = estimator(max_iter=10, random_state=42, n_init=10) + s2 = km2.fit(X).score(X) + 
assert s2 > s1 - labels = labels - assert np.unique(labels).shape[0] == n_clusters - # check that the labels assignment are perfect (up to a permutation) - assert v_measure_score(true_labels, labels) == 1.0 - assert inertia > 0.0 +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_score_max_iter(estimator): + # Check that fitting KMeans or MiniBatchKMeans with more iterations gives + # better score + X = np.random.RandomState(0).randn(100, 10) - # check warning when centers are passed - assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters, - sample_weight=None, init=centers) + km1 = estimator(n_init=1, random_state=42, max_iter=1) + s1 = km1.fit(X).score(X) + km2 = estimator(n_init=1, random_state=42, max_iter=10) + s2 = km2.fit(X).score(X) + assert s2 > s1 - # to many clusters desired - with pytest.raises(ValueError): - k_means(X, n_clusters=X.shape[0] + 1, sample_weight=None) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("init", ["k-means++", "ndarray"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_integer_input(estimator, array_constr, dtype, init): + # Check that KMeans and MiniBatchKMeans work with integer input. + X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]) + X = array_constr(X_dense, dtype=dtype) -def test_x_squared_norms_init_centroids(): - # Test that x_squared_norms can be None in _init_centroids - from sklearn.cluster._kmeans import _init_centroids + n_init = 1 if init == "ndarray" else 10 + init = X_dense[:2] if init == "ndarray" else init - X_norms = np.sum(X**2, axis=1) - precompute = _init_centroids( - X, 3, "k-means++", random_state=0, x_squared_norms=X_norms) - assert_array_almost_equal( - precompute, - _init_centroids(X, 3, "k-means++", random_state=0)) + km = estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) + if estimator is MiniBatchKMeans: + km.set_params(batch_size=2) + km.fit(X) -def test_max_iter_error(): - km = KMeans(max_iter=-1) - assert_raise_message(ValueError, 'Number of iterations should be', - km.fit, X) + # Internally integer input should be converted to float64 + assert km.cluster_centers_.dtype == np.float64 + expected_labels = [0, 1, 1, 0, 0, 1] + assert_allclose(v_measure_score(km.labels_, expected_labels), 1) -@pytest.mark.parametrize('Estimator', [KMeans, MiniBatchKMeans]) -@pytest.mark.parametrize('is_sparse', [False, True]) -def test_float_precision(Estimator, is_sparse): + # Same with partial_fit (#14314) + if estimator is MiniBatchKMeans: + km = clone(km).partial_fit(X) + assert km.cluster_centers_.dtype == np.float64 - estimator = Estimator(n_init=1, random_state=30) - inertia = {} - X_new = {} - centers = {} +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("init", ["random", "k-means++", "ndarray"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_predict(estimator, init, dtype, array_constr): + # Check the predict method and the equivalence between fit.predict and + # fit_predict. 
+ if sys.platform == "darwin": + pytest.xfail( + "Known failures on MacOS, See " + "https://github.com/scikit-learn/scikit-learn/issues/12644") - for dtype in [np.float64, np.float32]: - if is_sparse: - X_test = sp.csr_matrix(X_csr, dtype=dtype) - else: - X_test = X.astype(dtype) - estimator.fit(X_test) - # dtype of cluster centers has to be the dtype of the input - # data - assert estimator.cluster_centers_.dtype == dtype - inertia[dtype] = estimator.inertia_ - X_new[dtype] = estimator.transform(X_test) - centers[dtype] = estimator.cluster_centers_ - # ensure the extracted row is a 2d array - assert estimator.predict(X_test[:1]) == estimator.labels_[0] - if hasattr(estimator, 'partial_fit'): - estimator.partial_fit(X_test[0:3]) - # dtype of cluster centers has to stay the same after - # partial_fit - assert estimator.cluster_centers_.dtype == dtype + X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0) - # compare arrays with low precision since the difference between - # 32 and 64 bit sometimes makes a difference up to the 4th decimal - # place - assert_array_almost_equal(inertia[np.float32], inertia[np.float64], - decimal=4) - assert_array_almost_equal(X_new[np.float32], X_new[np.float64], - decimal=4) - assert_array_almost_equal(centers[np.float32], centers[np.float64], - decimal=4) - - -def test_k_means_init_centers(): - # This test is used to check KMeans won't mutate the user provided input - # array silently even if input data and init centers have the same type - X_small = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]]) - init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]]) - for dtype in [np.int32, np.int64, np.float32, np.float64]: - X_test = dtype(X_small) - init_centers_test = dtype(init_centers) - assert_array_equal(init_centers, init_centers_test) - km = KMeans(init=init_centers_test, n_clusters=3, n_init=1) - km.fit(X_test) - assert np.may_share_memory(km.cluster_centers_, - init_centers) is False + n_init = 1 if init == "ndarray" else 10 + init = X[:10] if init == "ndarray" else init + X = array_constr(X) + km = estimator(n_clusters=10, init=init, n_init=n_init, + random_state=0).fit(X) + labels = km.labels_ -@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -def test_k_means_init_fitted_centers(data): - # Get a local optimum - centers = KMeans(n_clusters=3).fit(X).cluster_centers_ + # Due to randomness in the order in which chunks of data are processed when + # using more than one thread, there might be different rounding errors for + # the computation of the inertia for each init between 2 runs. 
This might
+    # result in a different ranking of the inits, hence a different labeling,
+    # which should still correspond to the same clustering
 
-    # Fit starting from a local optimum shouldn't change the solution
-    new_centers = KMeans(n_clusters=3, init=centers,
-                         n_init=1).fit(X).cluster_centers_
-    assert_array_almost_equal(centers, new_centers)
+    # re-predict labels for training set using predict
+    pred = km.predict(X)
+    assert_allclose(v_measure_score(pred, labels), 1)
 
+    # re-predict labels for training set using fit_predict
+    pred = km.fit_predict(X)
+    assert_allclose(v_measure_score(pred, labels), 1)
 
-def test_sparse_validate_centers():
-    from sklearn.datasets import load_iris
+    # predict centroid labels
+    pred = km.predict(km.cluster_centers_)
+    assert_allclose(v_measure_score(pred, np.arange(10)), 1)
 
-    iris = load_iris()
-    X = iris.data
-    # Get a local optimum
-    centers = KMeans(n_clusters=4).fit(X).cluster_centers_
 
+@pytest.mark.parametrize("init", ["random", "k-means++", centers],
+                         ids=["random", "k-means++", "ndarray"])
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_predict_dense_sparse(estimator, init):
+    # check that models trained on sparse input also work for dense input at
+    # predict time and vice versa.
+    km = estimator(n_clusters=n_clusters, init=init, n_init=10, random_state=0)
 
-    # Test that a ValueError is raised for validate_center_shape
-    classifier = KMeans(n_clusters=3, init=centers, n_init=1)
+    km.fit(X_csr)
+    assert_array_equal(km.predict(X), km.labels_)
 
-    msg = r"The shape of the initial centers \(\(4L?, 4L?\)\) " \
-          "does not match the number of clusters 3"
-    with pytest.raises(ValueError, match=msg):
-        classifier.fit(X)
+    km.fit(X)
+    assert_array_equal(km.predict(X_csr), km.labels_)
 
 
-def test_less_centers_than_unique_points():
-    X = np.asarray([[0, 0],
-                    [0, 1],
-                    [1, 0],
-                    [1, 0]])  # last point is duplicated
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_transform(estimator):
+    # Check the transform method
+    km = estimator(n_clusters=n_clusters).fit(X)
 
-    km = KMeans(n_clusters=4).fit(X)
+    # Transforming cluster_centers_ should return the pairwise distances
+    # between centers
+    Xt = km.transform(km.cluster_centers_)
+    assert_allclose(Xt, pairwise_distances(km.cluster_centers_))
+    # In particular, diagonal must be 0
+    assert_array_equal(Xt.diagonal(), np.zeros(n_clusters))
 
-    # only three distinct points, so only three clusters
-    # can have points assigned to them
-    assert set(km.labels_) == set(range(3))
+    # Transforming X should return the pairwise distances between X and the
+    # centers
+    Xt = km.transform(X)
+    assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_))
 
-    # k_means should warn that fewer labels than cluster
-    # centers have been used
-    msg = ("Number of distinct clusters (3) found smaller than "
-           "n_clusters (4). 
Possibly due to duplicate points in X.") - assert_warns_message(ConvergenceWarning, msg, k_means, X, - sample_weight=None, n_clusters=4) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_fit_transform(estimator): + # Check equivalence between fit.transform and fit_transform + X1 = estimator(n_clusters=n_clusters, random_state=0).fit(X).transform(X) + X2 = estimator(n_clusters=n_clusters, random_state=0).fit_transform(X) + assert_allclose(X1, X2) -def _sort_centers(centers): - return np.sort(centers, axis=0) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_k_means_init_fitted_centers(data): + # Check that starting fitting from a local optimum shouldn't change the + # solution + km1 = KMeans(n_clusters=n_clusters).fit(data) + km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, + n_init=1).fit(data) -def test_weighted_vs_repeated(): - # a sample weight of N should yield the same result as an N-fold - # repetition of the sample - rng = np.random.RandomState(0) - sample_weight = rng.randint(1, 5, size=n_samples) - X_repeat = np.repeat(X, sample_weight, axis=0) - estimators = [KMeans(init="k-means++", n_clusters=n_clusters, - random_state=42), - KMeans(init="random", n_clusters=n_clusters, - random_state=42), - KMeans(init=centers.copy(), n_clusters=n_clusters, - random_state=42), - MiniBatchKMeans(n_clusters=n_clusters, batch_size=10, - random_state=42)] - for estimator in estimators: - est_weighted = clone(estimator).fit(X, sample_weight=sample_weight) - est_repeated = clone(estimator).fit(X_repeat) - repeated_labels = np.repeat(est_weighted.labels_, sample_weight) - assert_almost_equal(v_measure_score(est_repeated.labels_, - repeated_labels), 1.0) - if not isinstance(estimator, MiniBatchKMeans): - assert_almost_equal(_sort_centers(est_weighted.cluster_centers_), - _sort_centers(est_repeated.cluster_centers_)) - - -def test_unit_weights_vs_no_weights(): - # not passing any sample weights should be equivalent - # to all weights equal to one - sample_weight = np.ones(n_samples) - for estimator in [KMeans(n_clusters=n_clusters, random_state=42), - MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]: - est_1 = clone(estimator).fit(X) - est_2 = clone(estimator).fit(X, sample_weight=sample_weight) - assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0) - assert_almost_equal(_sort_centers(est_1.cluster_centers_), - _sort_centers(est_2.cluster_centers_)) + assert_allclose(km1.cluster_centers_, km2.cluster_centers_) -def test_scaled_weights(): - # scaling all sample weights by a common factor - # shouldn't change the result - sample_weight = np.ones(n_samples) - for estimator in [KMeans(n_clusters=n_clusters, random_state=42), - MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]: - est_1 = clone(estimator).fit(X) - est_2 = clone(estimator).fit(X, sample_weight=0.5*sample_weight) - assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0) - assert_almost_equal(_sort_centers(est_1.cluster_centers_), - _sort_centers(est_2.cluster_centers_)) +def test_kmeans_elkan_iter_attribute(): + # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off + # it's right value (#11340). 
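+    # With max_iter=1 the estimator must therefore report exactly one
+    # iteration.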
+ km = KMeans(algorithm="elkan", max_iter=1).fit(X) + assert km.n_iter_ == 1 -def test_sample_weight_length(): - # check that an error is raised when passing sample weights - # with an incompatible shape - km = KMeans(n_clusters=n_clusters, random_state=42) - msg = r'sample_weight.shape == \(2,\), expected \(100,\)' - with pytest.raises(ValueError, match=msg): - km.fit(X, sample_weight=np.ones(2)) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_kmeans_relocated_clusters(array_constr, algo): + # check that empty clusters are relocated as expected + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) -def test_check_normalize_sample_weight(): - from sklearn.cluster._kmeans import _check_normalize_sample_weight - sample_weight = None - checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) - assert _num_samples(X) == _num_samples(checked_sample_weight) - assert_almost_equal(checked_sample_weight.sum(), _num_samples(X)) - assert X.dtype == checked_sample_weight.dtype + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.25 + expected_centers = [[0.25, 0], [0.75, 1]] + expected_n_iter = 3 + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) -def test_iter_attribute(): - # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off - # it's right value (#11340). - estimator = KMeans(algorithm="elkan", max_iter=1) - estimator.fit(np.random.rand(10, 10)) - assert estimator.n_iter_ == 1 + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter -def test_k_means_empty_cluster_relocated(): +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +def test_k_means_empty_cluster_relocated(array_constr): # check that empty clusters are correctly relocated when using sample # weights (#13486) - X = np.array([[-1], [1]]) + X = array_constr([[-1], [1]]) sample_weight = [1.9, 0.1] init = np.array([[-1], [10]]) @@ -1026,109 +529,48 @@ def test_k_means_empty_cluster_relocated(): assert_allclose(km.cluster_centers_, [[-1], [1]]) -def test_minibatch_kmeans_partial_fit_int_data(): - # Issue GH #14314 - X = np.array([[-1], [1]], dtype=np.int) - km = MiniBatchKMeans(n_clusters=2) - km.partial_fit(X) - assert km.cluster_centers_.dtype.kind == "f" - - -def test_result_of_kmeans_equal_in_diff_n_threads(): - # Check that KMeans gives the same results in parallel mode than in - # sequential mode. 
- rnd = np.random.RandomState(0) - X = rnd.normal(size=(50, 10)) - - with threadpool_limits(limits=1, user_api="openmp"): - result_1 = KMeans( - n_clusters=3, random_state=0).fit(X).labels_ - with threadpool_limits(limits=2, user_api="openmp"): - result_2 = KMeans( - n_clusters=3, random_state=0).fit(X).labels_ - assert_array_equal(result_1, result_2) - - -@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) -def test_precompute_distance_deprecated(precompute_distances): - # FIXME: remove in 0.25 - depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " - "will be removed in 0.25.") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - precompute_distances=precompute_distances) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - -@pytest.mark.parametrize("n_jobs", [None, 1]) -def test_n_jobs_deprecated(n_jobs): - # FIXME: remove in 0.25 - depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 0.25.") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - n_jobs=n_jobs) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - -def test_warning_elkan_1_cluster(): - X, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=0) - kmeans = KMeans(n_clusters=1, n_init=1, init='random', random_state=0, - algorithm='elkan') - - with pytest.warns(RuntimeWarning, - match="algorithm='elkan' doesn't make sense for a single" - " cluster"): - kmeans.fit(X) - - -def test_error_wrong_algorithm(): - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - algorithm='wrong') - - with pytest.raises(ValueError, - match="Algorithm must be 'auto', 'full' or 'elkan'"): - kmeans.fit(X) - +@pytest.mark.parametrize("representation", ["dense", "sparse"]) +def test_relocate_empty_clusters(representation): + # test for the _relocate_empty_clusters_(dense/sparse) helpers -@pytest.mark.parametrize("array_constr", - [np.array, sp.csr_matrix], - ids=['dense', 'sparse']) -@pytest.mark.parametrize("algo", ['full', 'elkan']) -def test_k_means_1_iteration(array_constr, algo): - # check the results after a single iteration (E-step M-step E-step) by - # comparing against a pure python implementation. - X = np.random.RandomState(0).uniform(size=(100, 5)) - init_centers = X[:5] - X = array_constr(X) + # Synthetic dataset with 3 obvious clusters of different sizes + X = np.array( + [-10., -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) + if representation == "sparse": + X = sp.csr_matrix(X) + sample_weight = np.ones(10) - def py_kmeans(X, init): - new_centers = init.copy() - labels = pairwise_distances_argmin(X, init) - for label in range(init.shape[0]): - new_centers[label] = X[labels == label].mean(axis=0) - labels = pairwise_distances_argmin(X, new_centers) - return labels, new_centers + # centers all initialized to the first point of X + centers_old = np.array([-10., -10, -10]).reshape(-1, 1) - py_labels, py_centers = py_kmeans(X, init_centers) + # With this initialization, all points will be assigned to the first center + # At this point a center in centers_new is the weighted sum of the points + # it contains if it's not empty, otherwise it is the same as before. 
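+    # The weighted sum of the 10 points is -16.5, hence the first entry of
+    # centers_new below; the two other centers are empty and keep their old
+    # position of -10.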
+    centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1)
+    weight_in_clusters = np.array([10., 0, 0])
+    labels = np.zeros(10, dtype=np.int32)
-
+    if representation == "dense":
+        _relocate_empty_clusters_dense(X, sample_weight, centers_old,
+                                       centers_new, weight_in_clusters, labels)
+    else:
+        _relocate_empty_clusters_sparse(X.data, X.indices, X.indptr,
+                                        sample_weight, centers_old,
+                                        centers_new, weight_in_clusters,
+                                        labels)
-
+    # The relocation scheme will take the 2 points farthest from the center and
+    # assign them to the 2 empty clusters, i.e. the points at 10 and at 9.5.
+    # The first center will be updated to contain the other 8 points.
+    assert_array_equal(weight_in_clusters, [8, 1, 1])
+    assert_allclose(centers_new, [[-36], [10], [9.5]])
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 @pytest.mark.parametrize("squared", [True, False])
 def test_euclidean_distance(dtype, squared):
+    # Check that the _euclidean_(dense/sparse)_dense helpers produce correct
+    # results
     rng = np.random.RandomState(0)
     a_sparse = sp.random(1, 100, density=0.5, format="csr", random_state=rng,
                          dtype=dtype)
@@ -1150,6 +592,7 @@ def test_euclidean_distance(dtype, squared):
 
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_inertia(dtype):
+    # Check that the _inertia_(dense/sparse) helpers produce correct results.
     rng = np.random.RandomState(0)
     X_sparse = sp.random(100, 10, density=0.5, format="csr", random_state=rng,
                          dtype=dtype)
@@ -1167,3 +610,186 @@ def test_inertia(dtype):
     assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6)
     assert_allclose(inertia_dense, expected, rtol=1e-6)
     assert_allclose(inertia_sparse, expected, rtol=1e-6)
+
+
+@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans])
+def test_verbose(estimator):
+    # Check verbose mode of KMeans and MiniBatchKMeans for better coverage.
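+    # The fit is run with stdout redirected to a buffer so the verbose output
+    # is exercised without polluting the test logs.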
+ km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + km.fit(X) + finally: + sys.stdout = old_stdout + + +def test_k_means_function(): + # test calling the k_means function directly + cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, + sample_weight=None) + + assert cluster_centers.shape == (n_clusters, n_features) + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + assert inertia > 0.0 + + +def test_minibatch_kmeans_init_size(): + # Check the internal _init_size attribute of MiniBatchKMeans + + # default init size should be 3 * batch_size + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1).fit(X) + assert km._init_size == 15 + + # if 3 * batch size < n_clusters, it should then be 3 * n_clusters + km = MiniBatchKMeans(n_clusters=10, batch_size=1, n_init=1).fit(X) + assert km._init_size == 30 + + # it should not be larger than n_samples + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1, + init_size=n_samples + 1).fit(X) + assert km._init_size == n_samples + + +def test_minibatch_kmeans_partial_fit(): + # Check fitting using the partial_fit API + km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42) + + for X_minibatch in np.array_split(X, 10): + km.partial_fit(X_minibatch) + + # compute the labeling on the complete dataset + labels = km.predict(X) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + + +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_wrong_params(estimator): + # Check that error are raised with clear error message when wrong values + # are passed for the parameters + with pytest.raises(ValueError, match="n_init should be > 0"): + estimator(n_init=0).fit(X) + + with pytest.raises(ValueError, match="max_iter should be > 0"): + estimator(max_iter=0).fit(X) + + with pytest.raises(ValueError, + match=r"n_samples.* should be >= n_clusters"): + estimator(n_clusters=n_samples + 1).fit(X) + + with pytest.raises(ValueError, match="tol should be >= 0"): + estimator(tol=-1).fit(X) + + match = (r"The shape of the initial centers .* does not match " + r"the number of clusters") + with pytest.raises(ValueError, match=match): + estimator(init=X[:2]).fit(X) + with pytest.raises(ValueError, match=match): + estimator(init=lambda X_, k, random_state: X_[:2]).fit(X) + + match = (r"The shape of the initial centers .* does not match " + r"the number of features of the data") + with pytest.raises(ValueError, match=match): + estimator(init=X[:8, :2]).fit(X) + with pytest.raises(ValueError, match=match): + estimator(init=lambda X_, k, random_state: X_[:8, :2]).fit(X) + + with pytest.raises(ValueError, + match=r"init should be either 'k-means\+\+', 'random', " + r"a ndarray or a callable"): + estimator(init="wrong").fit(X) + + +def test_kmeans_wrong_params(): + # Check that error are raised with clear error message when wrong values + # are passed for the parameters specific to KMeans + with pytest.raises(ValueError, + match="Algorithm must be 'auto', 'full' or 'elkan'"): + KMeans(algorithm="wrong").fit(X) + + +def test_minibatch_kmeans_wrong_params(): + # Check that error are raised with clear error message when wrong values + # are passed for the parameters specific to MiniBatchKMeans + with pytest.raises(ValueError, match="max_no_improvement should be >= 0"): + 
MiniBatchKMeans(max_no_improvement=-1).fit(X) + + with pytest.raises(ValueError, match="batch_size should be > 0"): + MiniBatchKMeans(batch_size=-1).fit(X) + + with pytest.raises(ValueError, match="init_size should be > 0"): + MiniBatchKMeans(init_size=-1).fit(X) + + with pytest.raises(ValueError, match="reassignment_ratio should be >= 0"): + MiniBatchKMeans(reassignment_ratio=-1).fit(X) + + +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_warnings(estimator): + # Check warning messages common to KMeans and MiniBatchKMeans + with pytest.warns(RuntimeWarning, + match="Explicit initial center position passed: " + "performing only one init"): + estimator(init=centers, n_clusters=n_clusters).fit(X) + + +def test_kmeans_warnings(): + # Check warning messages specific to KMeans + with pytest.warns(RuntimeWarning, + match="algorithm='elkan' doesn't make sense for a single" + " cluster"): + KMeans(n_clusters=1, algorithm="elkan").fit(X) + + +def test_kmeans_warns_less_centers_than_unique_points(): + # Check KMeans when the number of found clusters is smaller than expected + X = np.asarray([[0, 0], + [0, 1], + [1, 0], + [1, 0]]) # last point is duplicated + km = KMeans(n_clusters=4) + + # KMeans should warn that fewer labels than cluster centers have been used + msg = (r"Number of distinct clusters \(3\) found smaller than " + r"n_clusters \(4\). Possibly due to duplicate points in X.") + with pytest.warns(ConvergenceWarning, match=msg): + km.fit(X) + # only three distinct points, so only three clusters + # can have points assigned to them + assert set(km.labels_) == set(range(3)) + + +def test_minibatch_kmeans_warnings(): + # Check warning messages specific to MiniBatchKMeans + with pytest.warns(RuntimeWarning, + match=r"init_size.* should be larger than n_clusters"): + MiniBatchKMeans(init_size=10, n_clusters=20).fit(X) + + +@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) +def test_precompute_distance_deprecated(precompute_distances): + # FIXME: remove in 0.25 + depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " + "will be removed in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + precompute_distances=precompute_distances) + + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) + + +@pytest.mark.parametrize("n_jobs", [None, 1]) +def test_n_jobs_deprecated(n_jobs): + # FIXME: remove in 0.25 + depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " + "in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + n_jobs=n_jobs) + + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) diff --git a/sklearn/cluster/tests/test_k_means2.py b/sklearn/cluster/tests/test_k_means2.py new file mode 100644 index 0000000000000..4d14c41e42e0d --- /dev/null +++ b/sklearn/cluster/tests/test_k_means2.py @@ -0,0 +1,190 @@ +"""Testing for K-means""" +import sys + +import numpy as np +from scipy import sparse as sp + +from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_almost_equal + +from sklearn.utils.extmath import row_norms +from sklearn.cluster import MiniBatchKMeans +from sklearn.cluster._kmeans import _labels_inertia +from sklearn.cluster._kmeans import _mini_batch_step +from 
sklearn.datasets import make_blobs +from io import StringIO + + +# non centered, sparse centers to check the +centers = np.array([ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], +]) +n_samples = 100 +n_clusters, n_features = centers.shape +X, true_labels = make_blobs(n_samples=n_samples, centers=centers, + cluster_std=1., random_state=42) +X_csr = sp.csr_matrix(X) + + +def test_minibatch_update_consistency(): + # Check that dense and sparse minibatch update give the same results + rng = np.random.RandomState(42) + old_centers = centers + rng.normal(size=centers.shape) + + new_centers = old_centers.copy() + new_centers_csr = old_centers.copy() + + weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) + weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double) + + x_squared_norms = (X ** 2).sum(axis=1) + x_squared_norms_csr = row_norms(X_csr, squared=True) + + buffer = np.zeros(centers.shape[1], dtype=np.double) + buffer_csr = np.zeros(centers.shape[1], dtype=np.double) + + # extract a small minibatch + X_mb = X[:10] + X_mb_csr = X_csr[:10] + x_mb_squared_norms = x_squared_norms[:10] + x_mb_squared_norms_csr = x_squared_norms_csr[:10] + + sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) + + # step 1: compute the dense minibatch update + old_inertia, incremental_diff = _mini_batch_step( + X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, + buffer, 1, None, random_reassign=False) + assert old_inertia > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels, new_inertia = _labels_inertia( + X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) + assert new_inertia > 0.0 + assert new_inertia < old_inertia + + # check that the incremental difference computation is matching the + # final observed value + effective_diff = np.sum((new_centers - old_centers) ** 2) + assert_almost_equal(incremental_diff, effective_diff) + + # step 2: compute the sparse minibatch update + old_inertia_csr, incremental_diff_csr = _mini_batch_step( + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, + weight_sums_csr, buffer_csr, 1, None, random_reassign=False) + assert old_inertia_csr > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels_csr, new_inertia_csr = _labels_inertia( + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) + assert new_inertia_csr > 0.0 + assert new_inertia_csr < old_inertia_csr + + # check that the incremental difference computation is matching the + # final observed value + effective_diff = np.sum((new_centers_csr - old_centers) ** 2) + assert_almost_equal(incremental_diff_csr, effective_diff) + + # step 3: check that sparse and dense updates lead to the same results + assert_array_equal(labels, labels_csr) + assert_array_almost_equal(new_centers, new_centers_csr) + assert_almost_equal(incremental_diff, incremental_diff_csr) + assert_almost_equal(old_inertia, old_inertia_csr) + assert_almost_equal(new_inertia, new_inertia_csr) + + +def test_minibatch_sensible_reassign_fit(): + # check if identical initial clusters are reassigned + # also a regression test for when there are more desired reassignments than + # samples. 
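+    # Zeroing out every other sample creates many identical points, so several
+    # randomly chosen initial centers are likely to coincide and must be
+    # reassigned.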
+ zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, + cluster_std=1., random_state=42) + zeroed_X[::2, :] = 0 + mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, + init="random") + mb_k_means.fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 + + # do the same with batch-size > X.shape[0] (regression test) + mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201, + random_state=42, init="random") + mb_k_means.fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 + + +def test_minibatch_sensible_reassign_partial_fit(): + zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5, + cluster_std=1., random_state=42) + zeroed_X[::2, :] = 0 + mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") + for i in range(100): + mb_k_means.partial_fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 + + +def test_minibatch_reassign(): + # Give a perfect initialization, but a large reassignment_ratio, + # as a result all the centers should be reassigned and the model + # should no longer be good + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + for this_X in (X, X_csr): + mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, + random_state=42) + mb_k_means.fit(this_X) + + score_before = mb_k_means.score(this_X) + try: + old_stdout = sys.stdout + sys.stdout = StringIO() + # Turn on verbosity to smoke test the display code + _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), + mb_k_means.cluster_centers_, + mb_k_means.counts_, + np.zeros(X.shape[1], np.double), + False, distances=np.zeros(X.shape[0]), + random_reassign=True, random_state=42, + reassignment_ratio=1, verbose=True) + finally: + sys.stdout = old_stdout + assert score_before > mb_k_means.score(this_X) + + # Give a perfect initialization, with a small reassignment_ratio, + # no center should be reassigned + for this_X in (X, X_csr): + mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, + init=centers.copy(), + random_state=42, n_init=1) + mb_k_means.fit(this_X) + clusters_before = mb_k_means.cluster_centers_ + # Turn on verbosity to smoke test the display code + _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), + mb_k_means.cluster_centers_, + mb_k_means.counts_, + np.zeros(X.shape[1], np.double), + False, distances=np.zeros(X.shape[0]), + random_reassign=True, random_state=42, + reassignment_ratio=1e-15) + assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) + + +def test_minibatch_with_many_reassignments(): + # Test for the case that the number of clusters to reassign is bigger + # than the batch_size + n_samples = 550 + rnd = np.random.RandomState(42) + X = rnd.uniform(size=(n_samples, 10)) + # Check that the fit works if n_clusters is bigger than the batch_size. 
+ # Run the test with 550 clusters and 550 samples, because it turned out + # that this values ensure that the number of clusters to reassign + # is always bigger than the batch_size + n_clusters = 550 + MiniBatchKMeans(n_clusters=n_clusters, + batch_size=100, + init_size=n_samples, + random_state=42).fit(X) From ca728d5f66daa956f1ea1b4bafb56ebe30c79496 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 28 Feb 2020 10:56:25 +0100 Subject: [PATCH 03/72] wip --- sklearn/cluster/_kmeans.py | 254 ++++++++++----------- sklearn/cluster/tests/test_k_means.py | 291 ++++++++++++------------- sklearn/cluster/tests/test_k_means2.py | 17 +- 3 files changed, 273 insertions(+), 289 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index ad9e7eab1ea2c..f9bc7b8875223 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -968,15 +968,12 @@ def fit(self, X, y=None, sample_weight=None): else: kmeans_single = _kmeans_single_elkan - # seeds for the initializations of the kmeans runs. - seeds = random_state.randint(np.iinfo(np.int32).max, size=self._n_init) - - best_labels, best_inertia, best_centers = None, None, None + best_inertia = None # limit number of threads in second level of nested parallelism # (i.e. BLAS) to avoid oversubsciption. with threadpool_limits(limits=1, user_api="blas"): - for seed in seeds: + for i in range(self._n_init): # Initialize centers centers_init = self._init_centroids( X, x_squared_norms=x_squared_norms, init=init, @@ -988,12 +985,12 @@ def fit(self, X, y=None, sample_weight=None): labels, inertia, centers, n_iter_ = kmeans_single( X, sample_weight, centers_init, max_iter=self.max_iter, verbose=self.verbose, tol=self._tol, - x_squared_norms=x_squared_norms, random_state=seed, + x_squared_norms=x_squared_norms, random_state=random_state, n_threads=self._n_threads) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() + best_labels = labels + best_centers = centers best_inertia = inertia best_n_iter = n_iter_ @@ -1152,9 +1149,8 @@ def score(self, X, y=None, sample_weight=None): def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, - old_center_buffer, compute_squared_diff, - distances, random_reassign=False, - random_state=None, reassignment_ratio=.01, + old_center_buffer, compute_squared_diff, random_state, + random_reassign=False, reassignment_ratio=.01, verbose=False): """Incremental update of the centers for the Minibatch K-Means algorithm. @@ -1177,15 +1173,8 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, The vector in which we keep track of the numbers of elements in a cluster. This array is MODIFIED IN PLACE - distances : array, dtype float, shape (n_samples), optional - If not None, should be a pre-allocated array that will be used to store - the distances of each sample to its closest center. - May not be None when random_reassign is True. - - random_state : int, RandomState instance, default=None - Determines random number generation for centroid initialization and to - pick new clusters amongst observations with uniform probability. Use - an int to make the randomness deterministic. + random_state : RandomState instance + Determines random number generation for low count centers reassignment. See :term:`Glossary `. 
random_reassign : boolean, optional @@ -1218,11 +1207,10 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, """ # Perform label assignment to nearest centers - nearest_center, inertia = _labels_inertia(X, sample_weight, - x_squared_norms, centers) + labels, inertia = _labels_inertia(X, sample_weight, + x_squared_norms, centers) if random_reassign and reassignment_ratio > 0: - random_state = check_random_state(random_state) # Reassign clusters that have very low weight to_reassign = weight_sums < reassignment_ratio * weight_sums.max() # pick at most .5 * batch_size samples as new centers @@ -1256,14 +1244,14 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, if sp.issparse(X): return inertia, _mini_batch_update_csr( X, sample_weight, x_squared_norms, centers, weight_sums, - nearest_center, old_center_buffer, compute_squared_diff) + labels, old_center_buffer, compute_squared_diff) # dense variant in mostly numpy (not as memory efficient though) k = centers.shape[0] squared_diff = 0.0 for center_idx in range(k): # find points from minibatch that are assigned to this center - center_mask = nearest_center == center_idx + center_mask = labels == center_idx wsum = sample_weight[center_mask].sum() if wsum > 0: @@ -1313,26 +1301,24 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol, ewa_diff = centers_squared_diff ewa_inertia = batch_inertia else: - alpha = float(model.batch_size) * 2.0 / (n_samples + 1) - alpha = 1.0 if alpha > 1.0 else alpha + alpha = model.batch_size * 2.0 / (n_samples + 1) + alpha = min(alpha, 1.0) ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha # Log progress to be able to monitor convergence if verbose: - progress_msg = ( - 'Minibatch iteration %d/%d:' - ' mean batch inertia: %f, ewa inertia: %f ' % ( - iteration_idx + 1, n_iter, batch_inertia, - ewa_inertia)) + progress_msg = (f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " + f"mean batch inertia: {batch_inertia}, ewa inertia: " + f"{ewa_inertia}") print(progress_msg) # Early stopping based on absolute tolerance on squared change of # centers position (using EWA smoothing) if tol > 0.0 and ewa_diff <= tol: if verbose: - print('Converged (small centers change) at iteration %d/%d' - % (iteration_idx + 1, n_iter)) + print(f"Converged (small centers change) at iteration " + f"{iteration_idx + 1}/{n_iter}") return True # Early stopping heuristic due to lack of improvement on smoothed inertia @@ -1347,9 +1333,8 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol, if (model.max_no_improvement is not None and no_improvement >= model.max_no_improvement): if verbose: - print('Converged (lack of improvement in inertia)' - ' at iteration %d/%d' - % (iteration_idx + 1, n_iter)) + print(f"Converged (lack of improvement in inertia) at iteration " + f"{iteration_idx}/{n_iter}") return True # update the convergence context to maintain state across successive calls: @@ -1432,7 +1417,8 @@ class MiniBatchKMeans(KMeans): only algorithm is initialized by running a batch KMeans on a random subset of the data. This needs to be larger than n_clusters. - If `None`, `init_size= 3 * batch_size`. + If `None`, the heuristic is `init_size = 3 * batch_size` if + `3 * batch_size < n_clusters`, else `init_size = 3 * n_clusters`. n_init : int, default=3 Number of random initializations that are tried. 
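The convergence test applied after each minibatch (reworked in the hunk above) smooths both the squared center change and the batch inertia with an exponentially weighted average using alpha = min(2 * batch_size / (n_samples + 1), 1), and stops either when the smoothed center change drops below `tol` or when the smoothed inertia has not improved for `max_no_improvement` consecutive batches. Below is a simplified, self-contained sketch of that logic; it replays recorded per-batch statistics instead of keeping state in a context dict across calls, so it is not the actual `_mini_batch_convergence` helper.

def ewa_converged(batch_size, n_samples, tol, max_no_improvement,
                  diffs, inertias):
    """Replay the minibatch early-stopping heuristic on per-batch stats."""
    alpha = min(batch_size * 2.0 / (n_samples + 1), 1.0)
    ewa_diff = ewa_inertia = None
    ewa_inertia_min = None
    no_improvement = 0
    for diff, inertia in zip(diffs, inertias):
        if ewa_diff is None:
            # the first batch seeds the moving averages
            ewa_diff, ewa_inertia = diff, inertia
        else:
            ewa_diff = ewa_diff * (1 - alpha) + diff * alpha
            ewa_inertia = ewa_inertia * (1 - alpha) + inertia * alpha
        if tol > 0.0 and ewa_diff <= tol:
            return True  # centers barely move anymore
        if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min:
            ewa_inertia_min = ewa_inertia
            no_improvement = 0
        else:
            no_improvement += 1
        if (max_no_improvement is not None
                and no_improvement >= max_no_improvement):
            return True  # smoothed inertia stopped improving
    return False

The smoothing is what makes the stopping decision robust to the high variance of single-batch inertia estimates.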
@@ -1458,8 +1444,8 @@ class MiniBatchKMeans(KMeans): inertia_ : float The value of the inertia criterion associated with the chosen partition (if compute_labels is set to True). The inertia is - defined as the sum of square distances of samples to their nearest - neighbor. + defined as the sum of square distances of samples to their cluster + center. See Also -------- @@ -1551,6 +1537,44 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") + def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, + centers): + """Compute labels and inertia using mini batches. + + This is slightly slower than doing everything at once but preventes + memory errors / segfaults. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + x_squared_norms : ndarray of shape (n_samples,) + Precomputed squared euclidean norm of each data point, to speed up + computations. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels for each point. + + inertia : float + Sum of squared distances of points to nearest cluster. + """ + if self.verbose: + print('Computing label assignment and total inertia') + slices = gen_batches(X.shape[0], self.batch_size) + results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], + centers) for s in slices] + labels, inertia = zip(*results) + return np.hstack(labels), np.sum(inertia) + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. @@ -1601,10 +1625,6 @@ def fit(self, X, y=None, sample_weight=None): # disabled old_center_buffer = np.zeros(0, dtype=X.dtype) - distances = np.zeros(self.batch_size, dtype=X.dtype) - n_batches = int(np.ceil(float(n_samples) / self.batch_size)) - n_iter = int(self.max_iter * n_batches) - validation_indices = random_state.randint(0, n_samples, self._init_size) X_valid = X[validation_indices] @@ -1617,25 +1637,14 @@ def fit(self, X, y=None, sample_weight=None): if self.verbose: print(f"Init {init_idx + 1}/{self._n_init} with method {init}") - weight_sums = np.zeros(self.n_clusters, dtype=sample_weight.dtype) - - # TODO: once the `k_means` function works with sparse input we - # should refactor the following init to use it instead. - # Initialize the centers using only a fraction of the data as we - # expect n_samples to be very large when using MiniBatchKMeans + # expect n_samples to be very large when using MiniBatchKMeans. cluster_centers = self._init_centroids( X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # Compute the label assignment on the init dataset - _mini_batch_step( - X_valid, sample_weight_valid, x_squared_norms_valid, - cluster_centers, weight_sums, old_center_buffer, False, - distances=None, verbose=self.verbose) - - # Keep only the best cluster centers across independent inits on - # the common validation set + # Keep the best cluster centers across independent inits based on + # inertia computed on a common validation set. 
_, inertia = _labels_inertia(X_valid, sample_weight_valid, x_squared_norms_valid, cluster_centers) @@ -1644,33 +1653,40 @@ def fit(self, X, y=None, sample_weight=None): f"{inertia}") if best_inertia is None or inertia < best_inertia: self.cluster_centers_ = cluster_centers - self.counts_ = weight_sums best_inertia = inertia - # Empty context to be used inplace by the convergence check routine + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Empty conext to be used inplace by the convergence check routine convergence_context = {} - # Perform the iterative optimization until the final convergence - # criterion + n_batches = int(np.ceil(float(n_samples) / self.batch_size)) + n_iter = int(self.max_iter * n_batches) + + # Perform the iterative optimization until convergence for iteration_idx in range(n_iter): # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint( - 0, n_samples, self.batch_size) + minibatch_indices = random_state.randint(0, n_samples, + self.batch_size) # Perform the actual update step on the minibatch data batch_inertia, centers_squared_diff = _mini_batch_step( - X[minibatch_indices], sample_weight[minibatch_indices], - x_squared_norms[minibatch_indices], - self.cluster_centers_, self.counts_, - old_center_buffer, self._tol > 0.0, distances=distances, + X=X[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + x_squared_norms=x_squared_norms[minibatch_indices], + centers=self.cluster_centers_, + weight_sums=self._counts, + old_center_buffer=old_center_buffer, + compute_squared_diff=self._tol > 0.0, + random_state=random_state, # Here we randomly choose whether to perform # random reassignment: the choice is done as a function # of the iteration index, and the minimum number of # counts, in order to force this reassignment to happen # every once in a while random_reassign=((iteration_idx + 1) - % (10 + int(self.counts_.min())) == 0), - random_state=random_state, + % (10 + int(self._counts.min())) == 0), reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) @@ -1684,56 +1700,17 @@ def fit(self, X, y=None, sample_weight=None): self.n_iter_ = iteration_idx + 1 if self.compute_labels: - self.labels_, self.inertia_ = \ - self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_) + self.labels_, self.inertia_ = self._labels_inertia_minibatch( + X, sample_weight, x_squared_norms, self.cluster_centers_) return self - def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, - centers): - """Compute labels and inertia using mini batches. - - This is slightly slower than doing everything at once but preventes - memory errors / segfaults. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data. - - sample_weight : ndarray of shape (n_samples,) - The weights for each observation in X. - - x_squared_norms : ndarray of shape (n_samples,) - Precomputed squared euclidean norm of each data point, to speed up - computations. - - centers : ndarray of shape (n_clusters, n_features) - The cluster centers. - - Returns - ------- - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - inertia : float - Sum of squared distances of points to nearest cluster. 
- """ - if self.verbose: - print('Computing label assignment and total inertia') - slices = gen_batches(X.shape[0], self.batch_size) - results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - centers) for s in slices] - labels, inertia = zip(*results) - return np.hstack(labels), np.sum(inertia) - def partial_fit(self, X, y=None, sample_weight=None): """Update k means estimate on a single mini-batch X. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Coordinates of the data points to cluster. It must be noted that X will be copied if it is not C-contiguous. @@ -1748,42 +1725,47 @@ def partial_fit(self, X, y=None, sample_weight=None): ------- self """ - - X = check_array(X, accept_sparse="csr", order="C", - dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], + order='C') n_samples, n_features = X.shape - if hasattr(self.init, '__array__'): - self.init = np.ascontiguousarray(self.init, dtype=X.dtype) if n_samples == 0: return self sample_weight = _check_normalize_sample_weight(sample_weight, X) - x_squared_norms = row_norms(X, squared=True) self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) - if (not hasattr(self, 'counts_') - or not hasattr(self, 'cluster_centers_')): - # this is the first call partial_fit on this object: + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if not hasattr(self, 'cluster_centers_'): + # this is the first call partial_fit on this object + + # TODO: check batch size and co may be wrong here + self._check_params(X) + + # Validate init array + init = self.init + if hasattr(init, '__array__'): + init = check_array(init, dtype=X.dtype, copy=True, order='C') + # initialize the cluster centers self.cluster_centers_ = self._init_centroids( - X, x_squared_norms=x_squared_norms, init=self.init, - random_state=self._random_state, init_size=self.init_size) - # TODO: should be self._init_size - # Should check params before + X, x_squared_norms=x_squared_norms, init=init, + random_state=self._random_state, init_size=self._init_size) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) - self.counts_ = np.zeros(self.n_clusters, - dtype=sample_weight.dtype) random_reassign = False - distances = None else: # The lower the minimum count is, the more we do random # reassignment, however, we don't want to do random # reassignment too often, to allow for building up counts random_reassign = self._random_state.randint( - 10 * (1 + self.counts_.min())) == 0 - distances = np.zeros(X.shape[0], dtype=X.dtype) + 10 * (1 + self._counts.min())) == 0 # Raise error if partial_fit called on data with different number # of features. 
@@ -1792,11 +1774,15 @@ def partial_fit(self, X, y=None, sample_weight=None): f"Number of features {X.shape[1]} does not match previous " f"data {self.cluster_centers_.shape[1]}.") - _mini_batch_step(X, sample_weight, x_squared_norms, - self.cluster_centers_, self.counts_, - np.zeros(0, dtype=X.dtype), 0, - random_reassign=random_reassign, distances=distances, + _mini_batch_step(X, + sample_weight=sample_weight, + x_squared_norms=x_squared_norms, + centers=self.cluster_centers_, + weight_sums=self._counts, + old_center_buffer=np.zeros(0, dtype=X.dtype), + compute_squared_diff=False, random_state=self._random_state, + random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index fd48c7b73842c..f98418f037949 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -20,7 +20,6 @@ from sklearn.metrics.cluster import v_measure_score from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans -from sklearn.cluster._kmeans import _labels_inertia from sklearn.cluster._kmeans import _mini_batch_step from sklearn.cluster._kmeans import _check_normalize_sample_weight from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense @@ -60,104 +59,6 @@ def _check_fitted_model(km): assert km.inertia_ > 0.0 -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_kmeans_results(array_constr, algo, dtype): - # Checks that KMeans works as intended on toy dataset by comparing with - # expected results computed by hand. - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) - sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] - init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.1875 - expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) - expected_n_iter = 2 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X, sample_weight=sample_weight) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_allclose(kmeans.inertia_, expected_inertia) - assert_allclose(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -def test_k_means_1_iteration(array_constr, algo): - # check the results after a single iteration (E-step M-step E-step) by - # comparing against a pure python implementation. 
- X = np.random.RandomState(0).uniform(size=(100, 5)) - init_centers = X[:5] - X = array_constr(X) - - def py_kmeans(X, init): - new_centers = init.copy() - labels = pairwise_distances_argmin(X, init) - for label in range(init.shape[0]): - new_centers[label] = X[labels == label].mean(axis=0) - labels = pairwise_distances_argmin(X, new_centers) - return labels, new_centers - - py_labels, py_centers = py_kmeans(X, init_centers) - - cy_kmeans = KMeans(n_clusters=5, n_init=1, init=init_centers, - algorithm=algo, max_iter=1).fit(X) - cy_labels = cy_kmeans.labels_ - cy_centers = cy_kmeans.cluster_centers_ - - assert_array_equal(py_labels, cy_labels) - assert_allclose(py_centers, cy_centers) - - -@pytest.mark.parametrize("distribution", ["normal", "blobs"]) -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) -def test_elkan_results(distribution, array_constr, tol): - # Check that results are identical between lloyd and elkan algorithms - rnd = np.random.RandomState(0) - if distribution == "normal": - X = rnd.normal(size=(5000, 10)) - else: - X, _ = make_blobs(random_state=rnd) - X[X < 0] = 0 - X = array_constr(X) - - km_full = KMeans(algorithm="full", n_clusters=5, - random_state=0, n_init=1, tol=tol) - km_elkan = KMeans(algorithm="elkan", n_clusters=5, - random_state=0, n_init=1, tol=tol) - - km_full.fit(X) - km_elkan.fit(X) - assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) - assert_array_equal(km_elkan.labels_, km_full.labels_) - assert km_elkan.n_iter_ == km_full.n_iter_ - assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) - - -@pytest.mark.parametrize("algorithm", ["full", "elkan"]) -def test_kmeans_convergence(algorithm): - # Check that KMeans stops when convergence is reached when tol=0. (#16075) - # We can only ensure that if the number of threads is not to large, - # otherwise the roundings errors coming from the unpredictability of - # the order in which chunks are processed make the convergence criterion - # to never be exactly 0. - rnd = np.random.RandomState(0) - X = rnd.normal(size=(5000, 10)) - - with threadpool_limits(limits=1, user_api="openmp"): - km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, - n_init=1, tol=0, max_iter=300).fit(X) - - assert km.n_iter_ < 300 - - @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -171,7 +72,7 @@ def test_all_init(estimator, data, init): @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_result_of_kmeans_equal_in_diff_n_threads(estimator): +def test_result_equal_in_diff_n_threads(estimator): # Check that KMeans gives the same results in parallel mode than in # sequential mode. rnd = np.random.RandomState(0) @@ -269,17 +170,6 @@ def test_fortran_aligned_data(estimator): assert_array_equal(km_c.labels_, km_f.labels_) -def test_k_means_copyx(): - # Check that copy_x=False returns nearly equal X after de-centering. 
- my_X = X.copy() - km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) - km.fit(my_X) - _check_fitted_model(km) - - # check that my_X is de-centered - assert_allclose(my_X, X) - - @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_centers_not_mutated(estimator, dtype): @@ -471,8 +361,129 @@ def test_fit_transform(estimator): assert_allclose(X1, X2) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_verbose(estimator): + # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. + km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + km.fit(X) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_kmeans_results(array_constr, algo, dtype): + # Checks that KMeans works as intended on toy dataset by comparing with + # expected results computed by hand. + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) + sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] + init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.1875 + expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) + expected_n_iter = 2 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X, sample_weight=sample_weight) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_k_means_1_iteration(array_constr, algo): + # check the results after a single iteration (E-step M-step E-step) by + # comparing against a pure python implementation. 
+ X = np.random.RandomState(0).uniform(size=(100, 5)) + init_centers = X[:5] + X = array_constr(X) + + def py_kmeans(X, init): + new_centers = init.copy() + labels = pairwise_distances_argmin(X, init) + for label in range(init.shape[0]): + new_centers[label] = X[labels == label].mean(axis=0) + labels = pairwise_distances_argmin(X, new_centers) + return labels, new_centers + + py_labels, py_centers = py_kmeans(X, init_centers) + + cy_kmeans = KMeans(n_clusters=5, n_init=1, init=init_centers, + algorithm=algo, max_iter=1).fit(X) + cy_labels = cy_kmeans.labels_ + cy_centers = cy_kmeans.cluster_centers_ + + assert_array_equal(py_labels, cy_labels) + assert_allclose(py_centers, cy_centers) + + +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) +def test_kmeans_elkan_results(distribution, array_constr, tol): + # Check that results are identical between lloyd and elkan algorithms + rnd = np.random.RandomState(0) + if distribution == "normal": + X = rnd.normal(size=(5000, 10)) + else: + X, _ = make_blobs(random_state=rnd) + X[X < 0] = 0 + X = array_constr(X) + + km_full = KMeans(algorithm="full", n_clusters=5, + random_state=0, n_init=1, tol=tol) + km_elkan = KMeans(algorithm="elkan", n_clusters=5, + random_state=0, n_init=1, tol=tol) + + km_full.fit(X) + km_elkan.fit(X) + assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) + assert_array_equal(km_elkan.labels_, km_full.labels_) + assert km_elkan.n_iter_ == km_full.n_iter_ + assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) + + +@pytest.mark.parametrize("algorithm", ["full", "elkan"]) +def test_kmeans_convergence(algorithm): + # Check that KMeans stops when convergence is reached when tol=0. (#16075) + # We can only ensure that if the number of threads is not to large, + # otherwise the roundings errors coming from the unpredictability of + # the order in which chunks are processed make the convergence criterion + # to never be exactly 0. + rnd = np.random.RandomState(0) + X = rnd.normal(size=(5000, 10)) + + with threadpool_limits(limits=1, user_api="openmp"): + km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, + n_init=1, tol=0, max_iter=300).fit(X) + + assert km.n_iter_ < 300 + + +def test_kmeans_copyx(): + # Check that copy_x=False returns nearly equal X after de-centering. 
+ my_X = X.copy() + km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) + km.fit(my_X) + _check_fitted_model(km) + + # check that my_X is de-centered + assert_allclose(my_X, X) + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -def test_k_means_init_fitted_centers(data): +def test_kmeans_init_fitted_centers(data): # Check that starting fitting from a local optimum shouldn't change the # solution km1 = KMeans(n_clusters=n_clusters).fit(data) @@ -515,7 +526,7 @@ def test_kmeans_relocated_clusters(array_constr, algo): @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) -def test_k_means_empty_cluster_relocated(array_constr): +def test_kmeans_empty_cluster_relocated(array_constr): # check that empty clusters are correctly relocated when using sample # weights (#13486) X = array_constr([[-1], [1]]) @@ -612,18 +623,6 @@ def test_inertia(dtype): assert_allclose(inertia_sparse, expected, rtol=1e-6) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_verbose(estimator): - # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. - km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - km.fit(X) - finally: - sys.stdout = old_stdout - - def test_k_means_function(): # test calling the k_means function directly cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, @@ -670,17 +669,17 @@ def test_minibatch_kmeans_partial_fit(): def test_wrong_params(estimator): # Check that error are raised with clear error message when wrong values # are passed for the parameters - with pytest.raises(ValueError, match="n_init should be > 0"): + with pytest.raises(ValueError, match=r"n_init should be > 0"): estimator(n_init=0).fit(X) - with pytest.raises(ValueError, match="max_iter should be > 0"): + with pytest.raises(ValueError, match=r"max_iter should be > 0"): estimator(max_iter=0).fit(X) with pytest.raises(ValueError, match=r"n_samples.* should be >= n_clusters"): estimator(n_clusters=n_samples + 1).fit(X) - with pytest.raises(ValueError, match="tol should be >= 0"): + with pytest.raises(ValueError, match=r"tol should be >= 0"): estimator(tol=-1).fit(X) match = (r"The shape of the initial centers .* does not match " @@ -702,29 +701,27 @@ def test_wrong_params(estimator): r"a ndarray or a callable"): estimator(init="wrong").fit(X) + # specific to KMeans + if estimator is KMeans: + with pytest.raises(ValueError, match=r"Algorithm must be 'auto', " + r"'full' or 'elkan'"): + KMeans(algorithm="wrong").fit(X) -def test_kmeans_wrong_params(): - # Check that error are raised with clear error message when wrong values - # are passed for the parameters specific to KMeans - with pytest.raises(ValueError, - match="Algorithm must be 'auto', 'full' or 'elkan'"): - KMeans(algorithm="wrong").fit(X) - - -def test_minibatch_kmeans_wrong_params(): - # Check that error are raised with clear error message when wrong values - # are passed for the parameters specific to MiniBatchKMeans - with pytest.raises(ValueError, match="max_no_improvement should be >= 0"): - MiniBatchKMeans(max_no_improvement=-1).fit(X) + # specific to MiniBatchKMeans + if estimator is MiniBatchKMeans: + with pytest.raises(ValueError, match=r"max_no_improvement should be " + r">= 0"): + MiniBatchKMeans(max_no_improvement=-1).fit(X) - with pytest.raises(ValueError, match="batch_size should be > 0"): - MiniBatchKMeans(batch_size=-1).fit(X) + with 
pytest.raises(ValueError, match=r"batch_size should be > 0"): + MiniBatchKMeans(batch_size=-1).fit(X) - with pytest.raises(ValueError, match="init_size should be > 0"): - MiniBatchKMeans(init_size=-1).fit(X) + with pytest.raises(ValueError, match=r"init_size should be > 0"): + MiniBatchKMeans(init_size=-1).fit(X) - with pytest.raises(ValueError, match="reassignment_ratio should be >= 0"): - MiniBatchKMeans(reassignment_ratio=-1).fit(X) + with pytest.raises(ValueError, match=r"reassignment_ratio should be " + r">= 0"): + MiniBatchKMeans(reassignment_ratio=-1).fit(X) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) diff --git a/sklearn/cluster/tests/test_k_means2.py b/sklearn/cluster/tests/test_k_means2.py index 4d14c41e42e0d..7df2bf1b0efb3 100644 --- a/sklearn/cluster/tests/test_k_means2.py +++ b/sklearn/cluster/tests/test_k_means2.py @@ -57,7 +57,7 @@ def test_minibatch_update_consistency(): # step 1: compute the dense minibatch update old_inertia, incremental_diff = _mini_batch_step( X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, - buffer, 1, None, random_reassign=False) + buffer, 1, np.random.RandomState(0), random_reassign=False) assert old_inertia > 0.0 # compute the new inertia on the same batch to check that it decreased @@ -74,7 +74,8 @@ def test_minibatch_update_consistency(): # step 2: compute the sparse minibatch update old_inertia_csr, incremental_diff_csr = _mini_batch_step( X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, - weight_sums_csr, buffer_csr, 1, None, random_reassign=False) + weight_sums_csr, buffer_csr, 1, np.random.RandomState(0), + random_reassign=False) assert old_inertia_csr > 0.0 # compute the new inertia on the same batch to check that it decreased @@ -145,10 +146,10 @@ def test_minibatch_reassign(): # Turn on verbosity to smoke test the display code _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), mb_k_means.cluster_centers_, - mb_k_means.counts_, + mb_k_means._counts, np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, + False, random_state=np.random.RandomState(0), + random_reassign=True, reassignment_ratio=1, verbose=True) finally: sys.stdout = old_stdout @@ -165,10 +166,10 @@ def test_minibatch_reassign(): # Turn on verbosity to smoke test the display code _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), mb_k_means.cluster_centers_, - mb_k_means.counts_, + mb_k_means._counts, np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, + False, random_state=np.random.RandomState(0), + random_reassign=True, reassignment_ratio=1e-15) assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) From b799aebf93ae549d7b9128af0849f58e2c83b215 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 2 Mar 2020 11:39:58 +0100 Subject: [PATCH 04/72] wip --- sklearn/cluster/_kmeans.py | 32 ++++++++------------------- sklearn/cluster/tests/test_k_means.py | 16 +++----------- 2 files changed, 12 insertions(+), 36 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index f9bc7b8875223..6c50eec0a1ee4 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -140,21 +140,6 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, ############################################################################### # K-means batch estimation by EM (expectation maximization) -def 
_check_normalize_sample_weight(sample_weight, X): - """Set sample_weight if None, and check for correct dtype""" - - sample_weight_was_none = sample_weight is None - - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - if not sample_weight_was_none: - # normalize the weights to sum up to n_samples - # an array of 1 (i.e. samples_weight is None) is already normalized - n_samples = len(sample_weight) - scale = n_samples / sample_weight.sum() - sample_weight *= scale - return sample_weight - - def k_means(X, n_clusters, sample_weight=None, init='k-means++', precompute_distances='deprecated', n_init=10, max_iter=300, verbose=False, tol=1e-4, random_state=None, copy_x=True, @@ -691,7 +676,8 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): Labels of each point inertia_ : float - Sum of squared distances of samples to their closest cluster center. + Sum of squared distances of samples to their closest cluster center, + weighted by the sample weights if provided. n_iter_ : int Number of iterations run. @@ -940,7 +926,7 @@ def fit(self, X, y=None, sample_weight=None): X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', copy=self.copy_x, accept_large_sparse=False) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) random_state = check_random_state(self.random_state) @@ -1113,7 +1099,7 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) return _labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_, self._n_threads)[0] @@ -1142,7 +1128,7 @@ def score(self, X, y=None, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) return -_labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_, self._n_threads)[1] @@ -1445,7 +1431,7 @@ class MiniBatchKMeans(KMeans): The value of the inertia criterion associated with the chosen partition (if compute_labels is set to True). The inertia is defined as the sum of square distances of samples to their cluster - center. + center, weighted by the sample weights if provided. 
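With `_check_sample_weight` replacing the old normalizing helper, the weights are no longer rescaled to sum to `n_samples`, so the reported inertia is the sum of squared distances weighted by the raw sample weights. A quick hand check on the toy fixture used by `test_kmeans_results` later in this patch shows why its expected inertia changes from 0.1875 to 0.375: the weights [3, 1, 1, 3] sum to 8 for 4 samples, so the old rescaling halved them. This is a standalone verification snippet, not part of the patch.

import numpy as np

X = np.array([[0, 0], [0.5, 0], [0.5, 1], [1, 1]])
w = np.array([3, 1, 1, 3])
centers = np.array([[0.125, 0], [0.875, 1]])
labels = np.array([0, 0, 1, 1])

# squared distance of each sample to its assigned center
d2 = ((X - centers[labels]) ** 2).sum(axis=1)

# raw weights: new expected inertia
assert np.isclose((w * d2).sum(), 0.375)
# weights rescaled to sum to n_samples: old expected inertia
assert np.isclose((w * (len(X) / w.sum()) * d2).sum(), 0.1875)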
See Also -------- @@ -1601,7 +1587,7 @@ def fit(self, X, y=None, sample_weight=None): order='C') n_samples, n_features = X.shape - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) random_state = check_random_state(self.random_state) @@ -1732,7 +1718,7 @@ def partial_fit(self, X, y=None, sample_weight=None): if n_samples == 0: return self - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) @@ -1817,7 +1803,7 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) - sample_weight = _check_normalize_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) return self._labels_inertia_minibatch( X, sample_weight, x_squared_norms, self.cluster_centers_)[0] diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index f98418f037949..dcc16e904cff7 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -21,7 +21,6 @@ from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _mini_batch_step -from sklearn.cluster._kmeans import _check_normalize_sample_weight from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense from sklearn.cluster._k_means_fast import _relocate_empty_clusters_sparse from sklearn.cluster._k_means_fast import _euclidean_dense_dense_wrapper @@ -87,16 +86,6 @@ def test_result_equal_in_diff_n_threads(estimator): assert_array_equal(result_1, result_2) -def test_check_normalize_sample_weight(): - # Check the check sample weight helper. sample weights should sum to - # n_samples - sample_weight = None - checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) - assert _num_samples(X) == _num_samples(checked_sample_weight) - assert_almost_equal(checked_sample_weight.sum(), _num_samples(X)) - assert X.dtype == checked_sample_weight.dtype - - def _sort_centers(centers): return np.sort(centers, axis=0) @@ -124,6 +113,7 @@ def test_weighted_vs_repeated(estimator, init): # TODO: FIXME if estimator is not MiniBatchKMeans: + assert_allclose(km_weighted.inertia_, km_repeated.inertia_) assert_allclose(_sort_centers(km_weighted.cluster_centers_), _sort_centers(km_repeated.cluster_centers_)) @@ -381,11 +371,11 @@ def test_kmeans_results(array_constr, algo, dtype): # Checks that KMeans works as intended on toy dataset by comparing with # expected results computed by hand. 
X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) - sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] + sample_weight = [3, 1, 1, 3] init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) expected_labels = [0, 0, 1, 1] - expected_inertia = 0.1875 + expected_inertia = 0.375 expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) expected_n_iter = 2 From b5b46f407d7b2ddf82cce086688e726a8452ed6d Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 3 Mar 2020 22:55:52 +0100 Subject: [PATCH 05/72] wip --- sklearn/cluster/_k_means_fast.pyx | 96 +++++++++++++++++++++++++++++++ sklearn/cluster/_kmeans.py | 57 ++++++++++++++---- 2 files changed, 143 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/_k_means_fast.pyx b/sklearn/cluster/_k_means_fast.pyx index 8221b2b15e356..7e9024452e5fd 100644 --- a/sklearn/cluster/_k_means_fast.pyx +++ b/sklearn/cluster/_k_means_fast.pyx @@ -16,6 +16,7 @@ import numpy as np cimport numpy as np cimport cython from cython cimport floating +from cython.parallel cimport prange from libc.math cimport sqrt from ..utils.extmath import row_norms @@ -384,3 +385,98 @@ def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight, - centers[center_idx, feature_idx]) ** 2 return squared_diff + + +def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, + floating[::1] sample_weight, + floating[:, ::1] centers, + floating[::1] weight_sums, + int[::1] labels, + floating[::1] old_center, + bint compute_squared_diff): + cdef: + floating squared_diff = 0 + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j, label + floating weight_sum, tmp, lr + + # for i in prange(n_samples, nogil=True): + for i in range(n_samples): + label = labels[i] + + # update center weight + weight_sum = weight_sums[label] + sample_weight[i] + + # learning rate + if weight_sum > 0: + lr = 1 / weight_sum + + if compute_squared_diff: + for j in range(n_features): + old_center[j] = centers[label, j] + + for j in range(n_features): + centers[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] + + if compute_squared_diff: + for j in range(n_features): + tmp = centers[label, j] - old_center[j] + squared_diff += tmp * tmp + + weight_sums[label] = weight_sum + + return squared_diff + + +def _minibatch_update_dense(np.ndarray[floating, ndim=2, mode='c'] X, + floating[::1] sample_weight, + floating[:, ::1] centers, + floating[::1] weight_sums, + int[::1] labels, + floating[::1] old_center, + bint compute_squared_diff): + cdef: + floating squared_diff = 0 + int n_clusters = centers.shape[0] + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j, k + floating wsum, alpha, tmp + + with nogil: + for i in range(n_clusters): + wsum = 0 + for j in prange(n_samples): + if labels[j] == i: + wsum += sample_weight[j] + + if wsum > 0: + if compute_squared_diff: + for k in prange(n_features): + old_center[k] = centers[i, k] + + # inplace remove previous count scaling + for k in prange(n_features): + centers[i, k] = centers[i, k] * weight_sums[i] + + for j in range(n_samples): + if labels[j] == i: + for k in range(n_features): + centers[i, k] = centers[i, k] + X[j, k] * sample_weight[j] + + # update the count statistics for this center + weight_sums[i] = weight_sums[i] + wsum + + # inplace rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[i] + for k in prange(n_features): + centers[i, k] = centers[i, k] * alpha + + # update the squared diff if necessary + if 
compute_squared_diff: + for k in prange(n_features): + tmp = centers[i, k] - old_center[k] + squared_diff += tmp * tmp + + return squared_diff \ No newline at end of file diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 6c50eec0a1ee4..87141eed933c5 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -31,6 +31,7 @@ from ._k_means_fast import _inertia_dense from ._k_means_fast import _inertia_sparse from ._k_means_fast import _mini_batch_update_csr +from ._k_means_fast import _minibatch_update_dense from ._k_means_lloyd import _lloyd_iter_chunked_dense from ._k_means_lloyd import _lloyd_iter_chunked_sparse from ._k_means_elkan import _init_bounds_dense @@ -1210,8 +1211,8 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, new_centers = random_state.choice(X.shape[0], replace=False, size=n_reassigns) if verbose: - print("[MiniBatchKMeans] Reassigning %i cluster centers." - % n_reassigns) + print(f"[MiniBatchKMeans] Reassigning {n_reassigns} " + f"cluster centers.") if sp.issparse(X) and not sp.issparse(centers): assign_rows_csr( @@ -1232,10 +1233,17 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, X, sample_weight, x_squared_norms, centers, weight_sums, labels, old_center_buffer, compute_squared_diff) - # dense variant in mostly numpy (not as memory efficient though) - k = centers.shape[0] + # dense variant in mostly numpy (not as memory efficient though. + else: + return inertia, _minibatch_update_dense( + X, sample_weight, centers, weight_sums, labels, + old_center_buffer, compute_squared_diff) + + +def _minibatch_update_dense2(X, sample_weight, centers, weight_sums, labels, + old_center_buffer, compute_squared_diff): squared_diff = 0.0 - for center_idx in range(k): + for center_idx in range(centers.shape[0]): # find points from minibatch that are assigned to this center center_mask = labels == center_idx wsum = sample_weight[center_mask].sum() @@ -1264,8 +1272,34 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, if compute_squared_diff: diff = centers[center_idx].ravel() - old_center_buffer.ravel() squared_diff += np.dot(diff, diff) + + return squared_diff - return inertia, squared_diff + +def _minibatch_update_dense3(X, sample_weight, centers, weight_sums, labels, + old_center_buffer, compute_squared_diff): + squared_diff = 0.0 + for i in range(X.shape[0]): + label = labels[i] + + # update center weight + weight_sums[label] += sample_weight[i] + + # learning rate + if weight_sums[label] > 0: + lr = 1 / weight_sums[label] + + if compute_squared_diff: + old_center_buffer[:] = centers[label] + + centers[label] *= (1 - lr) + centers[label] += lr * X[i] + + if compute_squared_diff: + diff = centers[label].ravel() - old_center_buffer.ravel() + squared_diff += np.dot(diff, diff) + + return squared_diff def _mini_batch_convergence(model, iteration_idx, n_iter, tol, @@ -1433,6 +1467,9 @@ class MiniBatchKMeans(KMeans): defined as the sum of square distances of samples to their cluster center, weighted by the sample weights if provided. + n_iter_ : int + Number of iterations run. + See Also -------- KMeans @@ -1513,7 +1550,8 @@ def _check_params(self, X): elif self._init_size < self.n_clusters: warnings.warn( f"init_size={self._init_size} should be larger than " - f"n_clusters={self.n_clusters}. Setting it to 3*n_clusters", + f"n_clusters={self.n_clusters}. 
Setting it to " + f"min(3*n_clusters, n_samples)", RuntimeWarning, stacklevel=2) self._init_size = 3 * self.n_clusters self._init_size = min(self._init_size, X.shape[0]) @@ -1713,9 +1751,8 @@ def partial_fit(self, X, y=None, sample_weight=None): """ X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C') - n_samples, n_features = X.shape - if n_samples == 0: + if X.shape[0] == 0: return self sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) @@ -1729,7 +1766,7 @@ def partial_fit(self, X, y=None, sample_weight=None): if not hasattr(self, 'cluster_centers_'): # this is the first call partial_fit on this object - # TODO: check batch size and co may be wrong here + # TODO: should we disable checks of unused params ? self._check_params(X) # Validate init array From 6fb23335b4ad8a421e411e7300bc3713d5271933 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 5 Mar 2020 15:43:03 +0100 Subject: [PATCH 06/72] wip --- sklearn/cluster/_k_means_fast.pyx | 164 +++++++++++++++++------------ sklearn/cluster/_k_means_lloyd.pyx | 2 +- sklearn/cluster/_kmeans.py | 81 +++++++------- 3 files changed, 138 insertions(+), 109 deletions(-) diff --git a/sklearn/cluster/_k_means_fast.pyx b/sklearn/cluster/_k_means_fast.pyx index 7e9024452e5fd..f9ba2245d43c9 100644 --- a/sklearn/cluster/_k_means_fast.pyx +++ b/sklearn/cluster/_k_means_fast.pyx @@ -16,8 +16,9 @@ import numpy as np cimport numpy as np cimport cython from cython cimport floating -from cython.parallel cimport prange +from cython.parallel cimport parallel, prange from libc.math cimport sqrt +from libc.stdlib cimport malloc, free from ..utils.extmath import row_norms @@ -401,82 +402,109 @@ def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, int i, j, label floating weight_sum, tmp, lr - # for i in prange(n_samples, nogil=True): - for i in range(n_samples): - label = labels[i] - - # update center weight - weight_sum = weight_sums[label] + sample_weight[i] + with nogil: + # for i in prange(n_samples, nogil=True): + for i in range(n_samples): + label = labels[i] - # learning rate - if weight_sum > 0: - lr = 1 / weight_sum + # update center weight + weight_sum = weight_sums[label] + sample_weight[i] - if compute_squared_diff: - for j in range(n_features): - old_center[j] = centers[label, j] + # learning rate + if weight_sum > 0: + lr = 1 / weight_sum - for j in range(n_features): - centers[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] + if compute_squared_diff: + for j in range(n_features): + old_center[j] = centers[label, j] - if compute_squared_diff: for j in range(n_features): - tmp = centers[label, j] - old_center[j] - squared_diff += tmp * tmp - - weight_sums[label] = weight_sum - + centers[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] + + if compute_squared_diff: + for j in range(n_features): + tmp = centers[label, j] - old_center[j] + squared_diff += tmp * tmp + + weight_sums[label] = weight_sum + return squared_diff -def _minibatch_update_dense(np.ndarray[floating, ndim=2, mode='c'] X, - floating[::1] sample_weight, - floating[:, ::1] centers, - floating[::1] weight_sums, - int[::1] labels, - floating[::1] old_center, - bint compute_squared_diff): +def _minibatch_update_dense( + np.ndarray[floating, ndim=2, mode='c'] X, + floating[::1] sample_weight, + floating[:, ::1] centers, + floating[:, ::1] centers_new, + floating[::1] weight_sums, + int[::1] labels): + """""" cdef: - floating squared_diff = 0 - int n_clusters = centers.shape[0] int n_samples = 
X.shape[0] - int n_features = X.shape[1] - int i, j, k - floating wsum, alpha, tmp - - with nogil: - for i in range(n_clusters): - wsum = 0 - for j in prange(n_samples): - if labels[j] == i: - wsum += sample_weight[j] + int n_clusters = centers.shape[0] + int i - if wsum > 0: - if compute_squared_diff: - for k in prange(n_features): - old_center[k] = centers[i, k] - - # inplace remove previous count scaling - for k in prange(n_features): - centers[i, k] = centers[i, k] * weight_sums[i] - - for j in range(n_samples): - if labels[j] == i: - for k in range(n_features): - centers[i, k] = centers[i, k] + X[j, k] * sample_weight[j] - - # update the count statistics for this center - weight_sums[i] = weight_sums[i] + wsum - - # inplace rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] - for k in prange(n_features): - centers[i, k] = centers[i, k] * alpha - - # update the squared diff if necessary - if compute_squared_diff: - for k in prange(n_features): - tmp = centers[i, k] - old_center[k] - squared_diff += tmp * tmp + int *indices - return squared_diff \ No newline at end of file + with nogil, parallel(): + indices = malloc(n_samples * sizeof(int)) + + for i in prange(n_clusters): + update_cluster(i, &X[0, 0], centers, centers_new, labels, + sample_weight, weight_sums, indices) + + free(indices) + + +cdef void update_cluster( + int i, + floating *X, + floating[:, ::1] centers, + floating[:, ::1] centers_new, + int[::1] labels, + floating[::1] sample_weight, + floating[::1] weight_sums, + int *indices) nogil: + """""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers.shape[1] + floating alpha, tmp + int n_indices + int j, k, idx + + floating wsum = 0 + + # indices = np.where(labels == i) + k = 0 + for j in range(n_samples): + if labels[j] == i: + indices[k] = j + k += 1 + n_indices = k + + for j in range(n_indices): + idx = indices[j] + wsum += sample_weight[idx] + + if wsum > 0: + # inplace remove previous count scaling + for k in range(n_features): + centers_new[i, k] = centers[i, k] * weight_sums[i] + + # update cluster with new point members + for j in range(n_indices): + idx = indices[j] + for k in range(n_features): + centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] + + # update the count statistics for this center + weight_sums[i] += wsum + + # inplace rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[i] + for k in range(n_features): + centers_new[i, k] *= alpha + else: + for k in range(n_features): + centers_new[i, k] = centers[i, k] diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index 93e2c6f0b9c89..747c841f6fe11 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ b/sklearn/cluster/_k_means_lloyd.pyx @@ -11,7 +11,7 @@ cimport numpy as np from cython cimport floating from cython.parallel import prange, parallel from libc.stdlib cimport malloc, calloc, free -from libc.string cimport memset, memcpy +from libc.string cimport memset from libc.float cimport DBL_MAX, FLT_MAX from ..utils.extmath import row_norms diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 87141eed933c5..623643e61f511 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -32,6 +32,7 @@ from ._k_means_fast import _inertia_sparse from ._k_means_fast import _mini_batch_update_csr from ._k_means_fast import _minibatch_update_dense +from ._k_means_fast import _minibatch_update_dense4 from ._k_means_lloyd import _lloyd_iter_chunked_dense from 
._k_means_lloyd import _lloyd_iter_chunked_sparse from ._k_means_elkan import _init_bounds_dense @@ -1136,7 +1137,7 @@ def score(self, X, y=None, sample_weight=None): def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, - old_center_buffer, compute_squared_diff, random_state, + centers_new, compute_squared_diff, random_state, random_reassign=False, reassignment_ratio=.01, verbose=False): """Incremental update of the centers for the Minibatch K-Means algorithm. @@ -1197,6 +1198,18 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, centers) + # implementation for the sparse CSR representation completely written in + # cython + if sp.issparse(X): + _mini_batch_update_csr( + X, sample_weight, x_squared_norms, centers, weight_sums, + labels, centers_new, compute_squared_diff) + + # dense variant in mostly numpy (not as memory efficient though. + else: + _minibatch_update_dense( + X, sample_weight, centers, centers_new, weight_sums, labels) + if random_reassign and reassignment_ratio > 0: # Reassign clusters that have very low weight to_reassign = weight_sums < reassignment_ratio * weight_sums.max() @@ -1208,6 +1221,7 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, n_reassigns = to_reassign.sum() if n_reassigns: # Pick new clusters amongst observations with uniform probability + # TODO proba ~ distance like kmeans++ ? new_centers = random_state.choice(X.shape[0], replace=False, size=n_reassigns) if verbose: @@ -1220,63 +1234,42 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, np.where(to_reassign)[0].astype(np.intp, copy=False), centers) else: - centers[to_reassign] = X[new_centers] + centers_new[to_reassign] = X[new_centers] # reset counts of reassigned centers, but don't reset them too small # to avoid instant reassignment. This is a pretty dirty hack as it # also modifies the learning rates. weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) - # implementation for the sparse CSR representation completely written in - # cython - if sp.issparse(X): - return inertia, _mini_batch_update_csr( - X, sample_weight, x_squared_norms, centers, weight_sums, - labels, old_center_buffer, compute_squared_diff) - - # dense variant in mostly numpy (not as memory efficient though. 
- else: - return inertia, _minibatch_update_dense( - X, sample_weight, centers, weight_sums, labels, - old_center_buffer, compute_squared_diff) + return inertia, None -def _minibatch_update_dense2(X, sample_weight, centers, weight_sums, labels, - old_center_buffer, compute_squared_diff): - squared_diff = 0.0 - for center_idx in range(centers.shape[0]): +def _minibatch_update_dense3(X, sample_weight, centers, centers_new, + weight_sums, labels): + for i in range(centers.shape[0]): # find points from minibatch that are assigned to this center - center_mask = labels == center_idx - wsum = sample_weight[center_mask].sum() + mask = labels == i + wsum = sample_weight[mask].sum() if wsum > 0: - if compute_squared_diff: - old_center_buffer[:] = centers[center_idx] - # inplace remove previous count scaling - centers[center_idx] *= weight_sums[center_idx] + centers_new[i] = centers[i] * weight_sums[i] # inplace sum with new points members of this cluster - centers[center_idx] += \ - np.sum(X[center_mask] * - sample_weight[center_mask, np.newaxis], axis=0) + centers_new[i] += np.sum( + X[mask] * sample_weight[mask, np.newaxis], axis=0) # update the count statistics for this center - weight_sums[center_idx] += wsum + weight_sums[i] += wsum # inplace rescale to compute mean of all points (old and new) # Note: numpy >= 1.10 does not support '/=' for the following # expression for a mixture of int and float (see numpy issue #6464) - centers[center_idx] = centers[center_idx] / weight_sums[center_idx] - - # update the squared diff if necessary - if compute_squared_diff: - diff = centers[center_idx].ravel() - old_center_buffer.ravel() - squared_diff += np.dot(diff, diff) - - return squared_diff + centers_new[i] /= weight_sums[i] + else: + centers_new[i] = centers[i] -def _minibatch_update_dense3(X, sample_weight, centers, weight_sums, labels, +def _minibatch_update_dense2(X, sample_weight, centers, weight_sums, labels, old_center_buffer, compute_squared_diff): squared_diff = 0.0 for i in range(X.shape[0]): @@ -1676,9 +1669,12 @@ def fit(self, X, y=None, sample_weight=None): print(f"Inertia for init {init_idx + 1}/{self._n_init}: " f"{inertia}") if best_inertia is None or inertia < best_inertia: - self.cluster_centers_ = cluster_centers + init_centers = cluster_centers best_inertia = inertia + centers = init_centers + centers_new = np.empty_like(centers) + # Initialize counts self._counts = np.zeros(self.n_clusters, dtype=X.dtype) @@ -1699,9 +1695,9 @@ def fit(self, X, y=None, sample_weight=None): X=X[minibatch_indices], sample_weight=sample_weight[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], - centers=self.cluster_centers_, + centers=centers, weight_sums=self._counts, - old_center_buffer=old_center_buffer, + centers_new=centers_new, compute_squared_diff=self._tol > 0.0, random_state=random_state, # Here we randomly choose whether to perform @@ -1714,6 +1710,9 @@ def fit(self, X, y=None, sample_weight=None): reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) + centers_squared_diff = np.sum((centers_new - centers)**2) + centers, centers_new = centers_new, centers + # Monitor convergence and do early stopping if necessary if _mini_batch_convergence( self, iteration_idx, n_iter, self._tol, n_samples, @@ -1721,6 +1720,8 @@ def fit(self, X, y=None, sample_weight=None): verbose=self.verbose): break + self.cluster_centers_ = centers + self.n_iter_ = iteration_idx + 1 if self.compute_labels: From bcaa02255dc7e673a2dd637aa3d829c01928403e Mon Sep 17 00:00:00 2001 From: jeremie 
du boisberranger Date: Fri, 6 Mar 2020 14:50:39 +0100 Subject: [PATCH 07/72] wip --- ...{_k_means_fast.pxd => _k_means_common.pxd} | 0 ...{_k_means_fast.pyx => _k_means_common.pyx} | 230 ------------ sklearn/cluster/_k_means_elkan.pyx | 12 +- sklearn/cluster/_k_means_lloyd.pyx | 6 +- sklearn/cluster/_k_means_minibatch.pyx | 255 ++++++++++++++ sklearn/cluster/_kmeans.py | 327 ++++++++---------- sklearn/cluster/setup.py | 9 +- sklearn/cluster/tests/test_k_means.py | 27 +- 8 files changed, 428 insertions(+), 438 deletions(-) rename sklearn/cluster/{_k_means_fast.pxd => _k_means_common.pxd} (100%) rename sklearn/cluster/{_k_means_fast.pyx => _k_means_common.pyx} (53%) create mode 100644 sklearn/cluster/_k_means_minibatch.pyx diff --git a/sklearn/cluster/_k_means_fast.pxd b/sklearn/cluster/_k_means_common.pxd similarity index 100% rename from sklearn/cluster/_k_means_fast.pxd rename to sklearn/cluster/_k_means_common.pxd diff --git a/sklearn/cluster/_k_means_fast.pyx b/sklearn/cluster/_k_means_common.pyx similarity index 53% rename from sklearn/cluster/_k_means_fast.pyx rename to sklearn/cluster/_k_means_common.pyx index f9ba2245d43c9..38276a0baa50f 100644 --- a/sklearn/cluster/_k_means_fast.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -14,11 +14,8 @@ import numpy as np cimport numpy as np -cimport cython from cython cimport floating -from cython.parallel cimport parallel, prange from libc.math cimport sqrt -from libc.stdlib cimport malloc, free from ..utils.extmath import row_norms @@ -26,10 +23,6 @@ from ..utils.extmath import row_norms np.import_array() -ctypedef np.float64_t DOUBLE -ctypedef np.int32_t INT - - cdef floating _euclidean_dense_dense( floating* a, # IN floating* b, # IN @@ -285,226 +278,3 @@ cdef void _center_shift( for j in range(n_clusters): center_shift[j] = _euclidean_dense_dense( ¢ers_new[j, 0], ¢ers_old[j, 0], n_features, False) - - -def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight, - np.ndarray[floating, ndim=1] x_squared_norms, - np.ndarray[floating, ndim=2] centers, - np.ndarray[floating, ndim=1] weight_sums, - np.ndarray[INT, ndim=1] nearest_center, - np.ndarray[floating, ndim=1] old_center, - int compute_squared_diff): - """Incremental update of the centers for sparse MiniBatchKMeans. - - Parameters - ---------- - - X : CSR matrix, dtype float - The complete (pre allocated) training set as a CSR matrix. - - centers : array, shape (n_clusters, n_features) - The cluster centers - - counts : array, shape (n_clusters,) - The vector in which we keep track of the numbers of elements in a - cluster - - Returns - ------- - inertia : float - The inertia of the batch prior to centers update, i.e. the sum - of squared distances to the closest center for each sample. This - is the objective function being minimized by the k-means algorithm. - - squared_diff : float - The sum of squared update (squared norm of the centers position - change). If compute_squared_diff is 0, this computation is skipped and - 0.0 is returned instead. - - Both squared diff and inertia are commonly used to monitor the convergence - of the algorithm. 
- """ - cdef: - np.ndarray[floating, ndim=1] X_data = X.data - np.ndarray[int, ndim=1] X_indices = X.indices - np.ndarray[int, ndim=1] X_indptr = X.indptr - unsigned int n_samples = X.shape[0] - unsigned int n_clusters = centers.shape[0] - unsigned int n_features = centers.shape[1] - - unsigned int sample_idx, center_idx, feature_idx - unsigned int k - DOUBLE old_weight_sum, new_weight_sum - DOUBLE center_diff - DOUBLE squared_diff = 0.0 - - # move centers to the mean of both old and newly assigned samples - for center_idx in range(n_clusters): - old_weight_sum = weight_sums[center_idx] - new_weight_sum = old_weight_sum - - # count the number of samples assigned to this center - for sample_idx in range(n_samples): - if nearest_center[sample_idx] == center_idx: - new_weight_sum += sample_weight[sample_idx] - - if new_weight_sum == old_weight_sum: - # no new sample: leave this center as it stands - continue - - # rescale the old center to reflect it previous accumulated weight - # with regards to the new data that will be incrementally contributed - if compute_squared_diff: - old_center[:] = centers[center_idx] - centers[center_idx] *= old_weight_sum - - # iterate of over samples assigned to this cluster to move the center - # location by inplace summation - for sample_idx in range(n_samples): - if nearest_center[sample_idx] != center_idx: - continue - - # inplace sum with new samples that are members of this cluster - # and update of the incremental squared difference update of the - # center position - for k in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): - centers[center_idx, X_indices[k]] += X_data[k] - - # inplace rescale center with updated count - if new_weight_sum > old_weight_sum: - # update the count statistics for this center - weight_sums[center_idx] = new_weight_sum - - # re-scale the updated center with the total new counts - centers[center_idx] /= new_weight_sum - - # update the incremental computation of the squared total - # centers position change - if compute_squared_diff: - for feature_idx in range(n_features): - squared_diff += (old_center[feature_idx] - - centers[center_idx, feature_idx]) ** 2 - - return squared_diff - - -def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, - floating[::1] sample_weight, - floating[:, ::1] centers, - floating[::1] weight_sums, - int[::1] labels, - floating[::1] old_center, - bint compute_squared_diff): - cdef: - floating squared_diff = 0 - int n_samples = X.shape[0] - int n_features = X.shape[1] - int i, j, label - floating weight_sum, tmp, lr - - with nogil: - # for i in prange(n_samples, nogil=True): - for i in range(n_samples): - label = labels[i] - - # update center weight - weight_sum = weight_sums[label] + sample_weight[i] - - # learning rate - if weight_sum > 0: - lr = 1 / weight_sum - - if compute_squared_diff: - for j in range(n_features): - old_center[j] = centers[label, j] - - for j in range(n_features): - centers[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] - - if compute_squared_diff: - for j in range(n_features): - tmp = centers[label, j] - old_center[j] - squared_diff += tmp * tmp - - weight_sums[label] = weight_sum - - return squared_diff - - -def _minibatch_update_dense( - np.ndarray[floating, ndim=2, mode='c'] X, - floating[::1] sample_weight, - floating[:, ::1] centers, - floating[:, ::1] centers_new, - floating[::1] weight_sums, - int[::1] labels): - """""" - cdef: - int n_samples = X.shape[0] - int n_clusters = centers.shape[0] - int i - - int *indices - - with nogil, 
parallel(): - indices = malloc(n_samples * sizeof(int)) - - for i in prange(n_clusters): - update_cluster(i, &X[0, 0], centers, centers_new, labels, - sample_weight, weight_sums, indices) - - free(indices) - - -cdef void update_cluster( - int i, - floating *X, - floating[:, ::1] centers, - floating[:, ::1] centers_new, - int[::1] labels, - floating[::1] sample_weight, - floating[::1] weight_sums, - int *indices) nogil: - """""" - cdef: - int n_samples = sample_weight.shape[0] - int n_features = centers.shape[1] - floating alpha, tmp - int n_indices - int j, k, idx - - floating wsum = 0 - - # indices = np.where(labels == i) - k = 0 - for j in range(n_samples): - if labels[j] == i: - indices[k] = j - k += 1 - n_indices = k - - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - - if wsum > 0: - # inplace remove previous count scaling - for k in range(n_features): - centers_new[i, k] = centers[i, k] * weight_sums[i] - - # update cluster with new point members - for j in range(n_indices): - idx = indices[j] - for k in range(n_features): - centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] - - # update the count statistics for this center - weight_sums[i] += wsum - - # inplace rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] - for k in range(n_features): - centers_new[i, k] *= alpha - else: - for k in range(n_features): - centers_new[i, k] = centers[i, k] diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index e95c8fe0490a4..d4a392a7d2d6d 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -18,12 +18,12 @@ from libc.stdlib cimport calloc, free from libc.string cimport memset, memcpy from ..utils.extmath import row_norms -from ._k_means_fast cimport _relocate_empty_clusters_dense -from ._k_means_fast cimport _relocate_empty_clusters_sparse -from ._k_means_fast cimport _euclidean_dense_dense -from ._k_means_fast cimport _euclidean_sparse_dense -from ._k_means_fast cimport _average_centers -from ._k_means_fast cimport _center_shift +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _euclidean_dense_dense +from ._k_means_common cimport _euclidean_sparse_dense +from ._k_means_common cimport _average_centers +from ._k_means_common cimport _center_shift np.import_array() diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index 747c841f6fe11..00377ae098458 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ b/sklearn/cluster/_k_means_lloyd.pyx @@ -17,9 +17,9 @@ from libc.float cimport DBL_MAX, FLT_MAX from ..utils.extmath import row_norms from ..utils._cython_blas cimport _gemm from ..utils._cython_blas cimport RowMajor, Trans, NoTrans -from ._k_means_fast cimport _relocate_empty_clusters_dense -from ._k_means_fast cimport _relocate_empty_clusters_sparse -from ._k_means_fast cimport _average_centers, _center_shift +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _average_centers, _center_shift np.import_array() diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx new file mode 100644 index 0000000000000..5132d219e6466 --- /dev/null +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -0,0 +1,255 @@ +# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True + +# 
TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This is fixed in cython > 0.3. + +cimport numpy as np +from cython cimport floating +from cython.parallel cimport parallel, prange +from libc.math cimport sqrt +from libc.stdlib cimport malloc, free + + +np.import_array() + + +def _minibatch_update_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels): # IN + """Update of the centers for dense MiniBatchKMeans. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int i + + int *indices + + with nogil, parallel(): + indices = malloc(n_samples * sizeof(int)) + + for i in prange(n_clusters): + update_center_dense(i, &X[0, 0], sample_weight, centers_old, + centers_new, weight_sums, labels, indices) + + free(indices) + + +cdef void update_center_dense( + int i, + floating *X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int *indices) nogil: # OUT + """Update of a single center for dense MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha, tmp + int n_indices + int j, k, idx + + floating wsum = 0 + + # indices = np.where(labels == i)[0] + k = 0 + for j in range(n_samples): + if labels[j] == i: + indices[k] = j + k += 1 + n_indices = k + + for j in range(n_indices): + idx = indices[j] + wsum += sample_weight[idx] + + if wsum > 0: + # Remove previous count scaling + for k in range(n_features): + centers_new[i, k] = centers_old[i, k] * weight_sums[i] + + # Update cluster with new point members + for j in range(n_indices): + idx = indices[j] + for k in range(n_features): + centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] + + # Update the count statistics for this center + weight_sums[i] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[i] + for k in range(n_features): + centers_new[i, k] *= alpha + else: + for k in range(n_features): + centers_new[i, k] = centers_old[i, k] + + +def _minibatch_update_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels): # IN + """Update of the centers for sparse MiniBatchKMeans. 
+ + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int i + + int *indices + + with nogil, parallel(): + indices = malloc(n_samples * sizeof(int)) + + for i in prange(n_clusters): + update_center_sparse(i, X_data, X_indices, X_indptr, sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) + + free(indices) + + +cdef void update_center_sparse( + int i, + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int *indices) nogil: # OUT + """Update of a single center for sparse MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha, tmp + int n_indices + int j, k, idx + + floating wsum = 0 + + # indices = np.where(labels == i)[0] + k = 0 + for j in range(n_samples): + if labels[j] == i: + indices[k] = j + k += 1 + n_indices = k + + for j in range(n_indices): + idx = indices[j] + wsum += sample_weight[idx] + + if wsum > 0: + # Remove previous count scaling + for k in range(n_features): + centers_new[i, k] = centers_old[i, k] * weight_sums[i] + + # Update cluster with new point members + for j in range(n_indices): + idx = indices[j] + for k in range(X_indptr[idx], X_indptr[idx + 1]): + centers_new[i, X_indices[k]] += X_data[k] * sample_weight[idx] + + # Update the count statistics for this center + weight_sums[i] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[i] + for k in range(n_features): + centers_new[i, k] *= alpha + else: + for k in range(n_features): + centers_new[i, k] = centers_old[i, k] + + +def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, + floating[::1] sample_weight, + floating[:, ::1] centers, + floating[:, ::1] centers_new, + floating[::1] weight_sums, + int[::1] labels): + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j, label + floating weight_sum, tmp, lr + + # for i in prange(n_samples, nogil=True): + for i in range(n_samples): + label = labels[i] + + # update center weight + weight_sum = weight_sums[label] + sample_weight[i] + + # learning rate + if weight_sum > 0: + lr = 1 / weight_sum + + for j in range(n_features): + centers_new[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] + else: + centers_new[label, j] = centers[label, j] + + weight_sums[label] = weight_sum \ No newline at end of file diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 
623643e61f511..514783eaf3b51 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -28,11 +28,11 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils._openmp_helpers import _openmp_effective_n_threads from ..exceptions import ConvergenceWarning -from ._k_means_fast import _inertia_dense -from ._k_means_fast import _inertia_sparse -from ._k_means_fast import _mini_batch_update_csr -from ._k_means_fast import _minibatch_update_dense -from ._k_means_fast import _minibatch_update_dense4 +from ._k_means_common import _inertia_dense +from ._k_means_common import _inertia_sparse +from ._k_means_minibatch import _minibatch_update_sparse +from ._k_means_minibatch import _minibatch_update_dense +from ._k_means_minibatch import _minibatch_update_dense4 from ._k_means_lloyd import _lloyd_iter_chunked_dense from ._k_means_lloyd import _lloyd_iter_chunked_sparse from ._k_means_elkan import _init_bounds_dense @@ -1136,89 +1136,80 @@ def score(self, X, y=None, sample_weight=None): self.cluster_centers_, self._n_threads)[1] -def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, - centers_new, compute_squared_diff, random_state, - random_reassign=False, reassignment_ratio=.01, - verbose=False): +def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, + weight_sums, random_state, random_reassign=False, + reassignment_ratio=0.01, verbose=False): """Incremental update of the centers for the Minibatch K-Means algorithm. Parameters ---------- - X : array, shape (n_samples, n_features) - The original data array. + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The original data array. In sparse, must be in CSR format. + + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. - sample_weight : array-like, shape (n_samples,) + sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. - x_squared_norms : array, shape (n_samples,) - Squared euclidean norm of each data point. + # TODO better + centers : ndarray of shape (n_clusters, n_features) + The cluster centers. - centers : array, shape (k, n_features) - The cluster centers. This array is MODIFIED IN PLACE + centers_new : ndarray of shape (n_clusters, n_features) + TODO - counts : array, shape (k,) - The vector in which we keep track of the numbers of elements in a - cluster. This array is MODIFIED IN PLACE + weight_sums : ndarray of shape (n_clusters,) + The vector in which we keep track of the numbers of points in a + cluster. This array is modified in place. random_state : RandomState instance Determines random number generation for low count centers reassignment. See :term:`Glossary `. - random_reassign : boolean, optional + random_reassign : boolean, default=False If True, centers with very low counts are randomly reassigned to observations. - reassignment_ratio : float, optional + reassignment_ratio : float, default=0.01 Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count centers are more likely to be reassigned, which means that the model will take longer to converge, but should converge in a better clustering. - verbose : bool, optional, default False + verbose : bool, default=False Controls the verbosity. - compute_squared_diff : bool - If set to False, the squared diff computation is skipped. - - old_center_buffer : int - Copy of old centers for monitoring convergence. 
- Returns ------- inertia : float Sum of squared distances of samples to their closest cluster center. - - squared_diff : numpy array, shape (n_clusters,) - Squared distances between previous and updated cluster centers. - """ # Perform label assignment to nearest centers labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, centers) - # implementation for the sparse CSR representation completely written in - # cython + # Update centers according to the labels if sp.issparse(X): - _mini_batch_update_csr( - X, sample_weight, x_squared_norms, centers, weight_sums, - labels, centers_new, compute_squared_diff) - - # dense variant in mostly numpy (not as memory efficient though. + _minibatch_update_sparse( + X, sample_weight, centers, centers_new, weight_sums, labels) else: _minibatch_update_dense( X, sample_weight, centers, centers_new, weight_sums, labels) + # Reassign clusters that have very low weight if random_reassign and reassignment_ratio > 0: - # Reassign clusters that have very low weight to_reassign = weight_sums < reassignment_ratio * weight_sums.max() + # pick at most .5 * batch_size samples as new centers if to_reassign.sum() > .5 * X.shape[0]: indices_dont_reassign = \ np.argsort(weight_sums)[int(.5 * X.shape[0]):] to_reassign[indices_dont_reassign] = False n_reassigns = to_reassign.sum() + if n_reassigns: # Pick new clusters amongst observations with uniform probability # TODO proba ~ distance like kmeans++ ? @@ -1228,134 +1219,20 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, print(f"[MiniBatchKMeans] Reassigning {n_reassigns} " f"cluster centers.") - if sp.issparse(X) and not sp.issparse(centers): + if sp.issparse(X): assign_rows_csr( X, new_centers.astype(np.intp, copy=False), np.where(to_reassign)[0].astype(np.intp, copy=False), centers) else: centers_new[to_reassign] = X[new_centers] + # reset counts of reassigned centers, but don't reset them too small # to avoid instant reassignment. This is a pretty dirty hack as it # also modifies the learning rates. 
weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) - return inertia, None - - -def _minibatch_update_dense3(X, sample_weight, centers, centers_new, - weight_sums, labels): - for i in range(centers.shape[0]): - # find points from minibatch that are assigned to this center - mask = labels == i - wsum = sample_weight[mask].sum() - - if wsum > 0: - # inplace remove previous count scaling - centers_new[i] = centers[i] * weight_sums[i] - - # inplace sum with new points members of this cluster - centers_new[i] += np.sum( - X[mask] * sample_weight[mask, np.newaxis], axis=0) - - # update the count statistics for this center - weight_sums[i] += wsum - - # inplace rescale to compute mean of all points (old and new) - # Note: numpy >= 1.10 does not support '/=' for the following - # expression for a mixture of int and float (see numpy issue #6464) - centers_new[i] /= weight_sums[i] - else: - centers_new[i] = centers[i] - - -def _minibatch_update_dense2(X, sample_weight, centers, weight_sums, labels, - old_center_buffer, compute_squared_diff): - squared_diff = 0.0 - for i in range(X.shape[0]): - label = labels[i] - - # update center weight - weight_sums[label] += sample_weight[i] - - # learning rate - if weight_sums[label] > 0: - lr = 1 / weight_sums[label] - - if compute_squared_diff: - old_center_buffer[:] = centers[label] - - centers[label] *= (1 - lr) - centers[label] += lr * X[i] - - if compute_squared_diff: - diff = centers[label].ravel() - old_center_buffer.ravel() - squared_diff += np.dot(diff, diff) - - return squared_diff - - -def _mini_batch_convergence(model, iteration_idx, n_iter, tol, - n_samples, centers_squared_diff, batch_inertia, - context, verbose=0): - """Helper function to encapsulate the early stopping logic""" - # Normalize inertia to be able to compare values when - # batch_size changes - batch_inertia /= model.batch_size - centers_squared_diff /= model.batch_size - - # Compute an Exponentially Weighted Average of the squared - # diff to monitor the convergence while discarding - # minibatch-local stochastic variability: - # https://en.wikipedia.org/wiki/Moving_average - ewa_diff = context.get('ewa_diff') - ewa_inertia = context.get('ewa_inertia') - if ewa_diff is None: - ewa_diff = centers_squared_diff - ewa_inertia = batch_inertia - else: - alpha = model.batch_size * 2.0 / (n_samples + 1) - alpha = min(alpha, 1.0) - ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha - ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha - - # Log progress to be able to monitor convergence - if verbose: - progress_msg = (f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " - f"mean batch inertia: {batch_inertia}, ewa inertia: " - f"{ewa_inertia}") - print(progress_msg) - - # Early stopping based on absolute tolerance on squared change of - # centers position (using EWA smoothing) - if tol > 0.0 and ewa_diff <= tol: - if verbose: - print(f"Converged (small centers change) at iteration " - f"{iteration_idx + 1}/{n_iter}") - return True - - # Early stopping heuristic due to lack of improvement on smoothed inertia - ewa_inertia_min = context.get('ewa_inertia_min') - no_improvement = context.get('no_improvement', 0) - if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: - no_improvement = 0 - ewa_inertia_min = ewa_inertia - else: - no_improvement += 1 - - if (model.max_no_improvement is not None - and no_improvement >= model.max_no_improvement): - if verbose: - print(f"Converged (lack of improvement in inertia) at iteration " - f"{iteration_idx}/{n_iter}") 
- return True - - # update the convergence context to maintain state across successive calls: - context['ewa_diff'] = ewa_diff - context['ewa_inertia'] = ewa_inertia - context['ewa_inertia_min'] = ewa_inertia_min - context['no_improvement'] = no_improvement - return False + return inertia class MiniBatchKMeans(KMeans): @@ -1592,6 +1469,78 @@ def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, labels, inertia = zip(*results) return np.hstack(labels), np.sum(inertia) + def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, + centers_squared_diff, batch_inertia): + """Helper function to encapsulate the early stopping logic""" + # Normalize inertia to be able to compare values when + # batch_size changes + batch_inertia /= self.batch_size + centers_squared_diff /= self.batch_size + + # We skip the first iteration because it would lead to a bad + # initialization of ewa_diff and ewa_inertia. The reason is that + # inertia is computed on centers before they are updated. Before the + # first iteration, centers are not yet the mean of their cluster. + if iteration_idx == 0: + if self.verbose: + print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " + f"mean batch inertia: {batch_inertia}, ewa inertia: " + f"-") + return False + + # Compute an Exponentially Weighted Average of the squared diff to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + ewa_diff = self._ewa_diff + ewa_inertia = self._ewa_inertia + if ewa_diff is None: + ewa_diff = centers_squared_diff + ewa_inertia = batch_inertia + else: + alpha = self.batch_size * 2.0 / (n_samples + 1) + ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha + ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha + + # Log progress to be able to monitor convergence + if self.verbose: + print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " + f"mean batch inertia: {batch_inertia}, ewa inertia: " + f"{ewa_inertia}") + + # Early stopping based on absolute tolerance on squared change of + # centers position (using EWA smoothing) + if self._tol > 0.0 and ewa_diff <= self._tol: + if self.verbose: + print(f"Converged (small centers change) at iteration " + f"{iteration_idx + 1}/{n_iter}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # inertia + ewa_inertia_min = self._ewa_inertia_min + no_improvement = self._no_improvement + if iteration_idx >= 5: + if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: + no_improvement = 0 + ewa_inertia_min = ewa_inertia + else: + no_improvement += 1 + + if (self.max_no_improvement is not None + and no_improvement >= self.max_no_improvement): + if self.verbose: + print(f"Converged (lack of improvement in inertia) at " + f"iteration {iteration_idx}/{n_iter}") + return True + + # update the convergence context to maintain state across successive + # calls: + self._ewa_diff = ewa_diff + self._ewa_inertia = ewa_inertia + self._ewa_inertia_min = ewa_inertia_min + self._no_improvement = no_improvement + return False + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. @@ -1601,6 +1550,8 @@ def fit(self, X, y=None, sample_weight=None): Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. 
y : Ignored Not used, present here for API consistency by convention. @@ -1632,18 +1583,9 @@ def fit(self, X, y=None, sample_weight=None): # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) - if self._tol > 0.0: - # using tol-based early stopping needs the allocation of a - # dedicated before which can be expensive for high dim data: - # hence we allocate it outside of the main loop - old_center_buffer = np.zeros(n_features, dtype=X.dtype) - else: - # no need for the center buffer if tol-based early stopping is - # disabled - old_center_buffer = np.zeros(0, dtype=X.dtype) - validation_indices = random_state.randint(0, n_samples, - self._init_size) + # self._init_size, + self.batch_size) X_valid = X[validation_indices] sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] @@ -1678,51 +1620,55 @@ def fit(self, X, y=None, sample_weight=None): # Initialize counts self._counts = np.zeros(self.n_clusters, dtype=X.dtype) - # Empty conext to be used inplace by the convergence check routine - convergence_context = {} + # Attributes to monitor the convergence + self._ewa_diff = None + self._ewa_inertia = None + self._ewa_inertia_min = None + self._no_improvement = 0 n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) # Perform the iterative optimization until convergence - for iteration_idx in range(n_iter): + for i in range(n_iter): # Sample a minibatch from the full dataset minibatch_indices = random_state.randint(0, n_samples, self.batch_size) + # Here we randomly choose whether to perform random reassignment: + # the choice is done as a function of the iteration index, and the + # minimum number of counts, in order to force this reassignment to + # happen every once in a while. 
+ random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + # Perform the actual update step on the minibatch data - batch_inertia, centers_squared_diff = _mini_batch_step( + batch_inertia = _mini_batch_step( X=X[minibatch_indices], - sample_weight=sample_weight[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], centers=centers, - weight_sums=self._counts, centers_new=centers_new, - compute_squared_diff=self._tol > 0.0, + weight_sums=self._counts, random_state=random_state, - # Here we randomly choose whether to perform - # random reassignment: the choice is done as a function - # of the iteration index, and the minimum number of - # counts, in order to force this reassignment to happen - # every once in a while - random_reassign=((iteration_idx + 1) - % (10 + int(self._counts.min())) == 0), + random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) - centers_squared_diff = np.sum((centers_new - centers)**2) + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers)**2) + else: + centers_squared_diff = 0 + centers, centers_new = centers_new, centers # Monitor convergence and do early stopping if necessary - if _mini_batch_convergence( - self, iteration_idx, n_iter, self._tol, n_samples, - centers_squared_diff, batch_inertia, convergence_context, - verbose=self.verbose): + if self._mini_batch_convergence( + i, n_iter, n_samples, centers_squared_diff, batch_inertia): break self.cluster_centers_ = centers - self.n_iter_ = iteration_idx + 1 + self.n_iter_ = i + 1 if self.compute_labels: self.labels_, self.inertia_ = self._labels_inertia_minibatch( @@ -1799,12 +1745,11 @@ def partial_fit(self, X, y=None, sample_weight=None): f"data {self.cluster_centers_.shape[1]}.") _mini_batch_step(X, - sample_weight=sample_weight, x_squared_norms=x_squared_norms, + sample_weight=sample_weight, centers=self.cluster_centers_, + centers_new=self.cluster_centers_, weight_sums=self._counts, - old_center_buffer=np.zeros(0, dtype=X.dtype), - compute_squared_diff=False, random_state=self._random_state, random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py index 48ed25c5c0eaf..9a85541731e5f 100644 --- a/sklearn/cluster/setup.py +++ b/sklearn/cluster/setup.py @@ -25,8 +25,8 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) - config.add_extension('_k_means_fast', - sources=['_k_means_fast.pyx'], + config.add_extension('_k_means_common', + sources=['_k_means_common.pyx'], include_dirs=[numpy.get_include()], libraries=libraries) @@ -40,6 +40,11 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) + config.add_extension('_k_means_minibatch', + sources=['_k_means_minibatch.pyx'], + include_dirs=[numpy.get_include()], + libraries=libraries) + config.add_subpackage('tests') return config diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index dcc16e904cff7..bf4f0e03f829d 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -72,8 +72,8 @@ def test_all_init(estimator, data, init): @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_result_equal_in_diff_n_threads(estimator): - # Check that KMeans gives the same results in parallel mode than in - # sequential mode. 
+ # Check that KMeans/MiniBatchKMeans give the same results in parallel mode + # than in sequential mode. rnd = np.random.RandomState(0) X = rnd.normal(size=(50, 10)) @@ -132,20 +132,34 @@ def test_unit_weights_vs_no_weights(estimator): assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_scaled_weights(estimator): +def test_scaled_weights(estimator, data): # Check that scaling all sample weights by a common factor # shouldn't change the result sample_weight = np.random.uniform(n_samples) km = estimator(n_clusters=n_clusters, random_state=42) - km_orig = clone(km).fit(X, sample_weight=sample_weight) - km_scaled = clone(km).fit(X, sample_weight=0.5 * sample_weight) + km_orig = clone(km).fit(data, sample_weight=sample_weight) + km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) assert_array_equal(km_orig.labels_, km_scaled.labels_) assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_dense_sparse(estimator): + # Check that the results are the same for dense and sparse input. + sample_weight = np.random.RandomState(0).random_sample((n_samples,)) + km_dense = estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_dense.fit(X, sample_weight=sample_weight) + km_sparse = estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_sparse.fit(X_csr, sample_weight=sample_weight) + + assert_array_equal(km_dense.labels_, km_sparse.labels_) + assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) + + @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_fortran_aligned_data(estimator): # Check that KMeans works with fortran-aligned data. 
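
A standalone sketch of the dense/sparse consistency check exercised by the new
test_dense_sparse test above, assuming locally generated data in place of the
module-level X / X_csr fixtures (the estimator settings below are illustrative,
not the test's exact fixtures):

import numpy as np
import scipy.sparse as sp
from sklearn.cluster import MiniBatchKMeans

# Small synthetic dataset standing in for the test module's X / X_csr.
rng = np.random.RandomState(0)
X_dense = rng.normal(size=(100, 5))
X_sparse = sp.csr_matrix(X_dense)
sample_weight = rng.random_sample(100)

km_dense = MiniBatchKMeans(n_clusters=3, random_state=0, n_init=1)
km_dense.fit(X_dense, sample_weight=sample_weight)
km_sparse = MiniBatchKMeans(n_clusters=3, random_state=0, n_init=1)
km_sparse.fit(X_sparse, sample_weight=sample_weight)

# With identical seeding, the updated minibatch code is expected to produce the
# same labels and centers for dense and CSR input, which is what the test asserts.
np.testing.assert_array_equal(km_dense.labels_, km_sparse.labels_)
np.testing.assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_)
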
@@ -175,9 +189,10 @@ def test_centers_not_mutated(estimator, dtype): assert not np.may_share_memory(km.cluster_centers_, centers) -@pytest.mark.parametrize("data", [X, X_csr], ids=["sparse", "dense"]) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_float_precision(estimator, data): + # TODO km = estimator(n_init=1, random_state=0) inertia = {} From 76c3589affb40acceaf349056329a138a32903ca Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 6 Mar 2020 14:52:22 +0100 Subject: [PATCH 08/72] wip --- sklearn/cluster/_kmeans.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 514783eaf3b51..2c45bf8873b1d 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1530,7 +1530,7 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, and no_improvement >= self.max_no_improvement): if self.verbose: print(f"Converged (lack of improvement in inertia) at " - f"iteration {iteration_idx}/{n_iter}") + f"iteration {iteration_idx}/{n_iter}") return True # update the convergence context to maintain state across successive @@ -1584,8 +1584,7 @@ def fit(self, X, y=None, sample_weight=None): x_squared_norms = row_norms(X, squared=True) validation_indices = random_state.randint(0, n_samples, - # self._init_size, - self.batch_size) + self._init_size) X_valid = X[validation_indices] sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] From 231542d2f1a0795f824bd8818bfed3382bfedd91 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 6 Mar 2020 15:23:15 +0100 Subject: [PATCH 09/72] wip --- sklearn/cluster/_kmeans.py | 46 ++++++++++---------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 2c45bf8873b1d..bf19e681201be 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -283,8 +283,8 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++', def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, - verbose=False, x_squared_norms=None, - random_state=None, tol=1e-4, n_threads=1): + verbose=False, x_squared_norms=None, tol=1e-4, + n_threads=1): """A single run of k-means lloyd, assumes preparation completed prior. Parameters @@ -307,11 +307,6 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, x_squared_norms : array-like, default=None Precomputed x_squared_norms. - random_state : int, RandomState instance, default=None - Determines random number generation for centroid initialization. Use - an int to make the randomness deterministic. - See :term:`Glossary `. - tol : float, default=1e-4 Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive iterations to declare @@ -340,8 +335,6 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, n_iter : int Number of iterations run. 
""" - random_state = check_random_state(random_state) - n_samples = X.shape[0] n_clusters = centers_init.shape[0] @@ -406,8 +399,8 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, - verbose=False, x_squared_norms=None, - random_state=None, tol=1e-4, n_threads=1): + verbose=False, x_squared_norms=None, tol=1e-4, + n_threads=1): """A single run of k-means lloyd, assumes preparation completed prior. Parameters @@ -430,11 +423,6 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, x_squared_norms : ndarray of shape(n_samples,), default=None Precomputed x_squared_norms. - random_state : int, RandomState instance or None, default=None - Determines random number generation for centroid initialization. Use - an int to make the randomness deterministic. - See :term:`Glossary `. - tol : float, default=1e-4 Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive iterations to declare @@ -463,8 +451,6 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, n_iter : int Number of iterations run. """ - random_state = check_random_state(random_state) - n_clusters = centers_init.shape[0] # Buffers to avoid new allocations at each iteration. @@ -1477,17 +1463,6 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, batch_inertia /= self.batch_size centers_squared_diff /= self.batch_size - # We skip the first iteration because it would lead to a bad - # initialization of ewa_diff and ewa_inertia. The reason is that - # inertia is computed on centers before they are updated. Before the - # first iteration, centers are not yet the mean of their cluster. - if iteration_idx == 0: - if self.verbose: - print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " - f"mean batch inertia: {batch_inertia}, ewa inertia: " - f"-") - return False - # Compute an Exponentially Weighted Average of the squared diff to # monitor the convergence while discarding minibatch-local stochastic # variability: https://en.wikipedia.org/wiki/Moving_average @@ -1601,11 +1576,14 @@ def fit(self, X, y=None, sample_weight=None): X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # Keep the best cluster centers across independent inits based on - # inertia computed on a common validation set. - _, inertia = _labels_inertia(X_valid, sample_weight_valid, - x_squared_norms_valid, - cluster_centers) + # Preform one iteration of KMeans to make the centers being the + # mean of their cluster. 
+ _, inertia, cluster_centers, _ = _kmeans_single_lloyd( + X=X_valid, x_squared_norms=x_squared_norms_valid, + sample_weight=sample_weight_valid, + centers_init=cluster_centers, max_iter=1, tol=0, + n_threads=self._n_threads) + if self.verbose: print(f"Inertia for init {init_idx + 1}/{self._n_init}: " f"{inertia}") From 3f475f61b203a199816a4bc36d27e26e77bc2d03 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 6 Mar 2020 18:00:48 +0100 Subject: [PATCH 10/72] wip --- sklearn/cluster/_kmeans.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index eb4170dea199d..3193e665544be 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1476,6 +1476,7 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, ewa_inertia = batch_inertia else: alpha = self.batch_size * 2.0 / (n_samples + 1) + alpha = min(alpha, 1) ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha @@ -1497,19 +1498,18 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # inertia ewa_inertia_min = self._ewa_inertia_min no_improvement = self._no_improvement - if iteration_idx >= 5: - if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: - no_improvement = 0 - ewa_inertia_min = ewa_inertia - else: - no_improvement += 1 + if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: + no_improvement = 0 + ewa_inertia_min = ewa_inertia + else: + no_improvement += 1 - if (self.max_no_improvement is not None - and no_improvement >= self.max_no_improvement): - if self.verbose: - print(f"Converged (lack of improvement in inertia) at " - f"iteration {iteration_idx}/{n_iter}") - return True + if (self.max_no_improvement is not None + and no_improvement >= self.max_no_improvement): + if self.verbose: + print(f"Converged (lack of improvement in inertia) at " + f"iteration {iteration_idx}/{n_iter}") + return True # update the convergence context to maintain state across successive # calls: @@ -1542,11 +1542,9 @@ def fit(self, X, y=None, sample_weight=None): ------- self """ - # TODO accept_large_sparse ??? X = self._validate_data(X, accept_sparse='csr', dtype=[np.float64, np.float32], - order='C', copy=self.copy_x, - accept_large_sparse=False) + order='C', accept_large_sparse=False) n_samples, n_features = X.shape From f73077b5133117f6db038b6d9b66fdc3dfbef244 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 9 Mar 2020 19:02:02 +0100 Subject: [PATCH 11/72] wip --- sklearn/cluster/_k_means_common.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_k_means_common.pyx b/sklearn/cluster/_k_means_common.pyx index 38276a0baa50f..dde6fe01efa61 100644 --- a/sklearn/cluster/_k_means_common.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -15,6 +15,7 @@ import numpy as np cimport numpy as np from cython cimport floating +from cython.parallel cimport prange from libc.math cimport sqrt from ..utils.extmath import row_norms @@ -95,7 +96,8 @@ cpdef floating _inertia_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers, # IN - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Compute inertia for dense input data Sum of squared distance between each sample and its assigned center. 
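
For reference, a pure NumPy sketch of the quantity _inertia_dense computes; the
Cython version above only adds prange-based threading (the new n_threads
argument) on top of this. The function name is illustrative, and X,
sample_weight, centers and labels are assumed to be the same arrays as in the
Cython signature:

import numpy as np

def inertia_reference(X, sample_weight, centers, labels):
    # Weighted sum of squared euclidean distances between each sample and
    # its assigned center.
    diff = X - centers[labels]
    return float(np.sum(sample_weight * np.einsum("ij,ij->i", diff, diff)))
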
@@ -108,7 +110,7 @@ cpdef floating _inertia_dense( floating sq_dist = 0.0 floating inertia = 0.0 - for i in range(n_samples): + for i in prange(n_samples, nogil=True, num_threads=n_threads): j = labels[i] sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], n_features, True) @@ -121,7 +123,8 @@ cpdef floating _inertia_sparse( X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers, # IN - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Compute inertia for sparse input data Sum of squared distance between each sample and its assigned center. @@ -140,7 +143,7 @@ cpdef floating _inertia_sparse( floating[::1] centers_squared_norms = row_norms(centers, squared=True) - for i in range(n_samples): + for i in prange(n_samples, nogil=True, num_threads=n_threads): j = labels[i] sq_dist = _euclidean_sparse_dense( X_data[X_indptr[i]: X_indptr[i + 1]], From 21d5d24cd8158b515ede391a9a38eed88ed62e6b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 9 Mar 2020 19:23:39 +0100 Subject: [PATCH 12/72] wip --- sklearn/cluster/_kmeans.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 3193e665544be..e1770998d634a 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -375,7 +375,7 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, kth=1, axis=0)[1] if verbose: - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) print(f"Iteration {i}, inertia {inertia}") centers, centers_new = centers_new, centers @@ -394,7 +394,7 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, lower_bounds, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia, centers, i + 1 @@ -473,7 +473,7 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, weight_in_clusters, labels, center_shift, n_threads) if verbose: - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) print(f"Iteration {i}, inertia {inertia}.") centers, centers_new = centers_new, centers @@ -491,7 +491,7 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia, centers, i + 1 @@ -551,7 +551,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia From a5f9cad84a0a7ae8ebafad69434d035b34c4c90d Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 15:01:01 +0100 Subject: [PATCH 13/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 16 ++++++++ sklearn/cluster/_kmeans.py | 51 +++++++++++++++++++------- sklearn/cluster/tests/test_k_means.py | 39 +++++++------------- 3 files changed, 68 insertions(+), 38 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 5132d219e6466..3310298696009 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx 
+++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -9,11 +9,27 @@ from cython cimport floating from cython.parallel cimport parallel, prange from libc.math cimport sqrt from libc.stdlib cimport malloc, free +from libc.string cimport memcpy, memset np.import_array() +def _copy_minibatch_to_buffer( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[:, ::1] minibatch_buffer, # OUT + int[::1] indices, # IN + int n_threads): + """""" + cdef: + int n_samples_minibatch = minibatch_buffer.shape[0] + int n_features = minibatch_buffer.shape[1] + int i, j, idx + + for i in prange(n_samples_minibatch, nogil=True, num_threads=n_threads): + memcpy(&minibatch_buffer[i, 0], &X[indices[i], 0], n_features * sizeof(floating)) + + def _minibatch_update_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index e1770998d634a..6825dea2ae274 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -31,6 +31,7 @@ from ..exceptions import ConvergenceWarning from ._k_means_common import _inertia_dense from ._k_means_common import _inertia_sparse +from ._k_means_minibatch import _copy_minibatch_to_buffer from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_minibatch import _minibatch_update_dense from ._k_means_minibatch import _minibatch_update_dense4 @@ -962,8 +963,7 @@ def fit(self, X, y=None, sample_weight=None): labels, inertia, centers, n_iter_ = kmeans_single( X, sample_weight, centers_init, max_iter=self.max_iter, verbose=self.verbose, tol=self._tol, - x_squared_norms=x_squared_norms, random_state=random_state, - n_threads=self._n_threads) + x_squared_norms=x_squared_norms, n_threads=self._n_threads) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: best_labels = labels @@ -1568,6 +1568,9 @@ def fit(self, X, y=None, sample_weight=None): sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] + # TODO comment + centers_new = np.empty((self.n_clusters, n_features), dtype=X.dtype) + # perform several inits with random sub-sets best_inertia = None for init_idx in range(self._n_init): @@ -1580,26 +1583,36 @@ def fit(self, X, y=None, sample_weight=None): X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # Preform one iteration of KMeans to make the centers being the - # mean of their cluster. - _, inertia, cluster_centers, _ = _kmeans_single_lloyd( - X=X_valid, x_squared_norms=x_squared_norms_valid, + # # Preform one iteration of KMeans to make the centers being the + # # mean of their cluster. 
+ # labels, inertia, cluster_centers, _ = _kmeans_single_lloyd( + # X=X_valid, x_squared_norms=x_squared_norms_valid, + # sample_weight=sample_weight_valid, + # centers_init=cluster_centers, max_iter=1, tol=0, + # n_threads=self._n_threads) + weight_sums = np.zeros(self.n_clusters, dtype=X.dtype) + + inertia = _mini_batch_step( + X=X_valid, + x_squared_norms=x_squared_norms_valid, sample_weight=sample_weight_valid, - centers_init=cluster_centers, max_iter=1, tol=0, - n_threads=self._n_threads) + centers=cluster_centers, + centers_new=centers_new, + weight_sums=weight_sums, + random_state=random_state) if self.verbose: print(f"Inertia for init {init_idx + 1}/{self._n_init}: " f"{inertia}") if best_inertia is None or inertia < best_inertia: init_centers = cluster_centers + self._counts = weight_sums best_inertia = inertia centers = init_centers - centers_new = np.empty_like(centers) # Initialize counts - self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + # self._counts = np.zeros(self.n_clusters, dtype=X.dtype) # Attributes to monitor the convergence self._ewa_diff = None @@ -1610,11 +1623,22 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) + if not sp.issparse(X): + minibatch_buffer = np.empty((self.batch_size, n_features), + dtype=X.dtype) + # Perform the iterative optimization until convergence for i in range(n_iter): # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint(0, n_samples, - self.batch_size) + minibatch_indices = random_state.randint( + 0, n_samples, self.batch_size).astype(np.int32, copy=False) + + if sp.issparse(X): + X_minibatch = X[minibatch_indices] + else: + X_minibatch = minibatch_buffer + _copy_minibatch_to_buffer(X, minibatch_buffer, + minibatch_indices, self._n_threads) # Here we randomly choose whether to perform random reassignment: # the choice is done as a function of the iteration index, and the @@ -1624,7 +1648,8 @@ def fit(self, X, y=None, sample_weight=None): # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( - X=X[minibatch_indices], + X=X_minibatch, + # X=X[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], sample_weight=sample_weight[minibatch_indices], centers=centers, diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index bf4f0e03f829d..0f457cba41d07 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -21,12 +21,12 @@ from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _mini_batch_step -from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense -from sklearn.cluster._k_means_fast import _relocate_empty_clusters_sparse -from sklearn.cluster._k_means_fast import _euclidean_dense_dense_wrapper -from sklearn.cluster._k_means_fast import _euclidean_sparse_dense_wrapper -from sklearn.cluster._k_means_fast import _inertia_dense -from sklearn.cluster._k_means_fast import _inertia_sparse +from sklearn.cluster._k_means_common import _relocate_empty_clusters_dense +from sklearn.cluster._k_means_common import _relocate_empty_clusters_sparse +from sklearn.cluster._k_means_common import _euclidean_dense_dense_wrapper +from sklearn.cluster._k_means_common import _euclidean_sparse_dense_wrapper +from sklearn.cluster._k_means_common import _inertia_dense +from sklearn.cluster._k_means_common import 
_inertia_sparse from sklearn.datasets import make_blobs from io import StringIO @@ -124,7 +124,7 @@ def test_unit_weights_vs_no_weights(estimator): # sample weights all equal to one. sample_weight = np.ones(n_samples) - km = estimator(n_clusters=n_clusters, random_state=42) + km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) km_none = clone(km).fit(X, sample_weight=None) km_ones = clone(km).fit(X, sample_weight=sample_weight) @@ -139,7 +139,7 @@ def test_scaled_weights(estimator, data): # shouldn't change the result sample_weight = np.random.uniform(n_samples) - km = estimator(n_clusters=n_clusters, random_state=42) + km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) km_orig = clone(km).fit(data, sample_weight=sample_weight) km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) @@ -226,19 +226,6 @@ def test_float_precision(estimator, data): assert_array_equal(labels[np.float32], labels[np.float64]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_score_multiple_inits(estimator): - # Check that fitting KMeans or MiniBatchKMeans with multiple inits gives - # better score - X = np.random.RandomState(0).randn(100, 10) - - km1 = estimator(max_iter=10, random_state=42, n_init=1) - s1 = km1.fit(X).score(X) - km2 = estimator(max_iter=10, random_state=42, n_init=10) - s2 = km2.fit(X).score(X) - assert s2 > s1 - - @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_score_max_iter(estimator): # Check that fitting KMeans or MiniBatchKMeans with more iterations gives @@ -361,8 +348,8 @@ def test_transform(estimator): @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_fit_transform(estimator): # Check equivalence between fit.transform and fit_transform - X1 = estimator(n_clusters=n_clusters, random_state=0).fit(X).transform(X) - X2 = estimator(n_clusters=n_clusters, random_state=0).fit_transform(X) + X1 = estimator(random_state=0, n_init=1).fit(X).transform(X) + X2 = estimator(random_state=0, n_init=1).fit_transform(X) assert_allclose(X1, X2) @@ -620,8 +607,10 @@ def test_inertia(dtype): distances = ((X_dense - centers[labels])**2).sum(axis=1) expected = np.sum(distances * sample_weight) - inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels) - inertia_sparse = _inertia_sparse(X_sparse, sample_weight, centers, labels) + inertia_dense = _inertia_dense( + X_dense, sample_weight, centers, labels, 1) + inertia_sparse = _inertia_sparse( + X_sparse, sample_weight, centers, labels, 1) assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6) assert_allclose(inertia_dense, expected, rtol=1e-6) From c4fb7a815dfe83c30b081603bb8b62161fb97a45 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 15:48:23 +0100 Subject: [PATCH 14/72] wip --- sklearn/cluster/_kmeans.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 6825dea2ae274..bb5bda8ec229b 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1568,9 +1568,6 @@ def fit(self, X, y=None, sample_weight=None): sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] - # TODO comment - centers_new = np.empty((self.n_clusters, n_features), dtype=X.dtype) - # perform several inits with random sub-sets best_inertia = None for init_idx in range(self._n_init): @@ -1583,36 +1580,26 @@ def fit(self, X, y=None, sample_weight=None): 
X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # # Preform one iteration of KMeans to make the centers being the - # # mean of their cluster. - # labels, inertia, cluster_centers, _ = _kmeans_single_lloyd( - # X=X_valid, x_squared_norms=x_squared_norms_valid, - # sample_weight=sample_weight_valid, - # centers_init=cluster_centers, max_iter=1, tol=0, - # n_threads=self._n_threads) - weight_sums = np.zeros(self.n_clusters, dtype=X.dtype) - - inertia = _mini_batch_step( - X=X_valid, - x_squared_norms=x_squared_norms_valid, + # Preform one iteration of KMeans to make the centers being the + # mean of their cluster. + labels, inertia, cluster_centers, _ = _kmeans_single_lloyd( + X=X_valid, x_squared_norms=x_squared_norms_valid, sample_weight=sample_weight_valid, - centers=cluster_centers, - centers_new=centers_new, - weight_sums=weight_sums, - random_state=random_state) + centers_init=cluster_centers, max_iter=1, tol=0, + n_threads=self._n_threads) if self.verbose: print(f"Inertia for init {init_idx + 1}/{self._n_init}: " f"{inertia}") if best_inertia is None or inertia < best_inertia: init_centers = cluster_centers - self._counts = weight_sums best_inertia = inertia centers = init_centers + centers_new = np.empty_like(centers) # Initialize counts - # self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) # Attributes to monitor the convergence self._ewa_diff = None From 3713094fd5488e92d5a228b565e9fafc214dfd05 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 16:08:13 +0100 Subject: [PATCH 15/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 2 +- sklearn/cluster/_kmeans.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 3310298696009..692c51aa3e8f2 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -9,7 +9,7 @@ from cython cimport floating from cython.parallel cimport parallel, prange from libc.math cimport sqrt from libc.stdlib cimport malloc, free -from libc.string cimport memcpy, memset +from libc.string cimport memcpy np.import_array() diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index bb5bda8ec229b..2aef5b47cbdea 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1627,7 +1627,7 @@ def fit(self, X, y=None, sample_weight=None): _copy_minibatch_to_buffer(X, minibatch_buffer, minibatch_indices, self._n_threads) - # Here we randomly choose whether to perform random reassignment: + # Randomly choose whether to perform random reassignment: # the choice is done as a function of the iteration index, and the # minimum number of counts, in order to force this reassignment to # happen every once in a while. 
@@ -1636,7 +1636,6 @@ def fit(self, X, y=None, sample_weight=None): # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( X=X_minibatch, - # X=X[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], sample_weight=sample_weight[minibatch_indices], centers=centers, From 6a6fbfb7d1bf4987f9811897e650b49acdc66d47 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 16:34:34 +0100 Subject: [PATCH 16/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 15 --------------- sklearn/cluster/_kmeans.py | 18 +++--------------- 2 files changed, 3 insertions(+), 30 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 692c51aa3e8f2..54ec96de0abb4 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -15,21 +15,6 @@ from libc.string cimport memcpy np.import_array() -def _copy_minibatch_to_buffer( - np.ndarray[floating, ndim=2, mode='c'] X, # IN - floating[:, ::1] minibatch_buffer, # OUT - int[::1] indices, # IN - int n_threads): - """""" - cdef: - int n_samples_minibatch = minibatch_buffer.shape[0] - int n_features = minibatch_buffer.shape[1] - int i, j, idx - - for i in prange(n_samples_minibatch, nogil=True, num_threads=n_threads): - memcpy(&minibatch_buffer[i, 0], &X[indices[i], 0], n_features * sizeof(floating)) - - def _minibatch_update_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 2aef5b47cbdea..4ebd3db50a8df 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -31,7 +31,6 @@ from ..exceptions import ConvergenceWarning from ._k_means_common import _inertia_dense from ._k_means_common import _inertia_sparse -from ._k_means_minibatch import _copy_minibatch_to_buffer from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_minibatch import _minibatch_update_dense from ._k_means_minibatch import _minibatch_update_dense4 @@ -1610,22 +1609,11 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - if not sp.issparse(X): - minibatch_buffer = np.empty((self.batch_size, n_features), - dtype=X.dtype) - # Perform the iterative optimization until convergence for i in range(n_iter): # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint( - 0, n_samples, self.batch_size).astype(np.int32, copy=False) - - if sp.issparse(X): - X_minibatch = X[minibatch_indices] - else: - X_minibatch = minibatch_buffer - _copy_minibatch_to_buffer(X, minibatch_buffer, - minibatch_indices, self._n_threads) + minibatch_indices = random_state.randint(0, n_samples, + self.batch_size) # Randomly choose whether to perform random reassignment: # the choice is done as a function of the iteration index, and the @@ -1635,7 +1623,7 @@ def fit(self, X, y=None, sample_weight=None): # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( - X=X_minibatch, + X=X[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], sample_weight=sample_weight[minibatch_indices], centers=centers, From 3be5343c9ff7cdeeb23955ddef98489e6730a8a8 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 10 Mar 2020 16:35:07 +0100 Subject: [PATCH 17/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 31 -------------------------- sklearn/cluster/_kmeans.py | 
1 - 2 files changed, 32 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 54ec96de0abb4..ce3a62be256d7 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -223,34 +223,3 @@ cdef void update_center_sparse( else: for k in range(n_features): centers_new[i, k] = centers_old[i, k] - - -def _minibatch_update_dense4(np.ndarray[floating, ndim=2, mode='c'] X, - floating[::1] sample_weight, - floating[:, ::1] centers, - floating[:, ::1] centers_new, - floating[::1] weight_sums, - int[::1] labels): - cdef: - int n_samples = X.shape[0] - int n_features = X.shape[1] - int i, j, label - floating weight_sum, tmp, lr - - # for i in prange(n_samples, nogil=True): - for i in range(n_samples): - label = labels[i] - - # update center weight - weight_sum = weight_sums[label] + sample_weight[i] - - # learning rate - if weight_sum > 0: - lr = 1 / weight_sum - - for j in range(n_features): - centers_new[label, j] = centers[label, j] * (1 - lr) + lr * X[i, j] - else: - centers_new[label, j] = centers[label, j] - - weight_sums[label] = weight_sum \ No newline at end of file diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 4ebd3db50a8df..1aad382627f80 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -33,7 +33,6 @@ from ._k_means_common import _inertia_sparse from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_minibatch import _minibatch_update_dense -from ._k_means_minibatch import _minibatch_update_dense4 from ._k_means_lloyd import _lloyd_iter_chunked_dense from ._k_means_lloyd import _lloyd_iter_chunked_sparse from ._k_means_elkan import _init_bounds_dense From 2add01e64484aa6ec78f1dcbd491e591e7e32529 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 22 Apr 2020 12:40:01 +0200 Subject: [PATCH 18/72] wip --- sklearn/cluster/_kmeans.py | 14 ++--- sklearn/cluster/tests/test_k_means.py | 70 ++++++++++++++++++--- sklearn/cluster/tests/test_k_means2.py | 85 ++------------------------ 3 files changed, 71 insertions(+), 98 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 1aad382627f80..46d208a27be71 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1210,7 +1210,7 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, assign_rows_csr( X, new_centers.astype(np.intp, copy=False), np.where(to_reassign)[0].astype(np.intp, copy=False), - centers) + centers_new) else: centers_new[to_reassign] = X[new_centers] @@ -1462,20 +1462,16 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # Normalize inertia to be able to compare values when # batch_size changes batch_inertia /= self.batch_size - centers_squared_diff /= self.batch_size - # Compute an Exponentially Weighted Average of the squared diff to + # Compute an Exponentially Weighted Average of the inertia to # monitor the convergence while discarding minibatch-local stochastic # variability: https://en.wikipedia.org/wiki/Moving_average - ewa_diff = self._ewa_diff ewa_inertia = self._ewa_inertia - if ewa_diff is None: - ewa_diff = centers_squared_diff + if ewa_inertia is None: ewa_inertia = batch_inertia else: alpha = self.batch_size * 2.0 / (n_samples + 1) alpha = min(alpha, 1) - ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha # Log progress to be able to monitor convergence @@ -1486,7 +1482,7 
@@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # Early stopping based on absolute tolerance on squared change of # centers position (using EWA smoothing) - if self._tol > 0.0 and ewa_diff <= self._tol: + if self._tol > 0.0 and centers_squared_diff <= self._tol: if self.verbose: print(f"Converged (small centers change) at iteration " f"{iteration_idx + 1}/{n_iter}") @@ -1511,7 +1507,6 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # update the convergence context to maintain state across successive # calls: - self._ewa_diff = ewa_diff self._ewa_inertia = ewa_inertia self._ewa_inertia_min = ewa_inertia_min self._no_improvement = no_improvement @@ -1600,7 +1595,6 @@ def fit(self, X, y=None, sample_weight=None): self._counts = np.zeros(self.n_clusters, dtype=X.dtype) # Attributes to monitor the convergence - self._ewa_diff = None self._ewa_inertia = None self._ewa_inertia_min = None self._no_improvement = 0 diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 0f457cba41d07..17ae1967b374c 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -21,6 +21,7 @@ from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _mini_batch_step +from sklearn.cluster._kmeans import _labels_inertia from sklearn.cluster._k_means_common import _relocate_empty_clusters_dense from sklearn.cluster._k_means_common import _relocate_empty_clusters_sparse from sklearn.cluster._k_means_common import _euclidean_dense_dense_wrapper @@ -647,16 +648,69 @@ def test_minibatch_kmeans_init_size(): assert km._init_size == n_samples -def test_minibatch_kmeans_partial_fit(): - # Check fitting using the partial_fit API - km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42) +def test_minibatch_sensible_reassign(): + # check that identical initial clusters are reassigned + # also a regression test for when there are more desired reassignments than + # samples. + zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, + random_state=42) + zeroed_X[::2, :] = 0 - for X_minibatch in np.array_split(X, 10): - km.partial_fit(X_minibatch) + km = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, + init="random").fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 - # compute the labeling on the complete dataset - labels = km.predict(X) - assert_allclose(v_measure_score(true_labels, labels), 1.0) + # do the same with batch-size > X.shape[0] (regression test) + km = MiniBatchKMeans(n_clusters=20, batch_size=200, random_state=42, + init="random").fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + # do the same with partial_fit API + km = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") + for i in range(100): + km.partial_fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_minibatch_reassign(data): + # Check the reassignment part of the minibatch step with very high or very + # low reassignment ratio. 
+ perfect_centers = np.empty((n_clusters, n_features)) + for i in range(n_clusters): + perfect_centers[i] = X[true_labels == i].mean(axis=0) + + x_squared_norms = row_norms(data, squared=True) + sample_weight = np.ones(n_samples) + centers_new = np.empty_like(perfect_centers) + + # Give a perfect initialization, but a large reassignment_ratio, as a + # result many centers should be reassigned and the model should no longer + # be good + score_before = - _labels_inertia(data, sample_weight, x_squared_norms, + perfect_centers, 1)[1] + + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1) + + score_after = - _labels_inertia(data, sample_weight, x_squared_norms, + centers_new, 1)[1] + + assert score_before > score_after + + # Give a perfect initialization, with a small reassignment_ratio, + # no center should be reassigned. + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1e-15) + + assert_allclose(centers_new, perfect_centers) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) diff --git a/sklearn/cluster/tests/test_k_means2.py b/sklearn/cluster/tests/test_k_means2.py index 7df2bf1b0efb3..93c6d8011be80 100644 --- a/sklearn/cluster/tests/test_k_means2.py +++ b/sklearn/cluster/tests/test_k_means2.py @@ -97,95 +97,20 @@ def test_minibatch_update_consistency(): assert_almost_equal(new_inertia, new_inertia_csr) -def test_minibatch_sensible_reassign_fit(): - # check if identical initial clusters are reassigned - # also a regression test for when there are more desired reassignments than - # samples. 
- zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, - cluster_std=1., random_state=42) - zeroed_X[::2, :] = 0 - mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, - init="random") - mb_k_means.fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - # do the same with batch-size > X.shape[0] (regression test) - mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201, - random_state=42, init="random") - mb_k_means.fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - -def test_minibatch_sensible_reassign_partial_fit(): - zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5, - cluster_std=1., random_state=42) - zeroed_X[::2, :] = 0 - mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") - for i in range(100): - mb_k_means.partial_fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10 - - -def test_minibatch_reassign(): - # Give a perfect initialization, but a large reassignment_ratio, - # as a result all the centers should be reassigned and the model - # should no longer be good - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - random_state=42) - mb_k_means.fit(this_X) - - score_before = mb_k_means.score(this_X) - try: - old_stdout = sys.stdout - sys.stdout = StringIO() - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means._counts, - np.zeros(X.shape[1], np.double), - False, random_state=np.random.RandomState(0), - random_reassign=True, - reassignment_ratio=1, verbose=True) - finally: - sys.stdout = old_stdout - assert score_before > mb_k_means.score(this_X) - - # Give a perfect initialization, with a small reassignment_ratio, - # no center should be reassigned - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - init=centers.copy(), - random_state=42, n_init=1) - mb_k_means.fit(this_X) - clusters_before = mb_k_means.cluster_centers_ - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means._counts, - np.zeros(X.shape[1], np.double), - False, random_state=np.random.RandomState(0), - random_reassign=True, - reassignment_ratio=1e-15) - assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) - - def test_minibatch_with_many_reassignments(): # Test for the case that the number of clusters to reassign is bigger # than the batch_size - n_samples = 550 + n_samples = 1000 rnd = np.random.RandomState(42) X = rnd.uniform(size=(n_samples, 10)) # Check that the fit works if n_clusters is bigger than the batch_size. 
# Run the test with 550 clusters and 550 samples, because it turned out # that this values ensure that the number of clusters to reassign # is always bigger than the batch_size - n_clusters = 550 + n_clusters = 1000 MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, init_size=n_samples, - random_state=42).fit(X) + random_state=42, + verbose=True).fit(X) + assert False From 7d7ab15c5a5501bd9893eb9a8d64faeb7a5b8db2 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 15 Jun 2020 17:20:39 +0200 Subject: [PATCH 19/72] wip --- sklearn/cluster/tests/test_k_means.py | 69 +++++++++++++++ sklearn/cluster/tests/test_k_means2.py | 116 ------------------------- 2 files changed, 69 insertions(+), 116 deletions(-) delete mode 100644 sklearn/cluster/tests/test_k_means2.py diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index c26151219303c..c3e94c8622b0a 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -629,6 +629,63 @@ def test_k_means_function(): assert inertia > 0.0 +def test_minibatch_update_consistency(): + # Check that dense and sparse minibatch update give the same results + rng = np.random.RandomState(42) + + centers_old = centers + rng.normal(size=centers.shape) + centers_old_csr = centers_old.copy() + + centers_new = np.zeros_like(centers_old) + centers_new_csr = np.zeros_like(centers_old_csr) + + weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) + weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) + + x_squared_norms = (X ** 2).sum(axis=1) + x_squared_norms_csr = row_norms(X_csr, squared=True) + + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + + # extract a small minibatch + X_mb = X[:10] + X_mb_csr = X_csr[:10] + x_mb_squared_norms = x_squared_norms[:10] + x_mb_squared_norms_csr = x_squared_norms_csr[:10] + sample_weight_mb = sample_weight[:10] + + # step 1: compute the dense minibatch update + old_inertia = _mini_batch_step( + X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new, + weight_sums, np.random.RandomState(0), random_reassign=False) + assert old_inertia > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels, new_inertia = _labels_inertia( + X_mb, sample_weight_mb, x_mb_squared_norms, centers_new) + assert new_inertia > 0.0 + assert new_inertia < old_inertia + + # step 2: compute the sparse minibatch update + old_inertia_csr = _mini_batch_step( + X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr, + centers_new_csr, weight_sums_csr, np.random.RandomState(0), + random_reassign=False) + assert old_inertia_csr > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels_csr, new_inertia_csr = _labels_inertia( + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr) + assert new_inertia_csr > 0.0 + assert new_inertia_csr < old_inertia_csr + + # step 3: check that sparse and dense updates lead to the same results + assert_array_equal(labels, labels_csr) + assert_allclose(centers_new, centers_new_csr) + assert_allclose(old_inertia, old_inertia_csr) + assert_allclose(new_inertia, new_inertia_csr) + + def test_minibatch_kmeans_init_size(): # Check the internal _init_size attribute of MiniBatchKMeans @@ -711,6 +768,18 @@ def test_minibatch_reassign(data): assert_allclose(centers_new, perfect_centers) +def test_minibatch_with_many_reassignments(): + # Test for the case that the number of clusters to reassign is bigger + # than the batch_size. 
Run the test with 100 clusters and a batch_size of + # 10 because it turned out that these values ensure that the number of + # clusters to reassign is always bigger than the batch_size. + MiniBatchKMeans(n_clusters=100, + batch_size=10, + init_size=n_samples, + random_state=42, + verbose=True).fit(X) + + @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) @pytest.mark.parametrize("param, match", [ ({"n_init": 0}, r"n_init should be > 0"), diff --git a/sklearn/cluster/tests/test_k_means2.py b/sklearn/cluster/tests/test_k_means2.py deleted file mode 100644 index 93c6d8011be80..0000000000000 --- a/sklearn/cluster/tests/test_k_means2.py +++ /dev/null @@ -1,116 +0,0 @@ -"""Testing for K-means""" -import sys - -import numpy as np -from scipy import sparse as sp - -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_almost_equal - -from sklearn.utils.extmath import row_norms -from sklearn.cluster import MiniBatchKMeans -from sklearn.cluster._kmeans import _labels_inertia -from sklearn.cluster._kmeans import _mini_batch_step -from sklearn.datasets import make_blobs -from io import StringIO - - -# non centered, sparse centers to check the -centers = np.array([ - [0.0, 5.0, 0.0, 0.0, 0.0], - [1.0, 1.0, 4.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 5.0, 1.0], -]) -n_samples = 100 -n_clusters, n_features = centers.shape -X, true_labels = make_blobs(n_samples=n_samples, centers=centers, - cluster_std=1., random_state=42) -X_csr = sp.csr_matrix(X) - - -def test_minibatch_update_consistency(): - # Check that dense and sparse minibatch update give the same results - rng = np.random.RandomState(42) - old_centers = centers + rng.normal(size=centers.shape) - - new_centers = old_centers.copy() - new_centers_csr = old_centers.copy() - - weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) - weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double) - - x_squared_norms = (X ** 2).sum(axis=1) - x_squared_norms_csr = row_norms(X_csr, squared=True) - - buffer = np.zeros(centers.shape[1], dtype=np.double) - buffer_csr = np.zeros(centers.shape[1], dtype=np.double) - - # extract a small minibatch - X_mb = X[:10] - X_mb_csr = X_csr[:10] - x_mb_squared_norms = x_squared_norms[:10] - x_mb_squared_norms_csr = x_squared_norms_csr[:10] - - sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) - - # step 1: compute the dense minibatch update - old_inertia, incremental_diff = _mini_batch_step( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, - buffer, 1, np.random.RandomState(0), random_reassign=False) - assert old_inertia > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels, new_inertia = _labels_inertia( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) - assert new_inertia > 0.0 - assert new_inertia < old_inertia - - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers - old_centers) ** 2) - assert_almost_equal(incremental_diff, effective_diff) - - # step 2: compute the sparse minibatch update - old_inertia_csr, incremental_diff_csr = _mini_batch_step( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, - weight_sums_csr, buffer_csr, 1, np.random.RandomState(0), - random_reassign=False) - assert old_inertia_csr > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels_csr, new_inertia_csr = 
_labels_inertia( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) - assert new_inertia_csr > 0.0 - assert new_inertia_csr < old_inertia_csr - - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers_csr - old_centers) ** 2) - assert_almost_equal(incremental_diff_csr, effective_diff) - - # step 3: check that sparse and dense updates lead to the same results - assert_array_equal(labels, labels_csr) - assert_array_almost_equal(new_centers, new_centers_csr) - assert_almost_equal(incremental_diff, incremental_diff_csr) - assert_almost_equal(old_inertia, old_inertia_csr) - assert_almost_equal(new_inertia, new_inertia_csr) - - -def test_minibatch_with_many_reassignments(): - # Test for the case that the number of clusters to reassign is bigger - # than the batch_size - n_samples = 1000 - rnd = np.random.RandomState(42) - X = rnd.uniform(size=(n_samples, 10)) - # Check that the fit works if n_clusters is bigger than the batch_size. - # Run the test with 550 clusters and 550 samples, because it turned out - # that this values ensure that the number of clusters to reassign - # is always bigger than the batch_size - n_clusters = 1000 - MiniBatchKMeans(n_clusters=n_clusters, - batch_size=100, - init_size=n_samples, - random_state=42, - verbose=True).fit(X) - assert False From 0523c656a09ddb070579aaa0b8789fe971b89581 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 13:02:07 +0200 Subject: [PATCH 20/72] wip --- sklearn/cluster/_kmeans.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 63d25baea3e54..033819c094018 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1477,6 +1477,13 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, # batch_size changes batch_inertia /= self.batch_size + # Ignore first iteration because it's inertia from initialization. + if iteration_idx == 0: + if self.verbose: + print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " + f"mean batch inertia: {batch_inertia}") + return False + # Compute an Exponentially Weighted Average of the inertia to # monitor the convergence while discarding minibatch-local stochastic # variability: https://en.wikipedia.org/wiki/Moving_average From 2d789aa510e28a2a77d3b6dfb8015c1c6690e442 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 15:22:06 +0200 Subject: [PATCH 21/72] wip --- sklearn/cluster/tests/test_k_means.py | 28 ++++++++++----------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index e0441048a4ff5..4be91c1d52bf5 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -90,32 +90,24 @@ def _sort_centers(centers): return np.sort(centers, axis=0) -@pytest.mark.parametrize("init", ["k-means++", centers], - ids=["k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_weighted_vs_repeated(estimator, init): +def test_weighted_vs_repeated(): # Check that a sample weight of N should yield the same result as an N-fold - # repetition of the sample + # repetition of the sample. Valid only if init is precomputed, otherwise + # rng produces different results. Not valid for MinibatchKMeans due to rng + # to extract minibatches. 
sample_weight = np.random.RandomState(0).randint(1, 5, size=n_samples) X_repeat = np.repeat(X, sample_weight, axis=0) - km = estimator(init=init, n_clusters=n_clusters, random_state=0) - if estimator is MiniBatchKMeans: - km.set_params(batch_size=10) + km = KMeans(init=centers, n_init=1, n_clusters=n_clusters, random_state=0) km_weighted = clone(km).fit(X, sample_weight=sample_weight) repeated_labels = np.repeat(km_weighted.labels_, sample_weight) km_repeated = clone(km).fit(X_repeat) - # We can't expect labels to be equal because k-means++ will lead to - # a different initialization on duplicated X. - assert_allclose(v_measure_score(km_repeated.labels_, repeated_labels), 1) - - # TODO: FIXME - if estimator is not MiniBatchKMeans: - assert_allclose(km_weighted.inertia_, km_repeated.inertia_) - assert_allclose(_sort_centers(km_weighted.cluster_centers_), - _sort_centers(km_repeated.cluster_centers_)) + assert_array_equal(km_repeated.labels_, repeated_labels) + assert_allclose(km_weighted.inertia_, km_repeated.inertia_) + assert_allclose(_sort_centers(km_weighted.cluster_centers_), + _sort_centers(km_repeated.cluster_centers_)) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) @@ -192,7 +184,7 @@ def test_centers_not_mutated(estimator, dtype): @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_float_precision(estimator, data): - # TODO + # Check that the results are the same for single and double precision. km = estimator(n_init=1, random_state=0) inertia = {} From 37408e6d269e0f1567a5ef5dc42f6544e949a516 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 15:24:22 +0200 Subject: [PATCH 22/72] wip --- sklearn/cluster/_k_means_common.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_k_means_common.pyx b/sklearn/cluster/_k_means_common.pyx index dde6fe01efa61..53c33acffc2ee 100644 --- a/sklearn/cluster/_k_means_common.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -110,7 +110,8 @@ cpdef floating _inertia_dense( floating sq_dist = 0.0 floating inertia = 0.0 - for i in prange(n_samples, nogil=True, num_threads=n_threads): + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): j = labels[i] sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], n_features, True) @@ -143,7 +144,8 @@ cpdef floating _inertia_sparse( floating[::1] centers_squared_norms = row_norms(centers, squared=True) - for i in prange(n_samples, nogil=True, num_threads=n_threads): + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): j = labels[i] sq_dist = _euclidean_sparse_dense( X_data[X_indptr[i]: X_indptr[i + 1]], From 78915acfb09bee2754828db1b5dddb320ad8ff4d Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 15:48:25 +0200 Subject: [PATCH 23/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 24 ++++++++++++++-------- sklearn/cluster/_kmeans.py | 28 +++++++++++++++++--------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index ce3a62be256d7..ec5b98f201346 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -16,12 +16,13 @@ np.import_array() def _minibatch_update_dense( - np.ndarray[floating, ndim=2, mode='c'] X, # IN + np.ndarray[floating, ndim=2, mode="c"] X, # IN floating[::1] sample_weight, # IN floating[:, ::1] 
centers_old, # IN floating[:, ::1] centers_new, # OUT floating[::1] weight_sums, # INOUT - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Update of the centers for dense MiniBatchKMeans. Parameters @@ -45,6 +46,9 @@ def _minibatch_update_dense( labels : ndarray of shape (n_samples,), dtype=int labels assignment. + + n_threads : int + The number of threads to be used by openmp. """ cdef: int n_samples = X.shape[0] @@ -53,10 +57,10 @@ def _minibatch_update_dense( int *indices - with nogil, parallel(): + with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters): + for i in prange(n_clusters, schedule="static"): update_center_dense(i, &X[0, 0], sample_weight, centers_old, centers_new, weight_sums, labels, indices) @@ -123,7 +127,8 @@ def _minibatch_update_sparse( floating[:, ::1] centers_old, # IN floating[:, ::1] centers_new, # OUT floating[::1] weight_sums, # INOUT - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Update of the centers for sparse MiniBatchKMeans. Parameters @@ -144,9 +149,12 @@ def _minibatch_update_sparse( weight_sums : ndarray of shape (n_clusters,), dtype=floating Current sums of the accumulated weights for each center. - + labels : ndarray of shape (n_samples,), dtype=int labels assignment. + + n_threads : int + The number of threads to be used by openmp. """ cdef: floating[::1] X_data = X.data @@ -158,10 +166,10 @@ def _minibatch_update_sparse( int *indices - with nogil, parallel(): + with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters): + for i in prange(n_clusters, schedule="static"): update_center_sparse(i, X_data, X_indices, X_indptr, sample_weight, centers_old, centers_new, weight_sums, labels, indices) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7675ed0d92235..016e2a4343ce0 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1141,7 +1141,7 @@ def _more_tags(self): def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, weight_sums, random_state, random_reassign=False, - reassignment_ratio=0.01, verbose=False): + reassignment_ratio=0.01, verbose=False, n_threads=1): """Incremental update of the centers for the Minibatch K-Means algorithm. Parameters @@ -1184,6 +1184,9 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, verbose : bool, default=False Controls the verbosity. + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. 
+ Returns ------- inertia : float @@ -1191,15 +1194,16 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, """ # Perform label assignment to nearest centers labels, inertia = _labels_inertia(X, sample_weight, - x_squared_norms, centers) + x_squared_norms, centers, + n_threads=n_threads) # Update centers according to the labels if sp.issparse(X): - _minibatch_update_sparse( - X, sample_weight, centers, centers_new, weight_sums, labels) + _minibatch_update_sparse(X, sample_weight, centers, centers_new, + weight_sums, labels, n_threads) else: - _minibatch_update_dense( - X, sample_weight, centers, centers_new, weight_sums, labels) + _minibatch_update_dense(X, sample_weight, centers, centers_new, + weight_sums, labels, n_threads) # Reassign clusters that have very low weight if random_reassign and reassignment_ratio > 0: @@ -1466,7 +1470,8 @@ def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, print('Computing label assignment and total inertia') slices = gen_batches(X.shape[0], self.batch_size) results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - centers) for s in slices] + centers, n_threads=self._n_threads) + for s in slices] labels, inertia = zip(*results) return np.hstack(labels), np.sum(inertia) @@ -1648,7 +1653,8 @@ def fit(self, X, y=None, sample_weight=None): random_state=random_state, random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose) + verbose=self.verbose, + n_threads=self._n_threads) if self._tol > 0.0: centers_squared_diff = np.sum((centers_new - centers)**2) @@ -1747,11 +1753,13 @@ def partial_fit(self, X, y=None, sample_weight=None): random_state=self._random_state, random_reassign=random_reassign, reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose) + verbose=self.verbose, + n_threads=self._n_threads) if self.compute_labels: self.labels_, self.inertia_ = _labels_inertia( - X, sample_weight, x_squared_norms, self.cluster_centers_) + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) return self From fcc2718e507e22397504516428513e999f5bf1c0 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 15:49:52 +0200 Subject: [PATCH 24/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index ec5b98f201346..49af1c7426d0a 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -43,7 +43,7 @@ def _minibatch_update_dense( weight_sums : ndarray of shape (n_clusters,), dtype=floating Current sums of the accumulated weights for each center. - + labels : ndarray of shape (n_samples,), dtype=int labels assignment. 
@@ -56,14 +56,14 @@ def _minibatch_update_dense( int i int *indices - + with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) for i in prange(n_clusters, schedule="static"): update_center_dense(i, &X[0, 0], sample_weight, centers_old, centers_new, weight_sums, labels, indices) - + free(indices) @@ -165,7 +165,7 @@ def _minibatch_update_sparse( int i int *indices - + with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) @@ -173,7 +173,7 @@ def _minibatch_update_sparse( update_center_sparse(i, X_data, X_indices, X_indptr, sample_weight, centers_old, centers_new, weight_sums, labels, indices) - + free(indices) From 73f1bc2ee84dbfa541210e9b4ae449731080e3fc Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 17:13:21 +0200 Subject: [PATCH 25/72] wip --- sklearn/cluster/_kmeans.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 016e2a4343ce0..7cff7e91ae153 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1601,13 +1601,10 @@ def fit(self, X, y=None, sample_weight=None): X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size) - # Preform one iteration of KMeans to make the centers being the - # mean of their cluster. - labels, inertia, cluster_centers, _ = _kmeans_single_lloyd( - X=X_valid, x_squared_norms=x_squared_norms_valid, - sample_weight=sample_weight_valid, - centers_init=cluster_centers, max_iter=1, tol=0, - n_threads=self._n_threads) + # Compute inertia on a validation set. + _, inertia = _labels_inertia( + X_valid, sample_weight_valid, x_squared_norms_valid, + cluster_centers, n_threads=self._n_threads) if self.verbose: print(f"Inertia for init {init_idx + 1}/{self._n_init}: " From 7325c89a586eb57afabd2c25d2d88688783628b8 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 17 Jun 2020 18:17:07 +0200 Subject: [PATCH 26/72] wip --- sklearn/cluster/tests/test_k_means.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 4be91c1d52bf5..4d55f512ab45e 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -347,6 +347,17 @@ def test_fit_transform(estimator): assert_allclose(X1, X2) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_sample_weight_unchanged(estimator): + # Check that sample_weight is not modified in place by KMeans (#17204) + X = np.array([[1], [2], [4]]) + sample_weight = np.array([0.5, 0.2, 0.3]) + estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) + + # internally, sample_weight is rescale to sum up to n_samples = 3 + assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) + + @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_verbose(estimator): # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. 
@@ -893,13 +904,3 @@ def test_n_jobs_deprecated(n_jobs): with pytest.warns(FutureWarning, match=depr_msg): kmeans.fit(X) - - -def test_sample_weight_unchanged(): - # Check that sample_weight is not modified in place by KMeans (#17204) - X = np.array([[1], [2], [4]]) - sample_weight = np.array([0.5, 0.2, 0.3]) - KMeans(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) - - # internally, sample_weight is rescale to sum up to n_samples = 3 - assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) From a82456632219c11fa027a3a2299d0495c47c915f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 18 Jun 2020 13:11:48 +0200 Subject: [PATCH 27/72] wip --- sklearn/cluster/_kmeans.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7cff7e91ae153..ea097caa61a35 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1381,10 +1381,10 @@ class MiniBatchKMeans(KMeans): ... batch_size=6, ... max_iter=10).fit(X) >>> kmeans.cluster_centers_ - array([[3.95918367, 2.40816327], - [1.12195122, 1.3902439 ]]) + array([[2.32394366, 1.16901408], + [3.4 , 4.36 ]]) >>> kmeans.predict([[0, 0], [4, 4]]) - array([1, 0], dtype=int32) + array([0, 1], dtype=int32) """ @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, From a4edafb8b046482ecab939cb30d80cff7b27b349 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 13:54:31 +0200 Subject: [PATCH 28/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 172 +++++++++++++------------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 4d55f512ab45e..55074307a3ba5 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -57,6 +57,92 @@ def _check_fitted_model(km): assert km.inertia_ > 0.0 +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_kmeans_results(array_constr, algo, dtype): + # Checks that KMeans works as intended on toy dataset by comparing with + # expected results computed by hand. 
+ X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) + sample_weight = [3, 1, 1, 3] + init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.375 + expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) + expected_n_iter = 2 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X, sample_weight=sample_weight) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +def test_relocate_empty_clusters(array_constr): + # test for the _relocate_empty_clusters_(dense/sparse) helpers + + # Synthetic dataset with 3 obvious clusters of different sizes + X = np.array( + [-10., -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) + X = array_constr(X) + sample_weight = np.ones(10) + + # centers all initialized to the first point of X + centers_old = np.array([-10., -10, -10]).reshape(-1, 1) + + # With this initialization, all points will be assigned to the first center + # At this point a center in centers_new is the weighted sum of the points + # it contains if it's not empty, otherwise it is the same as before. + centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) + weight_in_clusters = np.array([10., 0, 0]) + labels = np.zeros(10, dtype=np.int32) + + if array_constr is np.array: + _relocate_empty_clusters_dense(X, sample_weight, centers_old, + centers_new, weight_in_clusters, labels) + else: + _relocate_empty_clusters_sparse(X.data, X.indices, X.indptr, + sample_weight, centers_old, + centers_new, weight_in_clusters, + labels) + + # The relocation scheme will take the 2 points farthest from the center and + # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The + # first center will be updated to contain the other 8 points. 
+ assert_array_equal(weight_in_clusters, [8, 1, 1]) + assert_allclose(centers_new, [[-36], [10], [9.5]]) + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_kmeans_relocated_clusters(array_constr, algo): + # check that empty clusters are relocated as expected + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.25 + expected_centers = [[0.25, 0], [0.75, 1]] + expected_n_iter = 3 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -370,31 +456,6 @@ def test_verbose(estimator): sys.stdout = old_stdout -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_kmeans_results(array_constr, algo, dtype): - # Checks that KMeans works as intended on toy dataset by comparing with - # expected results computed by hand. - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) - sample_weight = [3, 1, 1, 3] - init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.375 - expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) - expected_n_iter = 2 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X, sample_weight=sample_weight) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_allclose(kmeans.inertia_, expected_inertia) - assert_allclose(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @@ -497,30 +558,6 @@ def test_kmeans_elkan_iter_attribute(): assert km.n_iter_ == 1 -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -def test_kmeans_relocated_clusters(array_constr, algo): - # check that empty clusters are relocated as expected - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) - - # second center too far from others points will be empty at first iter - init_centers = np.array([[0.5, 0.5], [3, 3]]) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.25 - expected_centers = [[0.25, 0], [0.75, 1]] - expected_n_iter = 3 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_allclose(kmeans.inertia_, expected_inertia) - assert_allclose(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) def test_kmeans_empty_cluster_relocated(array_constr): @@ -537,43 +574,6 @@ def 
test_kmeans_empty_cluster_relocated(array_constr): assert_allclose(km.cluster_centers_, [[-1], [1]]) -@pytest.mark.parametrize("representation", ["dense", "sparse"]) -def test_relocate_empty_clusters(representation): - # test for the _relocate_empty_clusters_(dense/sparse) helpers - - # Synthetic dataset with 3 obvious clusters of different sizes - X = np.array( - [-10., -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) - if representation == "sparse": - X = sp.csr_matrix(X) - sample_weight = np.ones(10) - - # centers all initialized to the first point of X - centers_old = np.array([-10., -10, -10]).reshape(-1, 1) - - # With this initialization, all points will be assigned to the first center - # At this point a center in centers_new is the weighted sum of the points - # it contains if it's not empty, otherwise it is the same as before. - centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) - weight_in_clusters = np.array([10., 0, 0]) - labels = np.zeros(10, dtype=np.int32) - - if representation == "dense": - _relocate_empty_clusters_dense(X, sample_weight, centers_old, - centers_new, weight_in_clusters, labels) - else: - _relocate_empty_clusters_sparse(X.data, X.indices, X.indptr, - sample_weight, centers_old, - centers_new, weight_in_clusters, - labels) - - # The relocation scheme will take the 2 points farthest from the center and - # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The - # first center will be updated to contain the other 8 points. - assert_array_equal(weight_in_clusters, [8, 1, 1]) - assert_allclose(centers_new, [[-36], [10], [9.5]]) - - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("squared", [True, False]) def test_euclidean_distance(dtype, squared): From 0993a85eefb65b76937543b031abff699316142f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 13:58:00 +0200 Subject: [PATCH 29/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 48 +++++++++++++-------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 55074307a3ba5..9f4e60edd914d 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -82,6 +82,30 @@ def test_kmeans_results(array_constr, algo, dtype): assert kmeans.n_iter_ == expected_n_iter +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_kmeans_relocated_clusters(array_constr, algo): + # check that empty clusters are relocated as expected + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.25 + expected_centers = [[0.25, 0], [0.75, 1]] + expected_n_iter = 3 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) def test_relocate_empty_clusters(array_constr): @@ -119,30 +143,6 @@ def test_relocate_empty_clusters(array_constr): assert_allclose(centers_new, [[-36], [10], [9.5]]) 
-@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) -def test_kmeans_relocated_clusters(array_constr, algo): - # check that empty clusters are relocated as expected - X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) - - # second center too far from others points will be empty at first iter - init_centers = np.array([[0.5, 0.5], [3, 3]]) - - expected_labels = [0, 0, 1, 1] - expected_inertia = 0.25 - expected_centers = [[0.25, 0], [0.75, 1]] - expected_n_iter = 3 - - kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) - kmeans.fit(X) - - assert_array_equal(kmeans.labels_, expected_labels) - assert_allclose(kmeans.inertia_, expected_inertia) - assert_allclose(kmeans.cluster_centers_, expected_centers) - assert kmeans.n_iter_ == expected_n_iter - - @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], From b3089615f233cb0df64efd006362305ee8362e36 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 14:05:58 +0200 Subject: [PATCH 30/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 88 +++++++++++++-------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 9f4e60edd914d..e555c2ba9fcf3 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -143,6 +143,50 @@ def test_relocate_empty_clusters(array_constr): assert_allclose(centers_new, [[-36], [10], [9.5]]) +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) +def test_kmeans_elkan_results(distribution, array_constr, tol): + # Check that results are identical between lloyd and elkan algorithms + rnd = np.random.RandomState(0) + if distribution == "normal": + X = rnd.normal(size=(5000, 10)) + else: + X, _ = make_blobs(random_state=rnd) + X[X < 0] = 0 + X = array_constr(X) + + km_full = KMeans(algorithm="full", n_clusters=5, + random_state=0, n_init=1, tol=tol) + km_elkan = KMeans(algorithm="elkan", n_clusters=5, + random_state=0, n_init=1, tol=tol) + + km_full.fit(X) + km_elkan.fit(X) + assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) + assert_array_equal(km_elkan.labels_, km_full.labels_) + assert km_elkan.n_iter_ == km_full.n_iter_ + assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) + + +@pytest.mark.parametrize("algorithm", ["full", "elkan"]) +def test_kmeans_convergence(algorithm): + # Check that KMeans stops when convergence is reached when tol=0. (#16075) + # We can only ensure that if the number of threads is not to large, + # otherwise the roundings errors coming from the unpredictability of + # the order in which chunks are processed make the convergence criterion + # to never be exactly 0. 
+ rnd = np.random.RandomState(0) + X = rnd.normal(size=(5000, 10)) + + with threadpool_limits(limits=1, user_api="openmp"): + km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, + n_init=1, tol=0, max_iter=300).fit(X) + + assert km.n_iter_ < 300 + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -485,50 +529,6 @@ def py_kmeans(X, init): assert_allclose(py_centers, cy_centers) -@pytest.mark.parametrize("distribution", ["normal", "blobs"]) -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) -def test_kmeans_elkan_results(distribution, array_constr, tol): - # Check that results are identical between lloyd and elkan algorithms - rnd = np.random.RandomState(0) - if distribution == "normal": - X = rnd.normal(size=(5000, 10)) - else: - X, _ = make_blobs(random_state=rnd) - X[X < 0] = 0 - X = array_constr(X) - - km_full = KMeans(algorithm="full", n_clusters=5, - random_state=0, n_init=1, tol=tol) - km_elkan = KMeans(algorithm="elkan", n_clusters=5, - random_state=0, n_init=1, tol=tol) - - km_full.fit(X) - km_elkan.fit(X) - assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) - assert_array_equal(km_elkan.labels_, km_full.labels_) - assert km_elkan.n_iter_ == km_full.n_iter_ - assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) - - -@pytest.mark.parametrize("algorithm", ["full", "elkan"]) -def test_kmeans_convergence(algorithm): - # Check that KMeans stops when convergence is reached when tol=0. (#16075) - # We can only ensure that if the number of threads is not to large, - # otherwise the roundings errors coming from the unpredictability of - # the order in which chunks are processed make the convergence criterion - # to never be exactly 0. - rnd = np.random.RandomState(0) - X = rnd.normal(size=(5000, 10)) - - with threadpool_limits(limits=1, user_api="openmp"): - km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, - n_init=1, tol=0, max_iter=300).fit(X) - - assert km.n_iter_ < 300 - - def test_kmeans_copyx(): # Check that copy_x=False returns nearly equal X after de-centering. 
my_X = X.copy() From 121450b3df8cb5887641ccd5c3c20e3bd8ee813f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 6 Jul 2020 15:51:27 +0200 Subject: [PATCH 31/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 114 +++++++++++++------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index e555c2ba9fcf3..031567b9116d3 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -187,6 +187,63 @@ def test_kmeans_convergence(algorithm): assert km.n_iter_ < 300 +def test_minibatch_update_consistency(): + # Check that dense and sparse minibatch update give the same results + rng = np.random.RandomState(42) + + centers_old = centers + rng.normal(size=centers.shape) + centers_old_csr = centers_old.copy() + + centers_new = np.zeros_like(centers_old) + centers_new_csr = np.zeros_like(centers_old_csr) + + weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) + weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) + + x_squared_norms = (X ** 2).sum(axis=1) + x_squared_norms_csr = row_norms(X_csr, squared=True) + + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + + # extract a small minibatch + X_mb = X[:10] + X_mb_csr = X_csr[:10] + x_mb_squared_norms = x_squared_norms[:10] + x_mb_squared_norms_csr = x_squared_norms_csr[:10] + sample_weight_mb = sample_weight[:10] + + # step 1: compute the dense minibatch update + old_inertia = _mini_batch_step( + X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new, + weight_sums, np.random.RandomState(0), random_reassign=False) + assert old_inertia > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels, new_inertia = _labels_inertia( + X_mb, sample_weight_mb, x_mb_squared_norms, centers_new) + assert new_inertia > 0.0 + assert new_inertia < old_inertia + + # step 2: compute the sparse minibatch update + old_inertia_csr = _mini_batch_step( + X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr, + centers_new_csr, weight_sums_csr, np.random.RandomState(0), + random_reassign=False) + assert old_inertia_csr > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels_csr, new_inertia_csr = _labels_inertia( + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr) + assert new_inertia_csr > 0.0 + assert new_inertia_csr < old_inertia_csr + + # step 3: check that sparse and dense updates lead to the same results + assert_array_equal(labels, labels_csr) + assert_allclose(centers_new, centers_new_csr) + assert_allclose(old_inertia, old_inertia_csr) + assert_allclose(new_inertia, new_inertia_csr) + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -635,63 +692,6 @@ def test_k_means_function(): assert inertia > 0.0 -def test_minibatch_update_consistency(): - # Check that dense and sparse minibatch update give the same results - rng = np.random.RandomState(42) - - centers_old = centers + rng.normal(size=centers.shape) - centers_old_csr = centers_old.copy() - - centers_new = np.zeros_like(centers_old) - centers_new_csr = np.zeros_like(centers_old_csr) - - weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) - weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) - - x_squared_norms = (X ** 2).sum(axis=1) - x_squared_norms_csr = row_norms(X_csr, 
squared=True) - - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - - # extract a small minibatch - X_mb = X[:10] - X_mb_csr = X_csr[:10] - x_mb_squared_norms = x_squared_norms[:10] - x_mb_squared_norms_csr = x_squared_norms_csr[:10] - sample_weight_mb = sample_weight[:10] - - # step 1: compute the dense minibatch update - old_inertia = _mini_batch_step( - X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new, - weight_sums, np.random.RandomState(0), random_reassign=False) - assert old_inertia > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels, new_inertia = _labels_inertia( - X_mb, sample_weight_mb, x_mb_squared_norms, centers_new) - assert new_inertia > 0.0 - assert new_inertia < old_inertia - - # step 2: compute the sparse minibatch update - old_inertia_csr = _mini_batch_step( - X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr, - centers_new_csr, weight_sums_csr, np.random.RandomState(0), - random_reassign=False) - assert old_inertia_csr > 0.0 - - # compute the new inertia on the same batch to check that it decreased - labels_csr, new_inertia_csr = _labels_inertia( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr) - assert new_inertia_csr > 0.0 - assert new_inertia_csr < old_inertia_csr - - # step 3: check that sparse and dense updates lead to the same results - assert_array_equal(labels, labels_csr) - assert_allclose(centers_new, centers_new_csr) - assert_allclose(old_inertia, old_inertia_csr) - assert_allclose(new_inertia, new_inertia_csr) - - def test_minibatch_kmeans_init_size(): # Check the internal _init_size attribute of MiniBatchKMeans From 6c67dd10a6be4c8ce12af910ec0cca5eac2f7294 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 8 Jul 2020 16:48:38 +0200 Subject: [PATCH 32/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 372 +++++++++++++------------- 1 file changed, 186 insertions(+), 186 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 071a7ab213bdd..a58e648b70692 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -258,6 +258,192 @@ def test_all_init(Estimator, data, init): _check_fitted_model(km) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_fortran_aligned_data(estimator): + # Check that KMeans works with fortran-aligned data. + X_fortran = np.asfortranarray(X) + centers_fortran = np.asfortranarray(centers) + + km_c = estimator(n_clusters=n_clusters, init=centers, n_init=1, + random_state=42).fit(X) + km_f = estimator(n_clusters=n_clusters, init=centers_fortran, n_init=1, + random_state=42).fit(X_fortran) + assert_allclose(km_c.cluster_centers_, km_f.cluster_centers_) + assert_array_equal(km_c.labels_, km_f.labels_) + + +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_verbose(estimator): + # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. + km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + km.fit(X) + finally: + sys.stdout = old_stdout + + +def test_minibatch_sensible_reassign(): + # check that identical initial clusters are reassigned + # also a regression test for when there are more desired reassignments than + # samples. 
+ zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, + random_state=42) + zeroed_X[::2, :] = 0 + + km = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, + init="random").fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + # do the same with batch-size > X.shape[0] (regression test) + km = MiniBatchKMeans(n_clusters=20, batch_size=200, random_state=42, + init="random").fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + # do the same with partial_fit API + km = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") + for i in range(100): + km.partial_fit(zeroed_X) + # there should not be too many exact zero cluster centers + assert km.cluster_centers_.any(axis=1).sum() > 10 + + +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_minibatch_reassign(data): + # Check the reassignment part of the minibatch step with very high or very + # low reassignment ratio. + perfect_centers = np.empty((n_clusters, n_features)) + for i in range(n_clusters): + perfect_centers[i] = X[true_labels == i].mean(axis=0) + + x_squared_norms = row_norms(data, squared=True) + sample_weight = np.ones(n_samples) + centers_new = np.empty_like(perfect_centers) + + # Give a perfect initialization, but a large reassignment_ratio, as a + # result many centers should be reassigned and the model should no longer + # be good + score_before = - _labels_inertia(data, sample_weight, x_squared_norms, + perfect_centers, 1)[1] + + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1) + + score_after = - _labels_inertia(data, sample_weight, x_squared_norms, + centers_new, 1)[1] + + assert score_before > score_after + + # Give a perfect initialization, with a small reassignment_ratio, + # no center should be reassigned. + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1e-15) + + assert_allclose(centers_new, perfect_centers) + + +def test_minibatch_with_many_reassignments(): + # Test for the case that the number of clusters to reassign is bigger + # than the batch_size. Run the test with 100 clusters and a batch_size of + # 10 because it turned out that these values ensure that the number of + # clusters to reassign is always bigger than the batch_size. + MiniBatchKMeans(n_clusters=100, + batch_size=10, + init_size=n_samples, + random_state=42, + verbose=True).fit(X) + + +def test_minibatch_kmeans_init_size(): + # Check the internal _init_size attribute of MiniBatchKMeans + + # default init size should be 3 * batch_size + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1).fit(X) + assert km._init_size == 15 + + # if 3 * batch size < n_clusters, it should then be 3 * n_clusters + km = MiniBatchKMeans(n_clusters=10, batch_size=1, n_init=1).fit(X) + assert km._init_size == 30 + + # it should not be larger than n_samples + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1, + init_size=n_samples + 1).fit(X) + assert km._init_size == n_samples + + +def test_kmeans_copyx(): + # Check that copy_x=False returns nearly equal X after de-centering. 
+ my_X = X.copy() + km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) + km.fit(my_X) + _check_fitted_model(km) + + # check that my_X is de-centered + assert_allclose(my_X, X) + + +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_score_max_iter(estimator): + # Check that fitting KMeans or MiniBatchKMeans with more iterations gives + # better score + X = np.random.RandomState(0).randn(100, 10) + + km1 = estimator(n_init=1, random_state=42, max_iter=1) + s1 = km1.fit(X).score(X) + km2 = estimator(n_init=1, random_state=42, max_iter=10) + s2 = km2.fit(X).score(X) + assert s2 > s1 + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("init", ["random", "k-means++", "ndarray"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_predict(estimator, init, dtype, array_constr): + # Check the predict method and the equivalence between fit.predict and + # fit_predict. + if sys.platform == "darwin": + pytest.xfail( + "Known failures on MacOS, See " + "https://github.com/scikit-learn/scikit-learn/issues/12644") + + X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0) + + n_init = 1 if init == "ndarray" else 10 + init = X[:10] if init == "ndarray" else init + X = array_constr(X) + + km = estimator(n_clusters=10, init=init, n_init=n_init, + random_state=0).fit(X) + labels = km.labels_ + + # Due to randomness in the order in which chunks of data are processed when + # using more than one thread, there might be different rounding errors for + # the computation of the inertia for each init between 2 runs. This might + # result in a different ranking of the inits, hence a different labeling, + # which should still correspond to the same clustering + + # re-predict labels for training set using predict + pred = km.predict(X) + assert_allclose(v_measure_score(pred, labels), 1) + + # re-predict labels for training set using fit_predict + pred = km.fit_predict(X) + assert_allclose(v_measure_score(pred, labels), 1) + + # predict centroid labels + pred = km.predict(km.cluster_centers_) + assert_allclose(v_measure_score(pred, np.arange(10)), 1) + + @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_result_equal_in_diff_n_threads(estimator): # Check that KMeans/MiniBatchKMeans give the same results in parallel mode @@ -340,20 +526,6 @@ def test_dense_sparse(estimator): assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_fortran_aligned_data(estimator): - # Check that KMeans works with fortran-aligned data. 
- X_fortran = np.asfortranarray(X) - centers_fortran = np.asfortranarray(centers) - - km_c = estimator(n_clusters=n_clusters, init=centers, n_init=1, - random_state=42).fit(X) - km_f = estimator(n_clusters=n_clusters, init=centers_fortran, n_init=1, - random_state=42).fit(X_fortran) - assert_allclose(km_c.cluster_centers_, km_f.cluster_centers_) - assert_array_equal(km_c.labels_, km_f.labels_) - - @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_centers_not_mutated(estimator, dtype): @@ -405,19 +577,6 @@ def test_float_precision(Estimator, data): assert_array_equal(labels[np.float32], labels[np.float64]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_score_max_iter(estimator): - # Check that fitting KMeans or MiniBatchKMeans with more iterations gives - # better score - X = np.random.RandomState(0).randn(100, 10) - - km1 = estimator(n_init=1, random_state=42, max_iter=1) - s1 = km1.fit(X).score(X) - km2 = estimator(n_init=1, random_state=42, max_iter=10) - s2 = km2.fit(X).score(X) - assert s2 > s1 - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @@ -449,48 +608,6 @@ def test_integer_input(estimator, array_constr, dtype, init): assert km.cluster_centers_.dtype == np.float64 -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("init", ["random", "k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_predict(estimator, init, dtype, array_constr): - # Check the predict method and the equivalence between fit.predict and - # fit_predict. - if sys.platform == "darwin": - pytest.xfail( - "Known failures on MacOS, See " - "https://github.com/scikit-learn/scikit-learn/issues/12644") - - X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0) - - n_init = 1 if init == "ndarray" else 10 - init = X[:10] if init == "ndarray" else init - X = array_constr(X) - - km = estimator(n_clusters=10, init=init, n_init=n_init, - random_state=0).fit(X) - labels = km.labels_ - - # Due to randomness in the order in which chunks of data are processed when - # using more than one thread, there might be different rounding errors for - # the computation of the inertia for each init between 2 runs. This might - # result in a different ranking of the inits, hence a different labeling, - # which should still correspond to the same clustering - - # re-predict labels for training set using predict - pred = km.predict(X) - assert_allclose(v_measure_score(pred, labels), 1) - - # re-predict labels for training set using fit_predict - pred = km.fit_predict(X) - assert_allclose(v_measure_score(pred, labels), 1) - - # predict centroid labels - pred = km.predict(km.cluster_centers_) - assert_allclose(v_measure_score(pred, np.arange(10)), 1) - - @pytest.mark.parametrize("init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"]) @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) @@ -545,18 +662,6 @@ def test_sample_weight_unchanged(estimator): assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_verbose(estimator): - # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. 
- km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - km.fit(X) - finally: - sys.stdout = old_stdout - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @@ -586,17 +691,6 @@ def py_kmeans(X, init): assert_allclose(py_centers, cy_centers) -def test_kmeans_copyx(): - # Check that copy_x=False returns nearly equal X after de-centering. - my_X = X.copy() - km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) - km.fit(my_X) - _check_fitted_model(km) - - # check that my_X is de-centered - assert_allclose(my_X, X) - - @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) def test_kmeans_init_fitted_centers(data): # Check that starting fitting from a local optimum shouldn't change the @@ -692,100 +786,6 @@ def test_k_means_function(): assert inertia > 0.0 -def test_minibatch_kmeans_init_size(): - # Check the internal _init_size attribute of MiniBatchKMeans - - # default init size should be 3 * batch_size - km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1).fit(X) - assert km._init_size == 15 - - # if 3 * batch size < n_clusters, it should then be 3 * n_clusters - km = MiniBatchKMeans(n_clusters=10, batch_size=1, n_init=1).fit(X) - assert km._init_size == 30 - - # it should not be larger than n_samples - km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1, - init_size=n_samples + 1).fit(X) - assert km._init_size == n_samples - - -def test_minibatch_sensible_reassign(): - # check that identical initial clusters are reassigned - # also a regression test for when there are more desired reassignments than - # samples. - zeroed_X, true_labels = make_blobs(n_samples=100, centers=5, - random_state=42) - zeroed_X[::2, :] = 0 - - km = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42, - init="random").fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert km.cluster_centers_.any(axis=1).sum() > 10 - - # do the same with batch-size > X.shape[0] (regression test) - km = MiniBatchKMeans(n_clusters=20, batch_size=200, random_state=42, - init="random").fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert km.cluster_centers_.any(axis=1).sum() > 10 - - # do the same with partial_fit API - km = MiniBatchKMeans(n_clusters=20, random_state=42, init="random") - for i in range(100): - km.partial_fit(zeroed_X) - # there should not be too many exact zero cluster centers - assert km.cluster_centers_.any(axis=1).sum() > 10 - - -@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -def test_minibatch_reassign(data): - # Check the reassignment part of the minibatch step with very high or very - # low reassignment ratio. 
- perfect_centers = np.empty((n_clusters, n_features)) - for i in range(n_clusters): - perfect_centers[i] = X[true_labels == i].mean(axis=0) - - x_squared_norms = row_norms(data, squared=True) - sample_weight = np.ones(n_samples) - centers_new = np.empty_like(perfect_centers) - - # Give a perfect initialization, but a large reassignment_ratio, as a - # result many centers should be reassigned and the model should no longer - # be good - score_before = - _labels_inertia(data, sample_weight, x_squared_norms, - perfect_centers, 1)[1] - - _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, - centers_new, np.zeros(n_clusters), - np.random.RandomState(0), random_reassign=True, - reassignment_ratio=1) - - score_after = - _labels_inertia(data, sample_weight, x_squared_norms, - centers_new, 1)[1] - - assert score_before > score_after - - # Give a perfect initialization, with a small reassignment_ratio, - # no center should be reassigned. - _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, - centers_new, np.zeros(n_clusters), - np.random.RandomState(0), random_reassign=True, - reassignment_ratio=1e-15) - - assert_allclose(centers_new, perfect_centers) - - -def test_minibatch_with_many_reassignments(): - # Test for the case that the number of clusters to reassign is bigger - # than the batch_size. Run the test with 100 clusters and a batch_size of - # 10 because it turned out that these values ensure that the number of - # clusters to reassign is always bigger than the batch_size. - MiniBatchKMeans(n_clusters=100, - batch_size=10, - init_size=n_samples, - random_state=42, - verbose=True).fit(X) - - @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) @pytest.mark.parametrize("param, match", [ ({"n_init": 0}, r"n_init should be > 0"), From 6e78c7ed2f9c4c4dae901678728061a28a83f587 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 9 Jul 2020 14:33:35 +0200 Subject: [PATCH 33/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 99 ++++++++++++++------------- 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index a58e648b70692..2366a4f0b4156 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -464,55 +464,6 @@ def _sort_centers(centers): return np.sort(centers, axis=0) -def test_weighted_vs_repeated(): - # Check that a sample weight of N should yield the same result as an N-fold - # repetition of the sample. Valid only if init is precomputed, otherwise - # rng produces different results. Not valid for MinibatchKMeans due to rng - # to extract minibatches. - sample_weight = np.random.RandomState(0).randint(1, 5, size=n_samples) - X_repeat = np.repeat(X, sample_weight, axis=0) - - km = KMeans(init=centers, n_init=1, n_clusters=n_clusters, random_state=0) - - km_weighted = clone(km).fit(X, sample_weight=sample_weight) - repeated_labels = np.repeat(km_weighted.labels_, sample_weight) - km_repeated = clone(km).fit(X_repeat) - - assert_array_equal(km_repeated.labels_, repeated_labels) - assert_allclose(km_weighted.inertia_, km_repeated.inertia_) - assert_allclose(_sort_centers(km_weighted.cluster_centers_), - _sort_centers(km_repeated.cluster_centers_)) - - -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_unit_weights_vs_no_weights(estimator): - # Check that not passing sample weights should be equivalent to passing - # sample weights all equal to one. 
- sample_weight = np.ones(n_samples) - - km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) - km_none = clone(km).fit(X, sample_weight=None) - km_ones = clone(km).fit(X, sample_weight=sample_weight) - - assert_array_equal(km_none.labels_, km_ones.labels_) - assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) - - -@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_scaled_weights(estimator, data): - # Check that scaling all sample weights by a common factor - # shouldn't change the result - sample_weight = np.random.uniform(n_samples) - - km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) - km_orig = clone(km).fit(data, sample_weight=sample_weight) - km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) - - assert_array_equal(km_orig.labels_, km_scaled.labels_) - assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) - - @pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) def test_dense_sparse(estimator): # Check that the results are the same for dense and sparse input. @@ -577,6 +528,56 @@ def test_float_precision(Estimator, data): assert_array_equal(labels[np.float32], labels[np.float64]) +def test_weighted_vs_repeated(): + # Check that a sample weight of N should yield the same result as an N-fold + # repetition of the sample. Valid only if init is precomputed, otherwise + # rng produces different results. Not valid for MinibatchKMeans due to rng + # to extract minibatches. + sample_weight = np.random.RandomState(0).randint(1, 5, size=n_samples) + X_repeat = np.repeat(X, sample_weight, axis=0) + + km = KMeans(init=centers, n_init=1, n_clusters=n_clusters, random_state=0) + + km_weighted = clone(km).fit(X, sample_weight=sample_weight) + repeated_labels = np.repeat(km_weighted.labels_, sample_weight) + km_repeated = clone(km).fit(X_repeat) + + assert_array_equal(km_repeated.labels_, repeated_labels) + assert_allclose(km_weighted.inertia_, km_repeated.inertia_) + assert_allclose(_sort_centers(km_weighted.cluster_centers_), + _sort_centers(km_repeated.cluster_centers_)) + + +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_unit_weights_vs_no_weights(estimator, data): + # Check that not passing sample weights should be equivalent to passing + # sample weights all equal to one. 
+ sample_weight = np.ones(n_samples) + + km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km_none = clone(km).fit(data, sample_weight=None) + km_ones = clone(km).fit(data, sample_weight=sample_weight) + + assert_array_equal(km_none.labels_, km_ones.labels_) + assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) + + +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(estimator, data): + # Check that scaling all sample weights by a common factor + # shouldn't change the result + sample_weight = np.random.uniform(n_samples) + + km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km_orig = clone(km).fit(data, sample_weight=sample_weight) + km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) + + assert_array_equal(km_orig.labels_, km_scaled.labels_) + assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) + + @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) From f13441b8797db5793566261b6e153ff9f58eb9b0 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 9 Jul 2020 14:45:18 +0200 Subject: [PATCH 34/72] reduce diff --- sklearn/cluster/tests/test_k_means.py | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 2366a4f0b4156..b4a6052b8f3e3 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -477,21 +477,6 @@ def test_dense_sparse(estimator): assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) -@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_centers_not_mutated(estimator, dtype): - # Check that KMeans and MiniBatchKMeans won't mutate the user provided - # init centers silently even if input data and init centers have the same - # type. - X_new_type = X.astype(dtype, copy=True) - centers_new_type = centers.astype(dtype, copy=True) - - km = estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) - km.fit(X_new_type) - - assert not np.may_share_memory(km.cluster_centers_, centers) - - @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_float_precision(Estimator, data): @@ -528,6 +513,21 @@ def test_float_precision(Estimator, data): assert_array_equal(labels[np.float32], labels[np.float64]) +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +def test_centers_not_mutated(estimator, dtype): + # Check that KMeans and MiniBatchKMeans won't mutate the user provided + # init centers silently even if input data and init centers have the same + # type. + X_new_type = X.astype(dtype, copy=True) + centers_new_type = centers.astype(dtype, copy=True) + + km = estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) + km.fit(X_new_type) + + assert not np.may_share_memory(km.cluster_centers_, centers) + + def test_weighted_vs_repeated(): # Check that a sample weight of N should yield the same result as an N-fold # repetition of the sample. 
Valid only if init is precomputed, otherwise From f08d3d281c9cee6567d5affc04a2bada6279e7fe Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 10 Jul 2020 01:28:00 +0200 Subject: [PATCH 35/72] fix merge conflicts --- sklearn/cluster/_kmeans.py | 4 ++-- sklearn/cluster/tests/test_k_means.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7cc0a1bed5b26..7662893bb5d1d 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -938,7 +938,7 @@ def fit(self, X, y=None, sample_weight=None): init = self.init if hasattr(init, '__array__'): init = check_array(init, dtype=X.dtype, copy=True, order='C') - self._validate_center_shape(X, self.n_clusters, init) + self._validate_center_shape(X, init) # subtract of mean of x for more accurate distance computations if not sp.issparse(X): @@ -1707,7 +1707,7 @@ def partial_fit(self, X, y=None, sample_weight=None): order='C', accept_large_sparse=False, reset=is_first_call_to_partial_fit) - self.random_state_ = getattr(self, "random_state_", + self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 2f62d1891ac23..7e09d2214a7c0 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -799,7 +799,7 @@ def test_k_means_function(): assert inertia > 0.0 -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) @pytest.mark.parametrize("param, match", [ ({"n_init": 0}, r"n_init should be > 0"), ({"max_iter": 0}, r"max_iter should be > 0"), From b712de691f1bad3330a2d0ce22ecb427cac153ec Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 10 Jul 2020 01:29:12 +0200 Subject: [PATCH 36/72] Estimator --- sklearn/cluster/tests/test_k_means.py | 96 +++++++++++++-------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 7e09d2214a7c0..934165e004619 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -284,10 +284,10 @@ def test_fortran_aligned_data(Estimator): assert_array_equal(km_c.labels_, km_f.labels_) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_verbose(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_verbose(Estimator): # Check verbose mode of KMeans and MiniBatchKMeans for better coverage. 
- km = estimator(n_clusters=n_clusters, random_state=42, verbose=1) + km = Estimator(n_clusters=n_clusters, random_state=42, verbose=1) old_stdout = sys.stdout sys.stdout = StringIO() try: @@ -401,15 +401,15 @@ def test_kmeans_copyx(): assert_allclose(my_X, X) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_score_max_iter(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_score_max_iter(Estimator): # Check that fitting KMeans or MiniBatchKMeans with more iterations gives # better score X = np.random.RandomState(0).randn(100, 10) - km1 = estimator(n_init=1, random_state=42, max_iter=1) + km1 = Estimator(n_init=1, random_state=42, max_iter=1) s1 = km1.fit(X).score(X) - km2 = estimator(n_init=1, random_state=42, max_iter=10) + km2 = Estimator(n_init=1, random_state=42, max_iter=10) s2 = km2.fit(X).score(X) assert s2 > s1 @@ -418,8 +418,8 @@ def test_score_max_iter(estimator): ids=["dense", "sparse"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("init", ["random", "k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_predict(estimator, init, dtype, array_constr): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_predict(Estimator, init, dtype, array_constr): # Check the predict method and the equivalence between fit.predict and # fit_predict. if sys.platform == "darwin": @@ -433,7 +433,7 @@ def test_predict(estimator, init, dtype, array_constr): init = X[:10] if init == "ndarray" else init X = array_constr(X) - km = estimator(n_clusters=10, init=init, n_init=n_init, + km = Estimator(n_clusters=10, init=init, n_init=n_init, random_state=0).fit(X) labels = km.labels_ @@ -456,18 +456,18 @@ def test_predict(estimator, init, dtype, array_constr): assert_allclose(v_measure_score(pred, np.arange(10)), 1) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_result_equal_in_diff_n_threads(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_result_equal_in_diff_n_threads(Estimator): # Check that KMeans/MiniBatchKMeans give the same results in parallel mode # than in sequential mode. rnd = np.random.RandomState(0) X = rnd.normal(size=(50, 10)) with threadpool_limits(limits=1, user_api="openmp"): - result_1 = estimator( + result_1 = Estimator( n_clusters=n_clusters, random_state=0).fit(X).labels_ with threadpool_limits(limits=2, user_api="openmp"): - result_2 = estimator( + result_2 = Estimator( n_clusters=n_clusters, random_state=0).fit(X).labels_ assert_array_equal(result_1, result_2) @@ -476,13 +476,13 @@ def _sort_centers(centers): return np.sort(centers, axis=0) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_dense_sparse(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_dense_sparse(Estimator): # Check that the results are the same for dense and sparse input. 
sample_weight = np.random.RandomState(0).random_sample((n_samples,)) - km_dense = estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_dense = Estimator(n_clusters=n_clusters, random_state=0, n_init=1) km_dense.fit(X, sample_weight=sample_weight) - km_sparse = estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_sparse = Estimator(n_clusters=n_clusters, random_state=0, n_init=1) km_sparse.fit(X_csr, sample_weight=sample_weight) assert_array_equal(km_dense.labels_, km_sparse.labels_) @@ -526,15 +526,15 @@ def test_float_precision(Estimator, data): @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_centers_not_mutated(estimator, dtype): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_centers_not_mutated(Estimator, dtype): # Check that KMeans and MiniBatchKMeans won't mutate the user provided # init centers silently even if input data and init centers have the same # type. X_new_type = X.astype(dtype, copy=True) centers_new_type = centers.astype(dtype, copy=True) - km = estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) + km = Estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) km.fit(X_new_type) assert not np.may_share_memory(km.cluster_centers_, centers) @@ -561,13 +561,13 @@ def test_weighted_vs_repeated(): @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_unit_weights_vs_no_weights(estimator, data): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_unit_weights_vs_no_weights(Estimator, data): # Check that not passing sample weights should be equivalent to passing # sample weights all equal to one. sample_weight = np.ones(n_samples) - km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) km_none = clone(km).fit(data, sample_weight=None) km_ones = clone(km).fit(data, sample_weight=sample_weight) @@ -576,13 +576,13 @@ def test_unit_weights_vs_no_weights(estimator, data): @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_scaled_weights(estimator, data): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(Estimator, data): # Check that scaling all sample weights by a common factor # shouldn't change the result sample_weight = np.random.uniform(n_samples) - km = estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) km_orig = clone(km).fit(data, sample_weight=sample_weight) km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) @@ -594,8 +594,8 @@ def test_scaled_weights(estimator, data): ids=["dense", "sparse"]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("init", ["k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_integer_input(estimator, array_constr, dtype, init): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_integer_input(Estimator, array_constr, dtype, init): # Check that KMeans and MiniBatchKMeans work with integer input. 
X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]) X = array_constr(X_dense, dtype=dtype) @@ -603,8 +603,8 @@ def test_integer_input(estimator, array_constr, dtype, init): n_init = 1 if init == "ndarray" else 10 init = X_dense[:2] if init == "ndarray" else init - km = estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) - if estimator is MiniBatchKMeans: + km = Estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) + if Estimator is MiniBatchKMeans: km.set_params(batch_size=2) km.fit(X) @@ -616,19 +616,19 @@ def test_integer_input(estimator, array_constr, dtype, init): assert_allclose(v_measure_score(km.labels_, expected_labels), 1) # Same with partial_fit (#14314) - if estimator is MiniBatchKMeans: + if Estimator is MiniBatchKMeans: km = clone(km).partial_fit(X) assert km.cluster_centers_.dtype == np.float64 @pytest.mark.parametrize("init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"]) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_predict_dense_sparse(estimator, init): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_predict_dense_sparse(Estimator, init): # check that models trained on sparse input also works for dense input at # predict time and vice versa. n_init = 10 if type(init) is str else 1 - km = estimator(n_clusters=n_clusters, init=init, n_init=n_init, + km = Estimator(n_clusters=n_clusters, init=init, n_init=n_init, random_state=0) km.fit(X_csr) @@ -638,10 +638,10 @@ def test_predict_dense_sparse(estimator, init): assert_array_equal(km.predict(X_csr), km.labels_) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_transform(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_transform(Estimator): # Check the transform method - km = estimator(n_clusters=n_clusters).fit(X) + km = Estimator(n_clusters=n_clusters).fit(X) # Transorfming cluster_centers_ should return the pairwise distances # between centers @@ -656,20 +656,20 @@ def test_transform(estimator): assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_)) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_fit_transform(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_fit_transform(Estimator): # Check equivalence between fit.transform and fit_transform - X1 = estimator(random_state=0, n_init=1).fit(X).transform(X) - X2 = estimator(random_state=0, n_init=1).fit_transform(X) + X1 = Estimator(random_state=0, n_init=1).fit(X).transform(X) + X2 = Estimator(random_state=0, n_init=1).fit_transform(X) assert_allclose(X1, X2) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_sample_weight_unchanged(estimator): +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_sample_weight_unchanged(Estimator): # Check that sample_weight is not modified in place by KMeans (#17204) X = np.array([[1], [2], [4]]) sample_weight = np.array([0.5, 0.2, 0.3]) - estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) + Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) # internally, sample_weight is rescale to sum up to n_samples = 3 assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) @@ -850,13 +850,13 @@ def test_minibatch_kmeans_wrong_params(param, match): MiniBatchKMeans(**param).fit(X) -@pytest.mark.parametrize("estimator", [KMeans, MiniBatchKMeans]) -def test_warnings(estimator): 
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_warnings(Estimator): # Check warning messages common to KMeans and MiniBatchKMeans with pytest.warns(RuntimeWarning, match="Explicit initial center position passed: " "performing only one init"): - estimator(init=centers, n_clusters=n_clusters).fit(X) + Estimator(init=centers, n_clusters=n_clusters).fit(X) def test_kmeans_warnings(): From 153a06f79749838f52d162fd6fd02442c7617b28 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 17 Jul 2020 14:52:59 +0200 Subject: [PATCH 37/72] cln --- sklearn/cluster/tests/test_k_means.py | 328 +++++++++++++------------- 1 file changed, 164 insertions(+), 164 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index c2a765735ebd5..b50fd79eabc33 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -44,20 +44,6 @@ X_csr = sp.csr_matrix(X) -def _check_fitted_model(km): - # check that the number of clusters centers and distinct labels match - # the expectation - centers = km.cluster_centers_ - assert centers.shape == (n_clusters, n_features) - - labels = km.labels_ - assert np.unique(labels).shape[0] == n_clusters - - # check that the labels assignment are perfect (up to a permutation) - assert_allclose(v_measure_score(true_labels, labels), 1.0) - assert km.inertia_ > 0.0 - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @@ -245,6 +231,20 @@ def test_minibatch_update_consistency(): assert_allclose(new_inertia, new_inertia_csr) +def _check_fitted_model(km): + # check that the number of clusters centers and distinct labels match + # the expectation + centers = km.cluster_centers_ + assert centers.shape == (n_clusters, n_features) + + labels = km.labels_ + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + assert km.inertia_ > 0.0 + + @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) @pytest.mark.parametrize("init", ["random", "k-means++", centers, lambda X, k, random_state: centers], @@ -473,10 +473,6 @@ def test_predict(Estimator, init, dtype, array_constr): assert_allclose(v_measure_score(pred, np.arange(10)), 1) -def _sort_centers(centers): - return np.sort(centers, axis=0) - - @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_dense_sparse(Estimator): # Check that the results are the same for dense and sparse input. @@ -490,6 +486,80 @@ def test_dense_sparse(Estimator): assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) +@pytest.mark.parametrize("init", ["random", "k-means++", centers], + ids=["random", "k-means++", "ndarray"]) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_predict_dense_sparse(Estimator, init): + # check that models trained on sparse input also works for dense input at + # predict time and vice versa. 
+ n_init = 10 if type(init) is str else 1 + km = Estimator(n_clusters=n_clusters, init=init, n_init=n_init, + random_state=0) + + km.fit(X_csr) + assert_array_equal(km.predict(X), km.labels_) + + km.fit(X) + assert_array_equal(km.predict(X_csr), km.labels_) + + +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("init", ["k-means++", "ndarray"]) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_integer_input(Estimator, array_constr, dtype, init): + # Check that KMeans and MiniBatchKMeans work with integer input. + X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]) + X = array_constr(X_dense, dtype=dtype) + + n_init = 1 if init == "ndarray" else 10 + init = X_dense[:2] if init == "ndarray" else init + + km = Estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) + if Estimator is MiniBatchKMeans: + km.set_params(batch_size=2) + + km.fit(X) + + # Internally integer input should be converted to float64 + assert km.cluster_centers_.dtype == np.float64 + + expected_labels = [0, 1, 1, 0, 0, 1] + assert_allclose(v_measure_score(km.labels_, expected_labels), 1) + + # Same with partial_fit (#14314) + if Estimator is MiniBatchKMeans: + km = clone(km).partial_fit(X) + assert km.cluster_centers_.dtype == np.float64 + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_transform(Estimator): + # Check the transform method + km = Estimator(n_clusters=n_clusters).fit(X) + + # Transorfming cluster_centers_ should return the pairwise distances + # between centers + Xt = km.transform(km.cluster_centers_) + assert_allclose(Xt, pairwise_distances(km.cluster_centers_)) + # In particular, diagonal must be 0 + assert_array_equal(Xt.diagonal(), np.zeros(n_clusters)) + + # Transorfming X should return the pairwise distances between X and the + # centers + Xt = km.transform(X) + assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_)) + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_fit_transform(Estimator): + # Check equivalence between fit.transform and fit_transform + X1 = Estimator(random_state=0, n_init=1).fit(X).transform(X) + X2 = Estimator(random_state=0, n_init=1).fit_transform(X) + assert_allclose(X1, X2) + + def test_k_means_function(): # test calling the k_means function directly cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, @@ -554,6 +624,17 @@ def test_centers_not_mutated(Estimator, dtype): assert not np.may_share_memory(km.cluster_centers_, centers_new_type) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_kmeans_init_fitted_centers(data): + # Check that starting fitting from a local optimum shouldn't change the + # solution + km1 = KMeans(n_clusters=n_clusters).fit(data) + km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, + n_init=1).fit(data) + + assert_allclose(km1.cluster_centers_, km2.cluster_centers_) + + def test_kmeans_warns_less_centers_than_unique_points(): # Check KMeans when the number of found clusters is smaller than expected X = np.asarray([[0, 0], @@ -572,6 +653,10 @@ def test_kmeans_warns_less_centers_than_unique_points(): assert set(km.labels_) == set(range(3)) +def _sort_centers(centers): + return np.sort(centers, axis=0) + + def test_weighted_vs_repeated(): # Check that a sample weight of N should yield the same result as an N-fold # repetition of the sample. 
Valid only if init is precomputed, otherwise @@ -622,70 +707,69 @@ def test_scaled_weights(Estimator, data): assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("dtype", [np.int32, np.int64]) -@pytest.mark.parametrize("init", ["k-means++", "ndarray"]) -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_integer_input(Estimator, array_constr, dtype, init): - # Check that KMeans and MiniBatchKMeans work with integer input. - X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]) - X = array_constr(X_dense, dtype=dtype) - - n_init = 1 if init == "ndarray" else 10 - init = X_dense[:2] if init == "ndarray" else init - - km = Estimator(n_clusters=2, init=init, n_init=n_init, random_state=0) - if Estimator is MiniBatchKMeans: - km.set_params(batch_size=2) +def test_kmeans_elkan_iter_attribute(): + # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off + # it's right value (#11340). + km = KMeans(algorithm="elkan", max_iter=1).fit(X) + assert km.n_iter_ == 1 - km.fit(X) - # Internally integer input should be converted to float64 - assert km.cluster_centers_.dtype == np.float64 +@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], + ids=["dense", "sparse"]) +def test_kmeans_empty_cluster_relocated(array_constr): + # check that empty clusters are correctly relocated when using sample + # weights (#13486) + X = array_constr([[-1], [1]]) + sample_weight = [1.9, 0.1] + init = np.array([[-1], [10]]) - expected_labels = [0, 1, 1, 0, 0, 1] - assert_allclose(v_measure_score(km.labels_, expected_labels), 1) + km = KMeans(n_clusters=2, init=init, n_init=1) + km.fit(X, sample_weight=sample_weight) - # Same with partial_fit (#14314) - if Estimator is MiniBatchKMeans: - km = clone(km).partial_fit(X) - assert km.cluster_centers_.dtype == np.float64 + assert len(set(km.labels_)) == 2 + assert_allclose(km.cluster_centers_, [[-1], [1]]) -@pytest.mark.parametrize("init", ["random", "k-means++", centers], - ids=["random", "k-means++", "ndarray"]) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_predict_dense_sparse(Estimator, init): - # check that models trained on sparse input also works for dense input at - # predict time and vice versa. - n_init = 10 if type(init) is str else 1 - km = Estimator(n_clusters=n_clusters, init=init, n_init=n_init, - random_state=0) +def test_result_equal_in_diff_n_threads(Estimator): + # Check that KMeans/MiniBatchKMeans give the same results in parallel mode + # than in sequential mode. 
+ rnd = np.random.RandomState(0) + X = rnd.normal(size=(50, 10)) - km.fit(X_csr) - assert_array_equal(km.predict(X), km.labels_) + with threadpool_limits(limits=1, user_api="openmp"): + result_1 = Estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ + with threadpool_limits(limits=2, user_api="openmp"): + result_2 = Estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ + assert_array_equal(result_1, result_2) - km.fit(X) - assert_array_equal(km.predict(X_csr), km.labels_) +@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) +def test_precompute_distance_deprecated(precompute_distances): + # FIXME: remove in 0.25 + depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " + "will be removed in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + precompute_distances=precompute_distances) -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_transform(Estimator): - # Check the transform method - km = Estimator(n_clusters=n_clusters).fit(X) + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) - # Transorfming cluster_centers_ should return the pairwise distances - # between centers - Xt = km.transform(km.cluster_centers_) - assert_allclose(Xt, pairwise_distances(km.cluster_centers_)) - # In particular, diagonal must be 0 - assert_array_equal(Xt.diagonal(), np.zeros(n_clusters)) - # Transorfming X should return the pairwise distances between X and the - # centers - Xt = km.transform(X) - assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_)) +@pytest.mark.parametrize("n_jobs", [None, 1]) +def test_n_jobs_deprecated(n_jobs): + # FIXME: remove in 0.25 + depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " + "in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + n_jobs=n_jobs) + + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) @pytest.mark.parametrize("attr", ["counts_", "init_size_", "random_state_"]) @@ -709,25 +793,6 @@ def test_warning_elkan_1_cluster(): KMeans(n_clusters=1, algorithm="elkan").fit(X) -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_fit_transform(Estimator): - # Check equivalence between fit.transform and fit_transform - X1 = Estimator(random_state=0, n_init=1).fit(X).transform(X) - X2 = Estimator(random_state=0, n_init=1).fit_transform(X) - assert_allclose(X1, X2) - - -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_sample_weight_unchanged(Estimator): - # Check that sample_weight is not modified in place by KMeans (#17204) - X = np.array([[1], [2], [4]]) - sample_weight = np.array([0.5, 0.2, 0.3]) - Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) - - # internally, sample_weight is rescale to sum up to n_samples = 3 - assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) - - @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]) @pytest.mark.parametrize("algo", ["full", "elkan"]) @@ -757,40 +822,6 @@ def py_kmeans(X, init): assert_allclose(py_centers, cy_centers) -@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) -def test_kmeans_init_fitted_centers(data): - # Check that starting fitting from a local optimum shouldn't change the - # solution - km1 = 
KMeans(n_clusters=n_clusters).fit(data) - km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, - n_init=1).fit(data) - - assert_allclose(km1.cluster_centers_, km2.cluster_centers_) - - -def test_kmeans_elkan_iter_attribute(): - # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off - # it's right value (#11340). - km = KMeans(algorithm="elkan", max_iter=1).fit(X) - assert km.n_iter_ == 1 - - -@pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -def test_kmeans_empty_cluster_relocated(array_constr): - # check that empty clusters are correctly relocated when using sample - # weights (#13486) - X = array_constr([[-1], [1]]) - sample_weight = [1.9, 0.1] - init = np.array([[-1], [10]]) - - km = KMeans(n_clusters=2, init=init, n_init=1) - km.fit(X, sample_weight=sample_weight) - - assert len(set(km.labels_)) == 2 - assert_allclose(km.cluster_centers_, [[-1], [1]]) - - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("squared", [True, False]) def test_euclidean_distance(dtype, squared): @@ -839,6 +870,17 @@ def test_inertia(dtype): assert_allclose(inertia_sparse, expected, rtol=1e-6) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_sample_weight_unchanged(Estimator): + # Check that sample_weight is not modified in place by KMeans (#17204) + X = np.array([[1], [2], [4]]) + sample_weight = np.array([0.5, 0.2, 0.3]) + Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) + + # internally, sample_weight is rescale to sum up to n_samples = 3 + assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) + + @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) @pytest.mark.parametrize("param, match", [ ({"n_init": 0}, r"n_init should be > 0"), @@ -888,45 +930,3 @@ def test_minibatch_kmeans_wrong_params(param, match): # are passed for the MiniBatchKMeans specific parameters with pytest.raises(ValueError, match=match): MiniBatchKMeans(**param).fit(X) - - -@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) -def test_result_equal_in_diff_n_threads(Estimator): - # Check that KMeans/MiniBatchKMeans give the same results in parallel mode - # than in sequential mode. 
- rnd = np.random.RandomState(0) - X = rnd.normal(size=(50, 10)) - - with threadpool_limits(limits=1, user_api="openmp"): - result_1 = Estimator( - n_clusters=n_clusters, random_state=0).fit(X).labels_ - with threadpool_limits(limits=2, user_api="openmp"): - result_2 = Estimator( - n_clusters=n_clusters, random_state=0).fit(X).labels_ - assert_array_equal(result_1, result_2) - - -@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) -def test_precompute_distance_deprecated(precompute_distances): - # FIXME: remove in 0.25 - depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " - "will be removed in 0.25.") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, - precompute_distances=precompute_distances) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - -@pytest.mark.parametrize("n_jobs", [None, 1]) -def test_n_jobs_deprecated(n_jobs): - # FIXME: remove in 0.25 - depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 0.25.") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, - n_jobs=n_jobs) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) From d2d68322b2af6c251503c3aec9d9c9b321e7ba4d Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 17 Jul 2020 15:35:32 +0200 Subject: [PATCH 38/72] cln --- sklearn/cluster/tests/test_k_means.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index b50fd79eabc33..93eb1a9679c33 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -19,8 +19,8 @@ from sklearn.metrics.cluster import v_measure_score from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans -from sklearn.cluster._kmeans import _mini_batch_step from sklearn.cluster._kmeans import _labels_inertia +from sklearn.cluster._kmeans import _mini_batch_step from sklearn.cluster._k_means_common import _relocate_empty_clusters_dense from sklearn.cluster._k_means_common import _relocate_empty_clusters_sparse from sklearn.cluster._k_means_common import _euclidean_dense_dense_wrapper @@ -70,8 +70,8 @@ def test_kmeans_results(array_constr, algo, dtype): @pytest.mark.parametrize("array_constr", [np.array, sp.csr_matrix], - ids=["dense", "sparse"]) -@pytest.mark.parametrize("algo", ["full", "elkan"]) + ids=['dense', 'sparse']) +@pytest.mark.parametrize("algo", ['full', 'elkan']) def test_kmeans_relocated_clusters(array_constr, algo): # check that empty clusters are relocated as expected X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) @@ -560,6 +560,17 @@ def test_fit_transform(Estimator): assert_allclose(X1, X2) +def test_n_init(): + # Check that increasing the number of init increases the quality + previous_inertia = np.inf + for n_init in [1, 5, 10]: + # set max_iter=1 to avoid finding the global minimum and get the same + # inertia each time + km = KMeans(n_clusters=n_clusters, init="random", n_init=n_init, + random_state=0, max_iter=1).fit(X) + assert km.inertia_ <= previous_inertia + + def test_k_means_function(): # test calling the k_means function directly cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, @@ -752,7 +763,7 @@ def 
test_precompute_distance_deprecated(precompute_distances): depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " "will be removed in 0.25.") X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, precompute_distances=precompute_distances) with pytest.warns(FutureWarning, match=depr_msg): @@ -765,7 +776,7 @@ def test_n_jobs_deprecated(n_jobs): depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " "in 0.25.") X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init="random", random_state=0, + kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, n_jobs=n_jobs) with pytest.warns(FutureWarning, match=depr_msg): @@ -877,7 +888,6 @@ def test_sample_weight_unchanged(Estimator): sample_weight = np.array([0.5, 0.2, 0.3]) Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) - # internally, sample_weight is rescale to sum up to n_samples = 3 assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) From 690f5b9ed37a6bf3731c72edc8ccbf5c685176eb Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 17 Jul 2020 15:40:59 +0200 Subject: [PATCH 39/72] cln --- sklearn/cluster/_kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 8b0bbe257c08a..ad0c5b49eb6b4 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -548,7 +548,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, The resulting assignment. inertia : float - Sum of squared distances of samples to their closest cluster center + Sum of squared distances of samples to their closest cluster center. 
""" n_samples = X.shape[0] n_clusters = centers.shape[0] @@ -950,7 +950,7 @@ def fit(self, X, y=None, sample_weight=None): # The copy was already done above X -= X_mean - if hasattr(self.init, '__array__'): + if hasattr(init, '__array__'): init -= X_mean # precompute squared norms of data points From 158aeed9806d7170f5cc5a464c73a04988392764 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Wed, 22 Jul 2020 12:32:32 +0200 Subject: [PATCH 40/72] cln --- sklearn/cluster/_kmeans.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index ad0c5b49eb6b4..04d323f2258ac 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -32,8 +32,8 @@ from ..exceptions import ConvergenceWarning from ._k_means_common import _inertia_dense from ._k_means_common import _inertia_sparse -from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_minibatch import _minibatch_update_dense +from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_lloyd import lloyd_iter_chunked_dense from ._k_means_lloyd import lloyd_iter_chunked_sparse from ._k_means_elkan import init_bounds_dense @@ -45,8 +45,7 @@ ############################################################################### # Initialization heuristic -def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, - n_local_trials=None): +def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): """Init n_clusters seeds according to k-means++ Parameters @@ -884,9 +883,8 @@ def _init_centroids(self, X, x_squared_norms, init, random_state, n_samples = X.shape[0] if isinstance(init, str) and init == 'k-means++': - centers = _kmeans_plusplus(X, n_clusters, - random_state=random_state, - x_squared_norms=x_squared_norms) + centers = _k_init(X, n_clusters, random_state=random_state, + x_squared_norms=x_squared_norms) elif isinstance(init, str) and init == 'random': seeds = random_state.permutation(n_samples)[:n_clusters] centers = X[seeds] @@ -1350,9 +1348,6 @@ class MiniBatchKMeans(KMeans): defined as the sum of square distances of samples to their cluster center, weighted by the sample weights if provided. - n_iter_ : int - Number of iterations run. - n_iter_ : int Number of batches processed. 
From e11cedb89be63ed77478610386fcff5754eea0d6 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Thu, 23 Jul 2020 13:23:27 +0200 Subject: [PATCH 41/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 43 ++++++++++------------ sklearn/cluster/_kmeans.py | 50 ++++---------------------- 2 files changed, 25 insertions(+), 68 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 49af1c7426d0a..60d10c47c320b 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -7,9 +7,7 @@ cimport numpy as np from cython cimport floating from cython.parallel cimport parallel, prange -from libc.math cimport sqrt from libc.stdlib cimport malloc, free -from libc.string cimport memcpy np.import_array() @@ -53,22 +51,23 @@ def _minibatch_update_dense( cdef: int n_samples = X.shape[0] int n_clusters = centers_old.shape[0] - int i + int cluster_idx int *indices with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters, schedule="static"): - update_center_dense(i, &X[0, 0], sample_weight, centers_old, - centers_new, weight_sums, labels, indices) + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_dense(cluster_idx, &X[0, 0], sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) free(indices) cdef void update_center_dense( - int i, + int cluster_idx, floating *X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers_old, # IN @@ -89,36 +88,34 @@ cdef void update_center_dense( # indices = np.where(labels == i)[0] k = 0 for j in range(n_samples): - if labels[j] == i: + if labels[j] == cluster_idx: indices[k] = j + wsum += sample_weight[j] k += 1 n_indices = k - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - if wsum > 0: - # Remove previous count scaling + # Undo the previous count-based scaling for this cluster center for k in range(n_features): - centers_new[i, k] = centers_old[i, k] * weight_sums[i] + centers_new[cluster_idx, k] = centers_old[cluster_idx, k] * weight_sums[cluster_idx] # Update cluster with new point members for j in range(n_indices): idx = indices[j] for k in range(n_features): - centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] + centers_new[cluster_idx, k] += X[idx * n_features + k] * sample_weight[idx] # Update the count statistics for this center - weight_sums[i] += wsum + weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] + alpha = 1 / weight_sums[cluster_idx] for k in range(n_features): - centers_new[i, k] *= alpha + centers_new[cluster_idx, k] *= alpha else: + # No sample was assigned to this cluster in this batch of data for k in range(n_features): - centers_new[i, k] = centers_old[i, k] + centers_new[cluster_idx, k] = centers_old[cluster_idx, k] def _minibatch_update_sparse( @@ -203,15 +200,12 @@ cdef void update_center_sparse( for j in range(n_samples): if labels[j] == i: indices[k] = j + wsum += sample_weight[j] k += 1 n_indices = k - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - if wsum > 0: - # Remove previous count scaling + # Undo the previous count-based scaling for this cluster center: for k in range(n_features): centers_new[i, k] = centers_old[i, k] * weight_sums[i] @@ -229,5 +223,6 @@ cdef void update_center_sparse( for k in range(n_features): centers_new[i, k] *= alpha else: + # No sample was assigned to this cluster in this batch of 
data for k in range(n_features): centers_new[i, k] = centers_old[i, k] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 04d323f2258ac..b85d2a34a314e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -24,7 +24,6 @@ from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import _deprecate_positional_args from ..utils import check_array -from ..utils import gen_batches from ..utils import check_random_state from ..utils import deprecated from ..utils.validation import check_is_fitted, _check_sample_weight @@ -1476,45 +1475,6 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") - def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, - centers): - """Compute labels and inertia using mini batches. - - This is slightly slower than doing everything at once but preventes - memory errors / segfaults. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data. - - sample_weight : ndarray of shape (n_samples,) - The weights for each observation in X. - - x_squared_norms : ndarray of shape (n_samples,) - Precomputed squared euclidean norm of each data point, to speed up - computations. - - centers : ndarray of shape (n_clusters, n_features) - The cluster centers. - - Returns - ------- - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - inertia : float - Sum of squared distances of points to nearest cluster. - """ - if self.verbose: - print('Computing label assignment and total inertia') - slices = gen_batches(X.shape[0], self.batch_size) - results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - centers, n_threads=self._n_threads) - for s in slices] - labels, inertia = zip(*results) - return np.hstack(labels), np.sum(inertia) - def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, centers_squared_diff, batch_inertia): """Helper function to encapsulate the early stopping logic""" @@ -1708,8 +1668,9 @@ def fit(self, X, y=None, sample_weight=None): self.n_iter_ = i + 1 if self.compute_labels: - self.labels_, self.inertia_ = self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_) + self.labels_, self.inertia_ = _labels_inertia( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) return self @@ -1819,8 +1780,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_)[0] + return _labels_inertia( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads)[0] def _more_tags(self): return { From bbdabf540adf85994ed4b7b9434bc9db0a0020d9 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:06:15 +0200 Subject: [PATCH 42/72] threadpool-limit protection --- sklearn/cluster/_kmeans.py | 131 +++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 57 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index b85d2a34a314e..e140549ed99ff 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -573,6 +573,16 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, return labels, inertia +def _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, + centers, n_threads=None): + """Same as 
_labels_inertia but in a threadpool_limits context.""" + with threadpool_limits(limits=1, user_api="blas"): + labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, + centers, n_threads) + + return labels, inertia + + class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): """K-Means clustering. @@ -1102,8 +1112,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[0] + return _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[0] def score(self, X, y=None, sample_weight=None): """Opposite of the value of X on the K-means objective. @@ -1131,8 +1142,9 @@ def score(self, X, y=None, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return -_labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[1] + return -_labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[1] def _more_tags(self): return { @@ -1600,7 +1612,7 @@ def fit(self, X, y=None, sample_weight=None): random_state=random_state, init_size=self._init_size) # Compute inertia on a validation set. - _, inertia = _labels_inertia( + _, inertia = _labels_inertia_threadpool_limit( X_valid, sample_weight_valid, x_squared_norms_valid, cluster_centers, n_threads=self._n_threads) @@ -1625,50 +1637,52 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - # Perform the iterative optimization until convergence - for i in range(n_iter): - # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint(0, n_samples, - self.batch_size) - - # Randomly choose whether to perform random reassignment: - # the choice is done as a function of the iteration index, and the - # minimum number of counts, in order to force this reassignment to - # happen every once in a while. 
- random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 - - # Perform the actual update step on the minibatch data - batch_inertia = _mini_batch_step( - X=X[minibatch_indices], - x_squared_norms=x_squared_norms[minibatch_indices], - sample_weight=sample_weight[minibatch_indices], - centers=centers, - centers_new=centers_new, - weight_sums=self._counts, - random_state=random_state, - random_reassign=random_reassign, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose, - n_threads=self._n_threads) - - if self._tol > 0.0: - centers_squared_diff = np.sum((centers_new - centers)**2) - else: - centers_squared_diff = 0 - - centers, centers_new = centers_new, centers - - # Monitor convergence and do early stopping if necessary - if self._mini_batch_convergence( - i, n_iter, n_samples, centers_squared_diff, batch_inertia): - break + with threadpool_limits(limits=1, user_api="blas"): + # Perform the iterative optimization until convergence + for i in range(n_iter): + # Sample a minibatch from the full dataset + minibatch_indices = random_state.randint(0, n_samples, + self.batch_size) + + # Randomly choose whether to perform random reassignment: + # the choice is done as a function of the iteration index, and + # the minimum number of counts, in order to force this + # reassignment to happen every once in a while. + random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + + # Perform the actual update step on the minibatch data + batch_inertia = _mini_batch_step( + X=X[minibatch_indices], + x_squared_norms=x_squared_norms[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + centers=centers, + centers_new=centers_new, + weight_sums=self._counts, + random_state=random_state, + random_reassign=random_reassign, + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) + + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers)**2) + else: + centers_squared_diff = 0 + + centers, centers_new = centers_new, centers + + # Monitor convergence and do early stopping if necessary + if self._mini_batch_convergence( + i, n_iter, n_samples, centers_squared_diff, + batch_inertia): + break self.cluster_centers_ = centers self.n_iter_ = i + 1 if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) @@ -1734,20 +1748,21 @@ def partial_fit(self, X, y=None, sample_weight=None): random_reassign = self._random_state.randint( 10 * (1 + self._counts.min())) == 0 - _mini_batch_step(X, - x_squared_norms=x_squared_norms, - sample_weight=sample_weight, - centers=self.cluster_centers_, - centers_new=self.cluster_centers_, - weight_sums=self._counts, - random_state=self._random_state, - random_reassign=random_reassign, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose, - n_threads=self._n_threads) + with threadpool_limits(limits=1, user_api="blas"): + _mini_batch_step(X, + x_squared_norms=x_squared_norms, + sample_weight=sample_weight, + centers=self.cluster_centers_, + centers_new=self.cluster_centers_, + weight_sums=self._counts, + random_state=self._random_state, + random_reassign=random_reassign, + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( 
X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) @@ -1780,9 +1795,11 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia( + labels, _ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, - n_threads=self._n_threads)[0] + n_threads=self._n_threads) + + return labels def _more_tags(self): return { From 53691e454bd55fb01f96bf6e2df2274bbd3c28ce Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:36:45 +0200 Subject: [PATCH 43/72] idx --- sklearn/cluster/_k_means_minibatch.pyx | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 60d10c47c320b..6476336a4078b 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -81,41 +81,41 @@ cdef void update_center_dense( int n_features = centers_old.shape[1] floating alpha, tmp int n_indices - int j, k, idx + int k, sample_idx, feature_idx floating wsum = 0 # indices = np.where(labels == i)[0] k = 0 - for j in range(n_samples): - if labels[j] == cluster_idx: - indices[k] = j - wsum += sample_weight[j] + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] k += 1 n_indices = k if wsum > 0: # Undo the previous count-based scaling for this cluster center - for k in range(n_features): - centers_new[cluster_idx, k] = centers_old[cluster_idx, k] * weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] # Update cluster with new point members - for j in range(n_indices): - idx = indices[j] - for k in range(n_features): - centers_new[cluster_idx, k] += X[idx * n_features + k] * sample_weight[idx] + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] += X[sample_idx * n_features + feature_idx] * sample_weight[sample_idx] # Update the count statistics for this center weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) alpha = 1 / weight_sums[cluster_idx] - for k in range(n_features): - centers_new[cluster_idx, k] *= alpha + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha else: # No sample was assigned to this cluster in this batch of data - for k in range(n_features): - centers_new[cluster_idx, k] = centers_old[cluster_idx, k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] def _minibatch_update_sparse( From 24a267fb8317ba55bf198ffc6dd2ba9c64e583fe Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:49:21 +0200 Subject: [PATCH 44/72] random_reassign --- sklearn/cluster/_kmeans.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index e140549ed99ff..8c6f119c398bc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1648,7 +1648,8 @@ def fit(self, X, y=None, sample_weight=None): # the choice is done as a function of the iteration index, and # the minimum number of counts, in order to force this # reassignment to happen every once in a while. 
- random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + random_reassign = random_state.randint( + 10 * (1 + self._counts.min())) == 0 # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( From 146a93b96ee1a1a31e76451369978d7b752e3c93 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Thu, 23 Jul 2020 13:23:27 +0200 Subject: [PATCH 45/72] wip --- sklearn/cluster/_k_means_minibatch.pyx | 43 ++++++++++------------ sklearn/cluster/_kmeans.py | 50 ++++---------------------- 2 files changed, 25 insertions(+), 68 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 49af1c7426d0a..60d10c47c320b 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -7,9 +7,7 @@ cimport numpy as np from cython cimport floating from cython.parallel cimport parallel, prange -from libc.math cimport sqrt from libc.stdlib cimport malloc, free -from libc.string cimport memcpy np.import_array() @@ -53,22 +51,23 @@ def _minibatch_update_dense( cdef: int n_samples = X.shape[0] int n_clusters = centers_old.shape[0] - int i + int cluster_idx int *indices with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters, schedule="static"): - update_center_dense(i, &X[0, 0], sample_weight, centers_old, - centers_new, weight_sums, labels, indices) + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_dense(cluster_idx, &X[0, 0], sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) free(indices) cdef void update_center_dense( - int i, + int cluster_idx, floating *X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers_old, # IN @@ -89,36 +88,34 @@ cdef void update_center_dense( # indices = np.where(labels == i)[0] k = 0 for j in range(n_samples): - if labels[j] == i: + if labels[j] == cluster_idx: indices[k] = j + wsum += sample_weight[j] k += 1 n_indices = k - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - if wsum > 0: - # Remove previous count scaling + # Undo the previous count-based scaling for this cluster center for k in range(n_features): - centers_new[i, k] = centers_old[i, k] * weight_sums[i] + centers_new[cluster_idx, k] = centers_old[cluster_idx, k] * weight_sums[cluster_idx] # Update cluster with new point members for j in range(n_indices): idx = indices[j] for k in range(n_features): - centers_new[i, k] += X[idx * n_features + k] * sample_weight[idx] + centers_new[cluster_idx, k] += X[idx * n_features + k] * sample_weight[idx] # Update the count statistics for this center - weight_sums[i] += wsum + weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] + alpha = 1 / weight_sums[cluster_idx] for k in range(n_features): - centers_new[i, k] *= alpha + centers_new[cluster_idx, k] *= alpha else: + # No sample was assigned to this cluster in this batch of data for k in range(n_features): - centers_new[i, k] = centers_old[i, k] + centers_new[cluster_idx, k] = centers_old[cluster_idx, k] def _minibatch_update_sparse( @@ -203,15 +200,12 @@ cdef void update_center_sparse( for j in range(n_samples): if labels[j] == i: indices[k] = j + wsum += sample_weight[j] k += 1 n_indices = k - for j in range(n_indices): - idx = indices[j] - wsum += sample_weight[idx] - if wsum > 0: - # Remove previous count scaling + # Undo the previous count-based scaling for this cluster center: for k in 
range(n_features): centers_new[i, k] = centers_old[i, k] * weight_sums[i] @@ -229,5 +223,6 @@ cdef void update_center_sparse( for k in range(n_features): centers_new[i, k] *= alpha else: + # No sample was assigned to this cluster in this batch of data for k in range(n_features): centers_new[i, k] = centers_old[i, k] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 04d323f2258ac..b85d2a34a314e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -24,7 +24,6 @@ from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import _deprecate_positional_args from ..utils import check_array -from ..utils import gen_batches from ..utils import check_random_state from ..utils import deprecated from ..utils.validation import check_is_fitted, _check_sample_weight @@ -1476,45 +1475,6 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") - def _labels_inertia_minibatch(self, X, sample_weight, x_squared_norms, - centers): - """Compute labels and inertia using mini batches. - - This is slightly slower than doing everything at once but preventes - memory errors / segfaults. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data. - - sample_weight : ndarray of shape (n_samples,) - The weights for each observation in X. - - x_squared_norms : ndarray of shape (n_samples,) - Precomputed squared euclidean norm of each data point, to speed up - computations. - - centers : ndarray of shape (n_clusters, n_features) - The cluster centers. - - Returns - ------- - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - inertia : float - Sum of squared distances of points to nearest cluster. - """ - if self.verbose: - print('Computing label assignment and total inertia') - slices = gen_batches(X.shape[0], self.batch_size) - results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - centers, n_threads=self._n_threads) - for s in slices] - labels, inertia = zip(*results) - return np.hstack(labels), np.sum(inertia) - def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, centers_squared_diff, batch_inertia): """Helper function to encapsulate the early stopping logic""" @@ -1708,8 +1668,9 @@ def fit(self, X, y=None, sample_weight=None): self.n_iter_ = i + 1 if self.compute_labels: - self.labels_, self.inertia_ = self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_) + self.labels_, self.inertia_ = _labels_inertia( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) return self @@ -1819,8 +1780,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return self._labels_inertia_minibatch( - X, sample_weight, x_squared_norms, self.cluster_centers_)[0] + return _labels_inertia( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads)[0] def _more_tags(self): return { From 412864f256a77d1366f6baa4c88d617cadc8f2d9 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:06:15 +0200 Subject: [PATCH 46/72] threadpool-limit protection --- sklearn/cluster/_kmeans.py | 131 +++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 57 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index b85d2a34a314e..e140549ed99ff 100644 --- a/sklearn/cluster/_kmeans.py +++ 
b/sklearn/cluster/_kmeans.py @@ -573,6 +573,16 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, return labels, inertia +def _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, + centers, n_threads=None): + """Same as _labels_inertia but in a threadpool_limits context.""" + with threadpool_limits(limits=1, user_api="blas"): + labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, + centers, n_threads) + + return labels, inertia + + class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): """K-Means clustering. @@ -1102,8 +1112,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[0] + return _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[0] def score(self, X, y=None, sample_weight=None): """Opposite of the value of X on the K-means objective. @@ -1131,8 +1142,9 @@ def score(self, X, y=None, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return -_labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[1] + return -_labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[1] def _more_tags(self): return { @@ -1600,7 +1612,7 @@ def fit(self, X, y=None, sample_weight=None): random_state=random_state, init_size=self._init_size) # Compute inertia on a validation set. - _, inertia = _labels_inertia( + _, inertia = _labels_inertia_threadpool_limit( X_valid, sample_weight_valid, x_squared_norms_valid, cluster_centers, n_threads=self._n_threads) @@ -1625,50 +1637,52 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - # Perform the iterative optimization until convergence - for i in range(n_iter): - # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint(0, n_samples, - self.batch_size) - - # Randomly choose whether to perform random reassignment: - # the choice is done as a function of the iteration index, and the - # minimum number of counts, in order to force this reassignment to - # happen every once in a while. 
- random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 - - # Perform the actual update step on the minibatch data - batch_inertia = _mini_batch_step( - X=X[minibatch_indices], - x_squared_norms=x_squared_norms[minibatch_indices], - sample_weight=sample_weight[minibatch_indices], - centers=centers, - centers_new=centers_new, - weight_sums=self._counts, - random_state=random_state, - random_reassign=random_reassign, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose, - n_threads=self._n_threads) - - if self._tol > 0.0: - centers_squared_diff = np.sum((centers_new - centers)**2) - else: - centers_squared_diff = 0 - - centers, centers_new = centers_new, centers - - # Monitor convergence and do early stopping if necessary - if self._mini_batch_convergence( - i, n_iter, n_samples, centers_squared_diff, batch_inertia): - break + with threadpool_limits(limits=1, user_api="blas"): + # Perform the iterative optimization until convergence + for i in range(n_iter): + # Sample a minibatch from the full dataset + minibatch_indices = random_state.randint(0, n_samples, + self.batch_size) + + # Randomly choose whether to perform random reassignment: + # the choice is done as a function of the iteration index, and + # the minimum number of counts, in order to force this + # reassignment to happen every once in a while. + random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + + # Perform the actual update step on the minibatch data + batch_inertia = _mini_batch_step( + X=X[minibatch_indices], + x_squared_norms=x_squared_norms[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + centers=centers, + centers_new=centers_new, + weight_sums=self._counts, + random_state=random_state, + random_reassign=random_reassign, + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) + + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers)**2) + else: + centers_squared_diff = 0 + + centers, centers_new = centers_new, centers + + # Monitor convergence and do early stopping if necessary + if self._mini_batch_convergence( + i, n_iter, n_samples, centers_squared_diff, + batch_inertia): + break self.cluster_centers_ = centers self.n_iter_ = i + 1 if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) @@ -1734,20 +1748,21 @@ def partial_fit(self, X, y=None, sample_weight=None): random_reassign = self._random_state.randint( 10 * (1 + self._counts.min())) == 0 - _mini_batch_step(X, - x_squared_norms=x_squared_norms, - sample_weight=sample_weight, - centers=self.cluster_centers_, - centers_new=self.cluster_centers_, - weight_sums=self._counts, - random_state=self._random_state, - random_reassign=random_reassign, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose, - n_threads=self._n_threads) + with threadpool_limits(limits=1, user_api="blas"): + _mini_batch_step(X, + x_squared_norms=x_squared_norms, + sample_weight=sample_weight, + centers=self.cluster_centers_, + centers_new=self.cluster_centers_, + weight_sums=self._counts, + random_state=self._random_state, + random_reassign=random_reassign, + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( 
X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) @@ -1780,9 +1795,11 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia( + labels, _ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, - n_threads=self._n_threads)[0] + n_threads=self._n_threads) + + return labels def _more_tags(self): return { From 421a0410993cbce4a4e2c8d80813c5af05b90d59 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:36:45 +0200 Subject: [PATCH 47/72] idx --- sklearn/cluster/_k_means_minibatch.pyx | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 60d10c47c320b..6476336a4078b 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -81,41 +81,41 @@ cdef void update_center_dense( int n_features = centers_old.shape[1] floating alpha, tmp int n_indices - int j, k, idx + int k, sample_idx, feature_idx floating wsum = 0 # indices = np.where(labels == i)[0] k = 0 - for j in range(n_samples): - if labels[j] == cluster_idx: - indices[k] = j - wsum += sample_weight[j] + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] k += 1 n_indices = k if wsum > 0: # Undo the previous count-based scaling for this cluster center - for k in range(n_features): - centers_new[cluster_idx, k] = centers_old[cluster_idx, k] * weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] # Update cluster with new point members - for j in range(n_indices): - idx = indices[j] - for k in range(n_features): - centers_new[cluster_idx, k] += X[idx * n_features + k] * sample_weight[idx] + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] += X[sample_idx * n_features + feature_idx] * sample_weight[sample_idx] # Update the count statistics for this center weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) alpha = 1 / weight_sums[cluster_idx] - for k in range(n_features): - centers_new[cluster_idx, k] *= alpha + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha else: # No sample was assigned to this cluster in this batch of data - for k in range(n_features): - centers_new[cluster_idx, k] = centers_old[cluster_idx, k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] def _minibatch_update_sparse( From a6862f8817d5fd7a08a05cfdf8c971b71350184c Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 24 Jul 2020 14:49:21 +0200 Subject: [PATCH 48/72] random_reassign --- sklearn/cluster/_kmeans.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index e140549ed99ff..8c6f119c398bc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1648,7 +1648,8 @@ def fit(self, X, y=None, sample_weight=None): # the choice is done as a function of the iteration index, and # the minimum number of counts, in order to force this # reassignment to happen every once in a while. 
- random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + random_reassign = random_state.randint( + 10 * (1 + self._counts.min())) == 0 # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( From 355627dd166a2316b1d07fcb64a7381c36b868a5 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 28 Jul 2020 12:09:49 +0200 Subject: [PATCH 49/72] wip --- sklearn/cluster/_kmeans.py | 19 ++++++++++++++++--- sklearn/cluster/tests/test_k_means.py | 7 ++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 8c6f119c398bc..9c6e88ef3201e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1419,7 +1419,7 @@ class MiniBatchKMeans(KMeans): def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, batch_size=100, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, - init_size=None, n_init=3, reassignment_ratio=0.01): + init_size=None, n_init=3, reassignment_ratio=0.01, mode=0): super().__init__( n_clusters=n_clusters, init=init, max_iter=max_iter, @@ -1430,6 +1430,7 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, self.compute_labels = compute_labels self.init_size = init_size self.reassignment_ratio = reassignment_ratio + self.mode = mode @deprecated("The attribute 'counts_' is deprecated in 0.24" # type: ignore " and will be removed in 0.26.") @@ -1648,8 +1649,15 @@ def fit(self, X, y=None, sample_weight=None): # the choice is done as a function of the iteration index, and # the minimum number of counts, in order to force this # reassignment to happen every once in a while. - random_reassign = random_state.randint( - 10 * (1 + self._counts.min())) == 0 + + if self.mode == 0: + random_reassign = random_state.randint( + 10 * (1 + self._counts.min())) == 0 + elif self.mode == 1: + random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 + elif self.mode == 2: + random_reassign = ((i >= 10) * + random_state.choice([0, 1], p=[0.1, 0.9])) # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( @@ -1672,6 +1680,11 @@ def fit(self, X, y=None, sample_weight=None): centers, centers_new = centers_new, centers + _, inertiaa = _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, centers, + n_threads=self._n_threads) + print(f"{inertiaa},") + # Monitor convergence and do early stopping if necessary if self._mini_batch_convergence( i, n_iter, n_samples, centers_squared_diff, diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 93eb1a9679c33..0dbbed97ccfac 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -708,12 +708,13 @@ def test_unit_weights_vs_no_weights(Estimator, data): def test_scaled_weights(Estimator, data): # Check that scaling all sample weights by a common factor # shouldn't change the result - sample_weight = np.random.uniform(n_samples) + data = np.random.random_sample((100000, 10)) + sample_weight = np.random.RandomState(0).uniform(n_samples) - km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1, max_no_improvement=None, init='random') km_orig = clone(km).fit(data, sample_weight=sample_weight) km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) - + assert False assert_array_equal(km_orig.labels_, km_scaled.labels_) assert_allclose(km_orig.cluster_centers_, 
km_scaled.cluster_centers_) From d5a8935c90939ed34650c49907eb0f532ae3bf80 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Wed, 29 Jul 2020 13:13:36 +0200 Subject: [PATCH 50/72] wip --- sklearn/cluster/_kmeans.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 9c6e88ef3201e..fcc7b524e7498 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1638,6 +1638,8 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) + n_samples_seen_since_last_reassign = 0 + with threadpool_limits(limits=1, user_api="blas"): # Perform the iterative optimization until convergence for i in range(n_iter): @@ -1649,7 +1651,7 @@ def fit(self, X, y=None, sample_weight=None): # the choice is done as a function of the iteration index, and # the minimum number of counts, in order to force this # reassignment to happen every once in a while. - + if self.mode == 0: random_reassign = random_state.randint( 10 * (1 + self._counts.min())) == 0 @@ -1657,7 +1659,16 @@ def fit(self, X, y=None, sample_weight=None): random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 elif self.mode == 2: random_reassign = ((i >= 10) * - random_state.choice([0, 1], p=[0.1, 0.9])) + random_state.choice([0, 1], p=[0.9, 0.1])) + elif self.mode == 3: + random_reassign = (i >= 10) * True + elif self.mode == 4: + random_reassign = True + elif isinstance(self.mode, tuple): + n_samples_seen_since_last_reassign += self.batch_size + random_reassign = n_samples_seen_since_last_reassign >= (self.mode[0] * self.n_clusters) + if random_reassign: + n_samples_seen_since_last_reassign = 0 # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( From 4d29bc312140109f267a8d75777f063382933bb5 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 15:37:06 +0200 Subject: [PATCH 51/72] wip --- sklearn/cluster/_kmeans.py | 54 +++++++++++--------------------------- 1 file changed, 16 insertions(+), 38 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index fcc7b524e7498..4b92b9b123c5e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1551,6 +1551,18 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, self._no_improvement = no_improvement return False + def _random_reassign(self): + """Check if a random reassignment needs to be done. + + Do random reassignments each time 10 * n_clusters samples have been + processed. + """ + self._n_since_last_reassign += self.batch_size + if self._n_since_last_reassign >= (10 * self.n_clusters): + self._n_since_last_reassign = 0 + return True + return False + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. 
@@ -1638,7 +1650,7 @@ def fit(self, X, y=None, sample_weight=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) n_iter = int(self.max_iter * n_batches) - n_samples_seen_since_last_reassign = 0 + self._n_since_last_reassign = 0 with threadpool_limits(limits=1, user_api="blas"): # Perform the iterative optimization until convergence @@ -1647,29 +1659,6 @@ def fit(self, X, y=None, sample_weight=None): minibatch_indices = random_state.randint(0, n_samples, self.batch_size) - # Randomly choose whether to perform random reassignment: - # the choice is done as a function of the iteration index, and - # the minimum number of counts, in order to force this - # reassignment to happen every once in a while. - - if self.mode == 0: - random_reassign = random_state.randint( - 10 * (1 + self._counts.min())) == 0 - elif self.mode == 1: - random_reassign = (i + 1) % (10 + int(self._counts.min())) == 0 - elif self.mode == 2: - random_reassign = ((i >= 10) * - random_state.choice([0, 1], p=[0.9, 0.1])) - elif self.mode == 3: - random_reassign = (i >= 10) * True - elif self.mode == 4: - random_reassign = True - elif isinstance(self.mode, tuple): - n_samples_seen_since_last_reassign += self.batch_size - random_reassign = n_samples_seen_since_last_reassign >= (self.mode[0] * self.n_clusters) - if random_reassign: - n_samples_seen_since_last_reassign = 0 - # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( X=X[minibatch_indices], @@ -1679,7 +1668,7 @@ def fit(self, X, y=None, sample_weight=None): centers_new=centers_new, weight_sums=self._counts, random_state=random_state, - random_reassign=random_reassign, + random_reassign=self._random_reassign(), reassignment_ratio=self.reassignment_ratio, verbose=self.verbose, n_threads=self._n_threads) @@ -1691,11 +1680,6 @@ def fit(self, X, y=None, sample_weight=None): centers, centers_new = centers_new, centers - _, inertiaa = _labels_inertia_threadpool_limit( - X, sample_weight, x_squared_norms, centers, - n_threads=self._n_threads) - print(f"{inertiaa},") - # Monitor convergence and do early stopping if necessary if self._mini_batch_convergence( i, n_iter, n_samples, centers_squared_diff, @@ -1765,13 +1749,7 @@ def partial_fit(self, X, y=None, sample_weight=None): # Initialize counts self._counts = np.zeros(self.n_clusters, dtype=X.dtype) - random_reassign = False - else: - # The lower the minimum count is, the more we do random - # reassignment, however, we don't want to do random - # reassignment too often, to allow for building up counts - random_reassign = self._random_state.randint( - 10 * (1 + self._counts.min())) == 0 + self._n_since_last_reassign = 0 with threadpool_limits(limits=1, user_api="blas"): _mini_batch_step(X, @@ -1781,7 +1759,7 @@ def partial_fit(self, X, y=None, sample_weight=None): centers_new=self.cluster_centers_, weight_sums=self._counts, random_state=self._random_state, - random_reassign=random_reassign, + random_reassign=self._random_reassign(), reassignment_ratio=self.reassignment_ratio, verbose=self.verbose, n_threads=self._n_threads) From de3180e4ea4b3aba2b633831a3e7ddf6a07209da Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 16:10:05 +0200 Subject: [PATCH 52/72] wip --- sklearn/cluster/_kmeans.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 2703dfc8fc4ea..5fd5d3c566c85 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1577,9 +1577,12 @@ def 
_random_reassign(self): Do random reassignments each time 10 * n_clusters samples have been processed. + + If there are empty clusters we always want to reassign. """ self._n_since_last_reassign += self.batch_size - if self._n_since_last_reassign >= (10 * self.n_clusters): + if ((self._counts == 0).any() or + self._n_since_last_reassign >= (10 * self.n_clusters)): self._n_since_last_reassign = 0 return True return False From a3b55b781e0f29f8bbf7d45bbc5f5a30534f29fe Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 16:11:11 +0200 Subject: [PATCH 53/72] ellipsis --- sklearn/cluster/_kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 5fd5d3c566c85..5e244f2340fba 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1431,8 +1431,8 @@ class MiniBatchKMeans(KMeans): ... batch_size=6, ... max_iter=10).fit(X) >>> kmeans.cluster_centers_ - array([[2.32394366, 1.16901408], - [3.4 , 4.36 ]]) + array([[2.3..., 1.1...], + [3.4..., 4.3...]]) >>> kmeans.predict([[0, 0], [4, 4]]) array([0, 1], dtype=int32) """ From 4d06cc282a5c8d57aaaf14b9236e77fa5ed76cef Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 16:19:33 +0200 Subject: [PATCH 54/72] idx --- sklearn/cluster/_k_means_minibatch.pyx | 46 +++++++++++++------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 6476336a4078b..942c058b07a92 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -159,23 +159,23 @@ def _minibatch_update_sparse( int[::1] X_indptr = X.indptr int n_samples = X.shape[0] int n_clusters = centers_old.shape[0] - int i + int cluster_idx int *indices with nogil, parallel(num_threads=n_threads): indices = malloc(n_samples * sizeof(int)) - for i in prange(n_clusters, schedule="static"): - update_center_sparse(i, X_data, X_indices, X_indptr, sample_weight, - centers_old, centers_new, weight_sums, labels, - indices) + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_sparse(cluster_idx, X_data, X_indices, X_indptr, + sample_weight, centers_old, centers_new, + weight_sums, labels, indices) free(indices) cdef void update_center_sparse( - int i, + int cluster_idx, floating[::1] X_data, # IN int[::1] X_indices, # IN int[::1] X_indptr, # IN @@ -191,38 +191,38 @@ cdef void update_center_sparse( int n_features = centers_old.shape[1] floating alpha, tmp int n_indices - int j, k, idx + int k, sample_idx, feature_idx floating wsum = 0 # indices = np.where(labels == i)[0] k = 0 - for j in range(n_samples): - if labels[j] == i: - indices[k] = j - wsum += sample_weight[j] + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] k += 1 n_indices = k if wsum > 0: # Undo the previous count-based scaling for this cluster center: - for k in range(n_features): - centers_new[i, k] = centers_old[i, k] * weight_sums[i] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] # Update cluster with new point members - for j in range(n_indices): - idx = indices[j] - for k in range(X_indptr[idx], X_indptr[idx + 1]): - centers_new[i, X_indices[k]] += X_data[k] * sample_weight[idx] + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in 
range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): + centers_new[cluster_idx, X_indices[feature_idx]] += X_data[feature_idx] * sample_weight[sample_idx] # Update the count statistics for this center - weight_sums[i] += wsum + weight_sums[cluster_idx] += wsum # Rescale to compute mean of all points (old and new) - alpha = 1 / weight_sums[i] - for k in range(n_features): - centers_new[i, k] *= alpha + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha else: # No sample was assigned to this cluster in this batch of data - for k in range(n_features): - centers_new[i, k] = centers_old[i, k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] From d354434753c9f1cc5037eeab0902df997c216bae Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 16:34:54 +0200 Subject: [PATCH 55/72] wip --- sklearn/cluster/_kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 1ed631bc8e9ba..fa38f1762471c 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1431,8 +1431,8 @@ class MiniBatchKMeans(KMeans): ... batch_size=6, ... max_iter=10).fit(X) >>> kmeans.cluster_centers_ - array([[2.3..., 1.1...], - [3.4..., 4.3...]]) + array([[1.19..., 1.22...], + [4.03..., 2.46...]]) >>> kmeans.predict([[0, 0], [4, 4]]) array([0, 1], dtype=int32) """ From c6a0456bd0c0bd31693ae370bb05ed05c05460bc Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 31 Jul 2020 17:43:56 +0200 Subject: [PATCH 56/72] avoid calling openmp_effective_n_threads again --- sklearn/cluster/_kmeans.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index fa38f1762471c..fb1020f59a851 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -537,7 +537,7 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, def _labels_inertia(X, sample_weight, x_squared_norms, centers, - n_threads=None): + n_threads=1): """E step of the K-means EM algorithm. Compute the labels and the inertia of the given samples and centers. @@ -558,7 +558,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, centers : ndarray of shape (n_clusters, n_features) The cluster centers. - n_threads : int, default=None + n_threads : int, default=1 The number of OpenMP threads to use for the computation. Parallelism is sample-wise on the main cython loop which assigns each sample to its closest center. 
@@ -574,8 +574,6 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_samples = X.shape[0] n_clusters = centers.shape[0] - n_threads = _openmp_effective_n_threads(n_threads) - labels = np.full(n_samples, -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) center_shift = np.zeros_like(weight_in_clusters) @@ -597,7 +595,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, def _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, - centers, n_threads=None): + centers, n_threads=1): """Same as _labels_inertia but in a threadpool_limits context.""" with threadpool_limits(limits=1, user_api="blas"): labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, From 9c9303738ebc23743d2b34181bb91f0b372e7288 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 4 Aug 2020 18:03:30 +0200 Subject: [PATCH 57/72] cln --- sklearn/cluster/tests/test_k_means.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index ad2d592667b05..6fc2f05d20071 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -793,7 +793,6 @@ def test_unit_weights_vs_no_weights(Estimator, data): def test_scaled_weights(Estimator, data): # Check that scaling all sample weights by a common factor # shouldn't change the result - data = np.random.random_sample((100000, 10)) sample_weight = np.random.RandomState(0).uniform(n_samples) km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) From d263d308e67721ea606661c9e3149bfab17d5e23 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 23 Oct 2020 17:08:35 +0200 Subject: [PATCH 58/72] fix merging mistake --- sklearn/cluster/tests/test_k_means.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index f4b2497bb0bcd..6fc2f05d20071 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -551,6 +551,7 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr): # re-predict labels for training set using fit_predict pred = km.fit_predict(X) + assert_allclose(v_measure_score(pred, labels), 1) # predict centroid labels pred = km.predict(km.cluster_centers_) From b14492aafe4e82f7ea789dcc9e0ca080f7e6f9ed Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 3 Nov 2020 15:10:27 +0100 Subject: [PATCH 59/72] merge master --- sklearn/cluster/_k_means_elkan.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index eaa37fc513291..4fa9f61d54646 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -18,7 +18,7 @@ from libc.stdlib cimport calloc, free from libc.string cimport memset, memcpy from ..utils.extmath import row_norms -from ._k_means_fast import CHUNK_SIZE +from ._k_means_common import CHUNK_SIZE from ._k_means_common cimport _relocate_empty_clusters_dense from ._k_means_common cimport _relocate_empty_clusters_sparse from ._k_means_common cimport _euclidean_dense_dense From a3e1b11e80c58b3c5ea51ce8ff20268af7e3fda3 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 3 Nov 2020 15:21:09 +0100 Subject: [PATCH 60/72] change batch_size default --- sklearn/cluster/_kmeans.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 
d980c839cafdc..504dbacfbf231 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1342,8 +1342,12 @@ class MiniBatchKMeans(KMeans): Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. - batch_size : int, default=100 + batch_size : int, default=1024 Size of the mini batches. + For faster compuations, you can set the ``batch_size`` greater than + 256 * number of cores to enable parallelism on all cores. + + .. versionchanged:: XXX verbose : int, default=0 Verbosity mode. From 10695c65a781f13a85a521c345dbde721b81c936 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 3 Nov 2020 17:24:00 +0100 Subject: [PATCH 61/72] actually change batch size --- sklearn/cluster/_kmeans.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 504dbacfbf231..a7fc462851310 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1472,7 +1472,7 @@ class MiniBatchKMeans(KMeans): """ @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, - batch_size=100, verbose=0, compute_labels=True, + batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01, mode=0): @@ -1518,6 +1518,7 @@ def _check_params(self, X): if self.batch_size <= 0: raise ValueError( f"batch_size should be > 0, got {self.batch_size} instead.") + self._batch_size = min(self.batch_size, X.shape[0]) # init_size if self.init_size is not None and self.init_size <= 0: @@ -1525,7 +1526,7 @@ def _check_params(self, X): f"init_size should be > 0, got {self.init_size} instead.") self._init_size = self.init_size if self._init_size is None: - self._init_size = 3 * self.batch_size + self._init_size = 3 * self._batch_size if self._init_size < self.n_clusters: self._init_size = 3 * self.n_clusters elif self._init_size < self.n_clusters: @@ -1548,7 +1549,7 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, """Helper function to encapsulate the early stopping logic""" # Normalize inertia to be able to compare values when # batch_size changes - batch_inertia /= self.batch_size + batch_inertia /= self._batch_size # Ignore first iteration because it's inertia from initialization. if iteration_idx == 0: @@ -1564,7 +1565,7 @@ def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, if ewa_inertia is None: ewa_inertia = batch_inertia else: - alpha = self.batch_size * 2.0 / (n_samples + 1) + alpha = self._batch_size * 2.0 / (n_samples + 1) alpha = min(alpha, 1) ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha @@ -1614,7 +1615,7 @@ def _random_reassign(self): If there are empty clusters we always want to reassign. 
""" - self._n_since_last_reassign += self.batch_size + self._n_since_last_reassign += self._batch_size if ((self._counts == 0).any() or self._n_since_last_reassign >= (10 * self.n_clusters)): self._n_since_last_reassign = 0 @@ -1661,7 +1662,7 @@ def fit(self, X, y=None, sample_weight=None): init = check_array(init, dtype=X.dtype, copy=True, order='C') self._validate_center_shape(X, init) - self._check_mkl_vcomp(X, self.batch_size) + self._check_mkl_vcomp(X, self._batch_size) # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) @@ -1711,7 +1712,7 @@ def fit(self, X, y=None, sample_weight=None): # Initialize number of samples seen since last reassignment self._n_since_last_reassign = 0 - n_batches = int(np.ceil(float(n_samples) / self.batch_size)) + n_batches = int(np.ceil(float(n_samples) / self._batch_size)) n_iter = int(self.max_iter * n_batches) with threadpool_limits(limits=1, user_api="blas"): @@ -1719,7 +1720,7 @@ def fit(self, X, y=None, sample_weight=None): for i in range(n_iter): # Sample a minibatch from the full dataset minibatch_indices = random_state.randint(0, n_samples, - self.batch_size) + self._batch_size) # Perform the actual update step on the minibatch data batch_inertia = _mini_batch_step( From e4e15f5c9727d7316914a35a375d7be0156e8080 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 13 Nov 2020 12:53:04 +0100 Subject: [PATCH 62/72] reassignment_ratio docstring --- sklearn/cluster/_kmeans.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 61798c71b4638..48d5c9328f9fc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1492,11 +1492,12 @@ class MiniBatchKMeans(KMeans): best of the ``n_init`` initializations as measured by inertia. reassignment_ratio : float, default=0.01 - Control the fraction of the maximum number of counts for a - center to be reassigned. A higher value means that low count - centers are more easily reassigned, which means that the - model will take longer to converge, but should converge in a - better clustering. + Control the fraction of the maximum number of counts for a center to + be reassigned. A higher value means that low count centers are more + easily reassigned, which means that the model will take longer to + converge, but should converge in a better clustering. A too high value + may however cause convergence issues, especially with a small batch + size. Attributes ---------- From 5f4f065dc7765ef83396ced797c0af253462a915 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 28 Jan 2021 16:01:58 +0100 Subject: [PATCH 63/72] cln --- sklearn/cluster/_kmeans.py | 89 ++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 487e67c8d35b1..6ea0b5fd6b421 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1440,7 +1440,8 @@ class MiniBatchKMeans(KMeans): For faster compuations, you can set the ``batch_size`` greater than 256 * number of cores to enable parallelism on all cores. - .. versionchanged:: XXX + .. versionchanged:: 1.0 + `batch_size` default changed from 100 to 1024. verbose : int, default=0 Verbosity mode. @@ -1510,7 +1511,12 @@ class MiniBatchKMeans(KMeans): center, weighted by the sample weights if provided. n_iter_ : int - Number of batches processed. + Number of iterations over the full dataset. 
+ + n_steps_ : int + Number of minibatches processed. + + .. versionadded:: 1.0 counts_ : ndarray of shape (n_clusters,) Weigth sum of each cluster. @@ -1570,7 +1576,7 @@ class MiniBatchKMeans(KMeans): def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, - init_size=None, n_init=3, reassignment_ratio=0.01, mode=0): + init_size=None, n_init=3, reassignment_ratio=0.01): super().__init__( n_clusters=n_clusters, init=init, max_iter=max_iter, @@ -1581,7 +1587,6 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, self.compute_labels = compute_labels self.init_size = init_size self.reassignment_ratio = reassignment_ratio - self.mode = mode @deprecated("The attribute 'counts_' is deprecated in 0.24" # type: ignore " and will be removed in 1.1 (renaming of 0.26).") @@ -1640,67 +1645,63 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") - def _mini_batch_convergence(self, iteration_idx, n_iter, n_samples, + def _mini_batch_convergence(self, step, n_steps, n_samples, centers_squared_diff, batch_inertia): """Helper function to encapsulate the early stopping logic""" # Normalize inertia to be able to compare values when # batch_size changes batch_inertia /= self._batch_size + # count steps starting from 1 for user friendly verbose mode. + step = step + 1 + # Ignore first iteration because it's inertia from initialization. - if iteration_idx == 0: + if step == 1: if self.verbose: - print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " - f"mean batch inertia: {batch_inertia}") + print(f"Minibatch step {step}/{n_steps}: mean batch " + f"inertia: {batch_inertia}") return False # Compute an Exponentially Weighted Average of the inertia to # monitor the convergence while discarding minibatch-local stochastic # variability: https://en.wikipedia.org/wiki/Moving_average - ewa_inertia = self._ewa_inertia - if ewa_inertia is None: - ewa_inertia = batch_inertia + if self._ewa_inertia is None: + self._ewa_inertia = batch_inertia else: alpha = self._batch_size * 2.0 / (n_samples + 1) alpha = min(alpha, 1) - ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha + self._ewa_inertia = ( + self._ewa_inertia * (1 - alpha) + batch_inertia * alpha) # Log progress to be able to monitor convergence if self.verbose: - print(f"Minibatch iteration {iteration_idx + 1}/{n_iter}: " - f"mean batch inertia: {batch_inertia}, ewa inertia: " - f"{ewa_inertia}") + print(f"Minibatch step {step}/{n_steps}: mean batch inertia: " + f"{batch_inertia}, ewa inertia: {self._ewa_inertia}") # Early stopping based on absolute tolerance on squared change of - # centers position (using EWA smoothing) + # centers position if self._tol > 0.0 and centers_squared_diff <= self._tol: if self.verbose: - print(f"Converged (small centers change) at iteration " - f"{iteration_idx + 1}/{n_iter}") + print(f"Converged (small centers change) at step " + f"{step}/{n_steps}") return True # Early stopping heuristic due to lack of improvement on smoothed # inertia - ewa_inertia_min = self._ewa_inertia_min - no_improvement = self._no_improvement - if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: - no_improvement = 0 - ewa_inertia_min = ewa_inertia + if (self._ewa_inertia_min is None or + self._ewa_inertia < self._ewa_inertia_min): + self._no_improvement = 0 + self._ewa_inertia_min = self._ewa_inertia else: - no_improvement += 1 + 
self._no_improvement += 1 if (self.max_no_improvement is not None - and no_improvement >= self.max_no_improvement): + and self._no_improvement >= self.max_no_improvement): if self.verbose: - print(f"Converged (lack of improvement in inertia) at " - f"iteration {iteration_idx}/{n_iter}") + print(f"Converged (lack of improvement in inertia) at step " + f"{step}/{n_steps}") return True - # update the convergence context to maintain state across successive - # calls: - self._ewa_inertia = ewa_inertia - self._ewa_inertia_min = ewa_inertia_min - self._no_improvement = no_improvement return False def _random_reassign(self): @@ -1770,7 +1771,7 @@ def fit(self, X, y=None, sample_weight=None): sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] - # perform several inits with random sub-sets + # perform several inits with random subsets best_inertia = None for init_idx in range(self._n_init): if self.verbose: @@ -1808,12 +1809,12 @@ def fit(self, X, y=None, sample_weight=None): # Initialize number of samples seen since last reassignment self._n_since_last_reassign = 0 - n_batches = int(np.ceil(float(n_samples) / self._batch_size)) - n_iter = int(self.max_iter * n_batches) + n_steps_per_epoch = int(np.ceil(n_samples / self._batch_size)) + n_steps = self.max_iter * n_steps_per_epoch with threadpool_limits(limits=1, user_api="blas"): # Perform the iterative optimization until convergence - for i in range(n_iter): + for i in range(n_steps): # Sample a minibatch from the full dataset minibatch_indices = random_state.randint(0, n_samples, self._batch_size) @@ -1841,13 +1842,14 @@ def fit(self, X, y=None, sample_weight=None): # Monitor convergence and do early stopping if necessary if self._mini_batch_convergence( - i, n_iter, n_samples, centers_squared_diff, + i, n_steps, n_samples, centers_squared_diff, batch_inertia): break self.cluster_centers_ = centers - self.n_iter_ = i + 1 + self.n_steps_ = i + 1 + self.n_iter_ = (i + 1) // n_steps_per_epoch if self.compute_labels: self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( @@ -1876,22 +1878,23 @@ def partial_fit(self, X, y=None, sample_weight=None): ------- self """ - is_first_call_to_partial_fit = not hasattr(self, 'cluster_centers_') + has_centers = hasattr(self, 'cluster_centers_') X = self._validate_data(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', accept_large_sparse=False, - reset=is_first_call_to_partial_fit) + reset=not has_centers) self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self.n_steps_ = getattr(self, "n_steps_", 0) # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) - if is_first_call_to_partial_fit: - # this is the first call to partial_fit on this object + if not has_centers: + # this instance has not been fitted yet (fit or partial_fit) self._check_params(X) # Validate init array @@ -1931,6 +1934,8 @@ def partial_fit(self, X, y=None, sample_weight=None): X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) + self.n_steps_ += 1 + return self def predict(self, X, sample_weight=None): From ab4310a579e6ce7895f900e903037f684f35e1e2 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 28 Jan 2021 18:03:51 +0100 Subject: [PATCH 64/72] make n_iter_ count number of started epochs --- sklearn/cluster/_kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 6ea0b5fd6b421..59a4fc2afd529 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1849,7 +1849,7 @@ def fit(self, X, y=None, sample_weight=None): self.cluster_centers_ = centers self.n_steps_ = i + 1 - self.n_iter_ = (i + 1) // n_steps_per_epoch + self.n_iter_ = np.ceil((i + 1) / n_steps_per_epoch) if self.compute_labels: self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( From 5aafed3666da087f84a61ecac6355448f8da51bd Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 5 Feb 2021 12:34:25 +0100 Subject: [PATCH 65/72] improve tests and docs --- sklearn/cluster/_kmeans.py | 7 +++--- sklearn/cluster/tests/test_k_means.py | 31 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 59a4fc2afd529..79d222036394b 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1352,6 +1352,8 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, ------- inertia : float Sum of squared distances of samples to their closest cluster center. + The inertia is computed after finding the labels and before updating + the centers. """ # Perform label assignment to nearest centers labels, inertia = _labels_inertia(X, sample_weight, @@ -1809,8 +1811,7 @@ def fit(self, X, y=None, sample_weight=None): # Initialize number of samples seen since last reassignment self._n_since_last_reassign = 0 - n_steps_per_epoch = int(np.ceil(n_samples / self._batch_size)) - n_steps = self.max_iter * n_steps_per_epoch + n_steps = (self.max_iter * n_samples) // self._batch_size with threadpool_limits(limits=1, user_api="blas"): # Perform the iterative optimization until convergence @@ -1849,7 +1850,7 @@ def fit(self, X, y=None, sample_weight=None): self.cluster_centers_ = centers self.n_steps_ = i + 1 - self.n_iter_ = np.ceil((i + 1) / n_steps_per_epoch) + self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples)) if self.compute_labels: self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index a8b462cb825a9..c0bf79efc85ec 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -467,6 +467,37 @@ def test_minibatch_kmeans_init_size(): assert km._init_size == n_samples +@pytest.mark.parametrize("tol, max_no_improvement", [(1e-4, None), (0, 10)]) +def test_minibatch_declared_convergence(tol, max_no_improvement): + # Check that convergence based on small center change is achievable. + X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) + + km = MiniBatchKMeans(n_clusters=3, init=centers, batch_size=20, tol=tol, + random_state=0, max_iter=10, + max_no_improvement=max_no_improvement) + + km.fit(X) + assert 1 < km.n_iter_ < 10 + + +def test_minibatch_iter_steps(): + # Check consistency of n_iter_ and n_steps_ attributes. 
+ batch_size = 30 + n_samples = X.shape[0] + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, + random_state=0).fit(X) + + # n_iter_ is the number of started epochs + assert km.n_iter_ == np.ceil((km.n_steps_ * batch_size) / n_samples) + + # without stopping condition, max_iter should be reached + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, random_state=0, + tol=0, max_no_improvement=None, max_iter=10).fit(X) + + assert km.n_iter_ == 10 + assert km.n_steps_ == (10 * n_samples) // batch_size + + def test_kmeans_copyx(): # Check that copy_x=False returns nearly equal X after de-centering. my_X = X.copy() From 0a71a92eef2e58a72c7405f8d7dee3357b8781d3 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 15:21:43 +0100 Subject: [PATCH 66/72] don't move kmpp --- sklearn/cluster/_kmeans.py | 187 +++++++++++++++++++------------------ 1 file changed, 94 insertions(+), 93 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 79d222036394b..c77fd14f3faaa 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -47,99 +47,6 @@ ############################################################################### # Initialization heuristic -def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, - random_state=None, n_local_trials=None): - """Init n_clusters seeds according to k-means++ - - .. versionadded:: 0.24 - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data to pick seeds from. - - n_clusters : int - The number of centroids to initialize - - x_squared_norms : array-like of shape (n_samples,), default=None - Squared Euclidean norm of each data point. - - random_state : int or RandomState instance, default=None - Determines random number generation for centroid initialization. Pass - an int for reproducible output across multiple function calls. - See :term:`Glossary `. - - n_local_trials : int, default=None - The number of seeding trials for each center (except the first), - of which the one reducing inertia the most is greedily chosen. - Set to None to make the number of trials depend logarithmically - on the number of seeds (2+log(k)). - - Returns - ------- - centers : ndarray of shape (n_clusters, n_features) - The inital centers for k-means. - - indices : ndarray of shape (n_clusters,) - The index location of the chosen centers in the data array X. For a - given index and center, X[index] = center. - - Notes - ----- - Selects initial cluster centers for k-mean clustering in a smart way - to speed up convergence. see: Arthur, D. and Vassilvitskii, S. - "k-means++: the advantages of careful seeding". ACM-SIAM symposium - on Discrete algorithms. 2007 - - Examples - -------- - - >>> from sklearn.cluster import kmeans_plusplus - >>> import numpy as np - >>> X = np.array([[1, 2], [1, 4], [1, 0], - ... 
[10, 2], [10, 4], [10, 0]]) - >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) - >>> centers - array([[10, 4], - [ 1, 0]]) - >>> indices - array([4, 2]) - """ - - # Check data - check_array(X, accept_sparse='csr', - dtype=[np.float64, np.float32]) - - if X.shape[0] < n_clusters: - raise ValueError(f"n_samples={X.shape[0]} should be >= " - f"n_clusters={n_clusters}.") - - # Check parameters - if x_squared_norms is None: - x_squared_norms = row_norms(X, squared=True) - else: - x_squared_norms = check_array(x_squared_norms, - dtype=X.dtype, - ensure_2d=False) - - if x_squared_norms.shape[0] != X.shape[0]: - raise ValueError( - f"The length of x_squared_norms {x_squared_norms.shape[0]} should " - f"be equal to the length of n_samples {X.shape[0]}.") - - if n_local_trials is not None and n_local_trials < 1: - raise ValueError( - f"n_local_trials is set to {n_local_trials} but should be an " - f"integer value greater than zero.") - - random_state = check_random_state(random_state) - - # Call private k-means++ - centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms, - random_state, n_local_trials) - - return centers, indices - def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): @@ -1979,3 +1886,97 @@ def _more_tags(self): 'zero sample_weight is not equivalent to removing samples', } } + + +def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, + random_state=None, n_local_trials=None): + """Init n_clusters seeds according to k-means++ + + .. versionadded:: 0.24 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to pick seeds from. + + n_clusters : int + The number of centroids to initialize + + x_squared_norms : array-like of shape (n_samples,), default=None + Squared Euclidean norm of each data point. + + random_state : int or RandomState instance, default=None + Determines random number generation for centroid initialization. Pass + an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_local_trials : int, default=None + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)). + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + The inital centers for k-means. + + indices : ndarray of shape (n_clusters,) + The index location of the chosen centers in the data array X. For a + given index and center, X[index] = center. + + Notes + ----- + Selects initial cluster centers for k-mean clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 2007 + + Examples + -------- + + >>> from sklearn.cluster import kmeans_plusplus + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... 
[10, 2], [10, 4], [10, 0]]) + >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) + >>> centers + array([[10, 4], + [ 1, 0]]) + >>> indices + array([4, 2]) + """ + + # Check data + check_array(X, accept_sparse='csr', + dtype=[np.float64, np.float32]) + + if X.shape[0] < n_clusters: + raise ValueError(f"n_samples={X.shape[0]} should be >= " + f"n_clusters={n_clusters}.") + + # Check parameters + if x_squared_norms is None: + x_squared_norms = row_norms(X, squared=True) + else: + x_squared_norms = check_array(x_squared_norms, + dtype=X.dtype, + ensure_2d=False) + + if x_squared_norms.shape[0] != X.shape[0]: + raise ValueError( + f"The length of x_squared_norms {x_squared_norms.shape[0]} should " + f"be equal to the length of n_samples {X.shape[0]}.") + + if n_local_trials is not None and n_local_trials < 1: + raise ValueError( + f"n_local_trials is set to {n_local_trials} but should be an " + f"integer value greater than zero.") + + random_state = check_random_state(random_state) + + # Call private k-means++ + centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms, + random_state, n_local_trials) + + return centers, indices From eacb6cc582b99949d5214b0ff0e3c525c5bbb8ac Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 17:40:36 +0100 Subject: [PATCH 67/72] address comments --- sklearn/cluster/_k_means_minibatch.pyx | 8 ++++---- sklearn/cluster/_kmeans.py | 10 +++++++--- sklearn/cluster/tests/test_k_means.py | 15 ++++++++++++--- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 942c058b07a92..1f52625279aef 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -74,7 +74,7 @@ cdef void update_center_dense( floating[:, ::1] centers_new, # OUT floating[::1] weight_sums, # INOUT int[::1] labels, # IN - int *indices) nogil: # OUT + int *indices) nogil: # TMP """Update of a single center for dense MinibatchKMeans""" cdef: int n_samples = sample_weight.shape[0] @@ -85,7 +85,7 @@ cdef void update_center_dense( floating wsum = 0 - # indices = np.where(labels == i)[0] + # indices = np.where(labels == cluster_idx)[0] k = 0 for sample_idx in range(n_samples): if labels[sample_idx] == cluster_idx: @@ -184,7 +184,7 @@ cdef void update_center_sparse( floating[:, ::1] centers_new, # OUT floating[::1] weight_sums, # INOUT int[::1] labels, # IN - int *indices) nogil: # OUT + int *indices) nogil: # TMP """Update of a single center for sparse MinibatchKMeans""" cdef: int n_samples = sample_weight.shape[0] @@ -195,7 +195,7 @@ cdef void update_center_sparse( floating wsum = 0 - # indices = np.where(labels == i)[0] + # indices = np.where(labels == cluster_idx)[0] k = 0 for sample_idx in range(n_samples): if labels[sample_idx] == cluster_idx: diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index c77fd14f3faaa..7df9c9a2d33b8 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1415,9 +1415,11 @@ class MiniBatchKMeans(KMeans): inertia_ : float The value of the inertia criterion associated with the chosen - partition (if compute_labels is set to True). The inertia is - defined as the sum of square distances of samples to their cluster - center, weighted by the sample weights if provided. + partition if compute_labels is set to True. If compute_labels is set to + False, it's an approximation of the inertia based on an exponentially + weighted average of the batch inertiae. 
+ The inertia is defined as the sum of square distances of samples to + their cluster center, weighted by the sample weights if provided. n_iter_ : int Number of iterations over the full dataset. @@ -1763,6 +1765,8 @@ def fit(self, X, y=None, sample_weight=None): self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads) + else: + self.inertia_ = self._ewa_inertia * n_samples return self diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index c0bf79efc85ec..fccadd68e821c 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -468,17 +468,24 @@ def test_minibatch_kmeans_init_size(): @pytest.mark.parametrize("tol, max_no_improvement", [(1e-4, None), (0, 10)]) -def test_minibatch_declared_convergence(tol, max_no_improvement): - # Check that convergence based on small center change is achievable. +def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): + # Check convergence detection based on ewa batch inertia or on + # small center change. X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) km = MiniBatchKMeans(n_clusters=3, init=centers, batch_size=20, tol=tol, - random_state=0, max_iter=10, + random_state=0, max_iter=10, verbose=1, max_no_improvement=max_no_improvement) km.fit(X) assert 1 < km.n_iter_ < 10 + captured = capsys.readouterr() + if max_no_improvement is None: + assert "Converged (small centers change)" in captured.out + if tol == 0: + assert "Converged (lack of improvement in inertia)" in captured.out + def test_minibatch_iter_steps(): # Check consistency of n_iter_ and n_steps_ attributes. @@ -489,6 +496,7 @@ def test_minibatch_iter_steps(): # n_iter_ is the number of started epochs assert km.n_iter_ == np.ceil((km.n_steps_ * batch_size) / n_samples) + assert isinstance(km.n_iter_, int) # without stopping condition, max_iter should be reached km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, random_state=0, @@ -496,6 +504,7 @@ def test_minibatch_iter_steps(): assert km.n_iter_ == 10 assert km.n_steps_ == (10 * n_samples) // batch_size + assert isinstance(km.n_steps_, int) def test_kmeans_copyx(): From c71b5b5beaeac716028339516ac9de7d5386cb86 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 18:10:44 +0100 Subject: [PATCH 68/72] add what's new entry --- doc/whats_new/v1.0.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a00523ec2223b..a1c68d5132e6a 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -56,9 +56,27 @@ Changelog in multicore settings. :pr:`19052` by :user:`Yusuke Nagasaka `. +- |Efficiency| :class:`cluster.MiniBatchKMeans` is now faster in multicore + settings. :pr:`17622` by :user:`Jérémie du Boisberranger `. + - |Fix| Fixes incorrect multiple data-conversion warnings when clustering boolean data. :pr:`19046` by :user:`Surya Prakash `. +- |Fix| Fixed a bug in :class:`cluster.MiniBatchKMeans` where the sample + weights were partially ignored when the input is sparse. :pr:`17622` by + :user:`Jérémie du Boisberranger `. + +- |Fix| Improved convergence detection based on center change in + :class:`cluster.MiniBatchKMeans` which was almost never achievable. + :pr:`17622` by :user:`Jérémie du Boisberranger `. 
+ +- |API| the default value for the `batch_size` parameter of + :class:`MiniBatchKMeans` was changed from 100 to 1024 due to efficiency + reasons. The `n_iter_` attribute of :class:`MiniBatchKMeans` now reports the + number of started epochs and the `n_steps_` attribute reports the number of + mini batches processed. :pr:`17622` + by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.ensemble` ....................... From 5d4e3d9faafc307e0ed58ddde50f480d6eb224a3 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 18:13:20 +0100 Subject: [PATCH 69/72] remove warning in test --- sklearn/cluster/tests/test_k_means.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index fccadd68e821c..3cb9e395ab743 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -474,7 +474,7 @@ def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) km = MiniBatchKMeans(n_clusters=3, init=centers, batch_size=20, tol=tol, - random_state=0, max_iter=10, verbose=1, + random_state=0, max_iter=10, n_init=1, verbose=1, max_no_improvement=max_no_improvement) km.fit(X) @@ -485,7 +485,7 @@ def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): assert "Converged (small centers change)" in captured.out if tol == 0: assert "Converged (lack of improvement in inertia)" in captured.out - + def test_minibatch_iter_steps(): # Check consistency of n_iter_ and n_steps_ attributes. From ebefe18b1d74353e0ee10ae7d7edf777bcde6430 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Mar 2021 18:20:49 +0100 Subject: [PATCH 70/72] lint --- sklearn/cluster/tests/test_k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 3cb9e395ab743..a56c2d8e55d8e 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -485,7 +485,7 @@ def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): assert "Converged (small centers change)" in captured.out if tol == 0: assert "Converged (lack of improvement in inertia)" in captured.out - + def test_minibatch_iter_steps(): # Check consistency of n_iter_ and n_steps_ attributes. From be0c9487141901d895fcb3093016a2918555990a Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 7 Apr 2021 19:09:30 +0200 Subject: [PATCH 71/72] adress comments --- sklearn/cluster/_k_means_minibatch.pyx | 6 +++--- sklearn/cluster/_kmeans.py | 17 +++++++++++------ sklearn/cluster/tests/test_k_means.py | 4 ++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index 1f52625279aef..ab5aee35ea075 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -2,7 +2,7 @@ # TODO: We still need to use ndarrays instead of typed memoryviews when using # fused types and when the array may be read-only (for instance when it's -# provided by the user). This is fixed in cython > 0.3. +# provided by the user). This will be fixed in cython >= 0.3. 
cimport numpy as np from cython cimport floating @@ -79,7 +79,7 @@ cdef void update_center_dense( cdef: int n_samples = sample_weight.shape[0] int n_features = centers_old.shape[1] - floating alpha, tmp + floating alpha int n_indices int k, sample_idx, feature_idx @@ -189,7 +189,7 @@ cdef void update_center_sparse( cdef: int n_samples = sample_weight.shape[0] int n_features = centers_old.shape[1] - floating alpha, tmp + floating alpha int n_indices int k, sample_idx, feature_idx diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 1ccff8fe8e454..e9f952b58cb1b 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1309,7 +1309,7 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, ---------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) - The original data array. In sparse, must be in CSR format. + The original data array. If sparse, must be in CSR format. x_squared_norms : ndarray of shape (n_samples,) Squared euclidean norm of each data point. @@ -1356,6 +1356,8 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, the centers. """ # Perform label assignment to nearest centers + # For better efficiency, it's better to run _mini_batch_step in a + # threadpool_limit context then using _labels_inertia_threadpool_limit here labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=n_threads) @@ -1493,8 +1495,8 @@ class MiniBatchKMeans(KMeans): Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count centers are more easily reassigned, which means that the model will take longer to - converge, but should converge in a better clustering. A too high value - may however cause convergence issues, especially with a small batch + converge, but should converge in a better clustering. However, too high + a value may cause convergence issues, especially with a small batch size. Attributes @@ -1503,7 +1505,7 @@ class MiniBatchKMeans(KMeans): cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers. - labels_ : ndarray of shape (n_samples) + labels_ : ndarray of shape (n_samples,) Labels of each point (if compute_labels is set to True). inertia_ : float @@ -1869,8 +1871,11 @@ def partial_fit(self, X, y=None, sample_weight=None): Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) - Coordinates of the data points to cluster. It must be noted that - X will be copied if it is not C-contiguous. + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. y : Ignored Not used, present here for API consistency by convention. 
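As a usage illustration of the incremental API whose docstring is updated just above, a minimal sketch (it assumes a scikit-learn build containing this branch; the data and parameters are made up for the example):

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans

    rng = np.random.RandomState(0)
    X = rng.random_sample((10000, 5))

    km = MiniBatchKMeans(n_clusters=8, random_state=0)
    # Stream the data one mini-batch at a time, e.g. when it does not fit in memory.
    for batch in np.array_split(X, 100):
        km.partial_fit(batch)

    labels = km.predict(X)
    # With this branch, n_steps_ counts the partial_fit calls (100 here).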
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index a56c2d8e55d8e..248b2e1ddd498 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -999,9 +999,9 @@ def test_inertia(dtype): expected = np.sum(distances * sample_weight) inertia_dense = _inertia_dense( - X_dense, sample_weight, centers, labels, 1) + X_dense, sample_weight, centers, labels, n_threads=1) inertia_sparse = _inertia_sparse( - X_sparse, sample_weight, centers, labels, 1) + X_sparse, sample_weight, centers, labels, n_threads=1) assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6) assert_allclose(inertia_dense, expected, rtol=1e-6) From 5ff60c8024e2b2542047e725879b003702a5d7b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 7 Apr 2021 19:39:02 +0200 Subject: [PATCH 72/72] Update sklearn/cluster/_kmeans.py Co-authored-by: Julien Jerphanion --- sklearn/cluster/_kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index e9f952b58cb1b..44c2837a8802a 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1357,7 +1357,7 @@ def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, """ # Perform label assignment to nearest centers # For better efficiency, it's better to run _mini_batch_step in a - # threadpool_limit context then using _labels_inertia_threadpool_limit here + # threadpool_limit context than using _labels_inertia_threadpool_limit here labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=n_threads)
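The convergence monitoring that several of the patches above rework (batch-inertia normalization in patch 61, the EWA bookkeeping in patch 63) boils down to a small update rule. As a sketch, with invented names and stripped of the surrounding state handling:

    def update_ewa_inertia(ewa_inertia, batch_inertia, batch_size, n_samples):
        # Smoothing factor chosen so that old batches are forgotten after
        # roughly two passes over the full dataset.
        alpha = min(batch_size * 2.0 / (n_samples + 1), 1.0)
        if ewa_inertia is None:
            # First monitored mini-batch: start the average at the batch inertia.
            return batch_inertia
        return ewa_inertia * (1 - alpha) + batch_inertia * alpha

Fitting then stops early either when the squared change of the centers falls below the tolerance, or when this smoothed inertia stops improving for max_no_improvement consecutive steps.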
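Similarly, the relation between the new n_steps_ attribute and n_iter_ (patches 63 to 65) is pure bookkeeping; with made-up numbers:

    import numpy as np

    n_samples, batch_size, max_iter = 150, 30, 10

    # Upper bound on the number of mini-batch steps performed by fit
    # (see the change introduced in patch 65).
    n_steps = (max_iter * n_samples) // batch_size                    # 50
    # If no early-stopping criterion triggers, the fitted attributes are:
    n_steps_ = n_steps                                                # 50 mini-batches processed
    n_iter_ = int(np.ceil((n_steps_ * batch_size) / n_samples))       # 10 started epochs

    assert n_iter_ == max_iter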