@@ -288,131 +288,17 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
        Returned only if `return_n_iter` is set to True.

    """
-    if n_init <= 0:
-        raise ValueError("Invalid number of initializations."
-                         " n_init=%d must be bigger than zero." % n_init)
-    random_state = check_random_state(random_state)
-
-    if max_iter <= 0:
-        raise ValueError('Number of iterations should be a positive number,'
-                         ' got %d instead' % max_iter)
-
-    # avoid forcing order when copy_x=False
-    order = "C" if copy_x else None
-    X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
-                    order=order, copy=copy_x)
-    # verify that the number of samples given is larger than k
-    if _num_samples(X) < n_clusters:
-        raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
-            _num_samples(X), n_clusters))
-
-    tol = _tolerance(X, tol)
-
-    # If the distances are precomputed every job will create a matrix of shape
-    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
-    # activate this if the created matrix is guaranteed to be under 100MB. 12
-    # million entries consume a little under 100MB if they are of type double.
-    if precompute_distances == 'auto':
-        n_samples = X.shape[0]
-        precompute_distances = (n_clusters * n_samples) < 12e6
-    elif isinstance(precompute_distances, bool):
-        pass
-    else:
-        raise ValueError("precompute_distances should be 'auto' or True/False"
-                         ", but a value of %r was passed" %
-                         precompute_distances)
-
-    # Validate init array
-    if hasattr(init, '__array__'):
-        init = check_array(init, dtype=X.dtype.type, copy=True)
-        _validate_center_shape(X, n_clusters, init)
-
-        if n_init != 1:
-            warnings.warn(
-                'Explicit initial center position passed: '
-                'performing only one init in k-means instead of n_init=%d'
-                % n_init, RuntimeWarning, stacklevel=2)
-            n_init = 1
-
-    # subtract of mean of x for more accurate distance computations
-    if not sp.issparse(X):
-        X_mean = X.mean(axis=0)
-        # The copy was already done above
-        X -= X_mean
-
-        if hasattr(init, '__array__'):
-            init -= X_mean
-
-    # precompute squared norms of data points
-    x_squared_norms = row_norms(X, squared=True)
-
-    best_labels, best_inertia, best_centers = None, None, None
-    if n_clusters == 1:
-        # elkan doesn't make sense for a single cluster, full will produce
-        # the right result.
-        algorithm = "full"
-    if algorithm == "auto":
-        algorithm = "full" if sp.issparse(X) else 'elkan'
-    if algorithm == "full":
-        kmeans_single = _kmeans_single_lloyd
-    elif algorithm == "elkan":
-        kmeans_single = _kmeans_single_elkan
-    else:
-        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
-                         " %s" % str(algorithm))
-
-    seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
-    if effective_n_jobs(n_jobs) == 1:
-        # For a single thread, less memory is needed if we just store one set
-        # of the best results (as opposed to one set per run per thread).
-        for seed in seeds:
-            # run a k-means once
-            labels, inertia, centers, n_iter_ = kmeans_single(
-                X, sample_weight, n_clusters, max_iter=max_iter, init=init,
-                verbose=verbose, precompute_distances=precompute_distances,
-                tol=tol, x_squared_norms=x_squared_norms,
-                random_state=seed)
-            # determine if these results are the best so far
-            if best_inertia is None or inertia < best_inertia:
-                best_labels = labels.copy()
-                best_centers = centers.copy()
-                best_inertia = inertia
-                best_n_iter = n_iter_
-    else:
-        # parallelisation of k-means runs
-        results = Parallel(n_jobs=n_jobs, verbose=0)(
-            delayed(kmeans_single)(X, sample_weight, n_clusters,
-                                   max_iter=max_iter, init=init,
-                                   verbose=verbose, tol=tol,
-                                   precompute_distances=precompute_distances,
-                                   x_squared_norms=x_squared_norms,
-                                   # Change seed to ensure variety
-                                   random_state=seed)
-            for seed in seeds)
-        # Get results with the lowest inertia
-        labels, inertia, centers, n_iters = zip(*results)
-        best = np.argmin(inertia)
-        best_labels = labels[best]
-        best_inertia = inertia[best]
-        best_centers = centers[best]
-        best_n_iter = n_iters[best]
-
-    if not sp.issparse(X):
-        if not copy_x:
-            X += X_mean
-        best_centers += X_mean
-
-    distinct_clusters = len(set(best_labels))
-    if distinct_clusters < n_clusters:
-        warnings.warn("Number of distinct clusters ({}) found smaller than "
-                      "n_clusters ({}). Possibly due to duplicate points "
-                      "in X.".format(distinct_clusters, n_clusters),
-                      ConvergenceWarning, stacklevel=2)

+    est = KMeans(
+        n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter,
+        verbose=verbose, precompute_distances=precompute_distances, tol=tol,
+        random_state=random_state, copy_x=copy_x, n_jobs=n_jobs,
+        algorithm=algorithm
+    ).fit(X, sample_weight=sample_weight)
    if return_n_iter:
-        return best_centers, best_labels, best_inertia, best_n_iter
+        return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_
    else:
-        return best_centers, best_labels, best_inertia
+        return est.cluster_centers_, est.labels_, est.inertia_


def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300,
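A minimal usage sketch of the hunk above (illustrative only; the toy data is
an assumption, not from the patch): k_means is now a thin functional wrapper
that constructs and fits a KMeans estimator, so the functional and
object-oriented APIs should agree exactly for a fixed random_state.

    import numpy as np
    from sklearn.cluster import KMeans, k_means

    X = np.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.]])
    # functional API, now delegating to the estimator internally
    centers, labels, inertia = k_means(X, n_clusters=2, random_state=0)
    # object-oriented API, same code path
    est = KMeans(n_clusters=2, random_state=0).fit(X)
    assert np.allclose(centers, est.cluster_centers_)
    assert np.array_equal(labels, est.labels_)

Since both calls run the identical fit logic with the same seed, the
assertions should hold exactly, not just approximately.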
@@ -953,15 +839,144 @@ def fit(self, X, y=None, sample_weight=None):
        """
        random_state = check_random_state(self.random_state)

-        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
-            k_means(
-                X, n_clusters=self.n_clusters, sample_weight=sample_weight,
-                init=self.init, n_init=self.n_init,
-                max_iter=self.max_iter, verbose=self.verbose,
-                precompute_distances=self.precompute_distances,
-                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
-                n_jobs=self.n_jobs, algorithm=self.algorithm,
-                return_n_iter=True)
+        n_init = self.n_init
+        if n_init <= 0:
+            raise ValueError("Invalid number of initializations."
+                             " n_init=%d must be bigger than zero." % n_init)
+
+        if self.max_iter <= 0:
+            raise ValueError(
+                'Number of iterations should be a positive number,'
+                ' got %d instead' % self.max_iter
+            )
+
+        # avoid forcing order when copy_x=False
+        order = "C" if self.copy_x else None
+        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
+                        order=order, copy=self.copy_x)
+        # verify that the number of samples given is larger than k
+        if _num_samples(X) < self.n_clusters:
+            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
+                _num_samples(X), self.n_clusters))
+
+        tol = _tolerance(X, self.tol)
+
+        # If the distances are precomputed every job will create a matrix of
+        # shape (n_clusters, n_samples). To stop KMeans from eating up memory
+        # we only activate this if the created matrix is guaranteed to be
+        # under 100MB. 12 million entries consume a little under 100MB if they
+        # are of type double.
+        precompute_distances = self.precompute_distances
+        if precompute_distances == 'auto':
+            n_samples = X.shape[0]
+            precompute_distances = (self.n_clusters * n_samples) < 12e6
+        elif isinstance(precompute_distances, bool):
+            pass
+        else:
+            raise ValueError(
+                "precompute_distances should be 'auto' or True/False"
+                ", but a value of %r was passed" %
+                precompute_distances
+            )
+
+        # Validate init array
+        init = self.init
+        if hasattr(init, '__array__'):
+            init = check_array(init, dtype=X.dtype.type, copy=True)
+            _validate_center_shape(X, self.n_clusters, init)
+
+            if n_init != 1:
+                warnings.warn(
+                    'Explicit initial center position passed: '
+                    'performing only one init in k-means instead of n_init=%d'
+                    % n_init, RuntimeWarning, stacklevel=2)
+                n_init = 1
+
+        # subtract of mean of x for more accurate distance computations
+        if not sp.issparse(X):
+            X_mean = X.mean(axis=0)
+            # The copy was already done above
+            X -= X_mean
+
+            if hasattr(init, '__array__'):
+                init -= X_mean
+
+        # precompute squared norms of data points
+        x_squared_norms = row_norms(X, squared=True)
+
+        best_labels, best_inertia, best_centers = None, None, None
+        algorithm = self.algorithm
+        if self.n_clusters == 1:
+            # elkan doesn't make sense for a single cluster, full will produce
+            # the right result.
+            algorithm = "full"
+        if algorithm == "auto":
+            algorithm = "full" if sp.issparse(X) else 'elkan'
+        if algorithm == "full":
+            kmeans_single = _kmeans_single_lloyd
+        elif algorithm == "elkan":
+            kmeans_single = _kmeans_single_elkan
+        else:
+            raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
+                             " %s" % str(algorithm))
+
+        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
+        if effective_n_jobs(self.n_jobs) == 1:
+            # For a single thread, less memory is needed if we just store one
+            # set of the best results (as opposed to one set per run per
+            # thread).
+            for seed in seeds:
+                # run a k-means once
+                labels, inertia, centers, n_iter_ = kmeans_single(
+                    X, sample_weight, self.n_clusters,
+                    max_iter=self.max_iter, init=init, verbose=self.verbose,
+                    precompute_distances=precompute_distances, tol=tol,
+                    x_squared_norms=x_squared_norms, random_state=seed)
+                # determine if these results are the best so far
+                if best_inertia is None or inertia < best_inertia:
+                    best_labels = labels.copy()
+                    best_centers = centers.copy()
+                    best_inertia = inertia
+                    best_n_iter = n_iter_
+        else:
+            # parallelisation of k-means runs
+            results = Parallel(n_jobs=self.n_jobs, verbose=0)(
+                delayed(kmeans_single)(
+                    X, sample_weight, self.n_clusters,
+                    max_iter=self.max_iter, init=init,
+                    verbose=self.verbose, tol=tol,
+                    precompute_distances=precompute_distances,
+                    x_squared_norms=x_squared_norms,
+                    # Change seed to ensure variety
+                    random_state=seed
+                )
+                for seed in seeds)
+            # Get results with the lowest inertia
+            labels, inertia, centers, n_iters = zip(*results)
+            best = np.argmin(inertia)
+            best_labels = labels[best]
+            best_inertia = inertia[best]
+            best_centers = centers[best]
+            best_n_iter = n_iters[best]
+
+        if not sp.issparse(X):
+            if not self.copy_x:
+                X += X_mean
+            best_centers += X_mean
+
+        distinct_clusters = len(set(best_labels))
+        if distinct_clusters < self.n_clusters:
+            warnings.warn(
+                "Number of distinct clusters ({}) found smaller than "
+                "n_clusters ({}). Possibly due to duplicate points "
+                "in X.".format(distinct_clusters, self.n_clusters),
+                ConvergenceWarning, stacklevel=2
+            )
+
+        self.cluster_centers_ = best_centers
+        self.labels_ = best_labels
+        self.inertia_ = best_inertia
+        self.n_iter_ = best_n_iter
        return self

    def fit_predict(self, X, y=None, sample_weight=None):
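A second sketch of behaviour preserved by the inlined fit logic above
(illustrative only; the data and initial centers are assumptions): passing an
explicit array as init performs a single initialization and emits a
RuntimeWarning whenever n_init != 1, exactly as the moved validation code
shows.

    import warnings
    import numpy as np
    from sklearn.cluster import KMeans

    X = np.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.]])
    init = np.array([[0., 0.], [10., 10.]])  # explicit initial centers

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        KMeans(n_clusters=2, init=init, n_init=10).fit(X)
    # only one init is performed, and the n_init override is reported
    assert any(issubclass(w.category, RuntimeWarning) for w in caught)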