crankycoder
diff --git a/‎doc/whats_new/v0.22.rst
Lines changed: 5 additions & 0 deletions b/‎doc/whats_new/v0.22.rst
Lines changed: 5 additions & 0 deletions
diff --git a/‎sklearn/cluster/mean_shift_.py
Lines changed: 93 additions & 73 deletions b/‎sklearn/cluster/mean_shift_.py
Lines changed: 93 additions & 73 deletions
diff --git a/‎sklearn/cluster/tests/test_mean_shift.py
Lines changed: 13 additions & 0 deletions b/‎sklearn/cluster/tests/test_mean_shift.py
Lines changed: 13 additions & 0 deletions
@@ -89,6 +89,11 @@ Changelog
   producing Segmentation Fault on large arrays due to integer index overflow.
   :pr:`15057` by :user:`Vladimir Korolev <balodja>`.
 
+- |Fix| :class:`~cluster.MeanShift` now accepts a :term:`max_iter` parameter
+  (default 300) instead of always using a hard-coded limit of 300 iterations.
+  It also now exposes an ``n_iter_`` attribute giving the maximum number of
+  iterations performed on any single seed. :pr:`15120` by `Adrin Jalali`_.
+
 :mod:`sklearn.compose`
 ......................
 
 
@@ -101,8 +101,9 @@ def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
         # If converged or at max_iter, adds the cluster
         if (np.linalg.norm(my_mean - my_old_mean) < stop_thresh or
                 completed_iterations == max_iter):
-            return tuple(my_mean), len(points_within)
+            break
         completed_iterations += 1
+    return tuple(my_mean), len(points_within), completed_iterations
 
 
 def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False,
@@ -178,72 +179,12 @@ def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False,
     <sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.
 
     """
-
-    if bandwidth is None:
-        bandwidth = estimate_bandwidth(X, n_jobs=n_jobs)
-    elif bandwidth <= 0:
-        raise ValueError("bandwidth needs to be greater than zero or None,"
-                         " got %f" % bandwidth)
-    if seeds is None:
-        if bin_seeding:
-            seeds = get_bin_seeds(X, bandwidth, min_bin_freq)
-        else:
-            seeds = X
-    n_samples, n_features = X.shape
-    center_intensity_dict = {}
-
-    # We use n_jobs=1 because this will be used in nested calls under
-    # parallel calls to _mean_shift_single_seed so there is no need for
-    # for further parallelism.
-    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)
-
-    # execute iterations on all seeds in parallel
-    all_res = Parallel(n_jobs=n_jobs)(
-        delayed(_mean_shift_single_seed)
-        (seed, X, nbrs, max_iter) for seed in seeds)
-    # copy results in a dictionary
-    for i in range(len(seeds)):
-        if all_res[i] is not None:
-            center_intensity_dict[all_res[i][0]] = all_res[i][1]
-
-    if not center_intensity_dict:
-        # nothing near seeds
-        raise ValueError("No point was within bandwidth=%f of any seed."
-                         " Try a different seeding strategy \
-                         or increase the bandwidth."
-                         % bandwidth)
-
-    # POST PROCESSING: remove near duplicate points
-    # If the distance between two kernels is less than the bandwidth,
-    # then we have to remove one because it is a duplicate. Remove the
-    # one with fewer points.
-
-    sorted_by_intensity = sorted(center_intensity_dict.items(),
-                                 key=lambda tup: (tup[1], tup[0]),
-                                 reverse=True)
-    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
-    unique = np.ones(len(sorted_centers), dtype=np.bool)
-    nbrs = NearestNeighbors(radius=bandwidth,
-                            n_jobs=n_jobs).fit(sorted_centers)
-    for i, center in enumerate(sorted_centers):
-        if unique[i]:
-            neighbor_idxs = nbrs.radius_neighbors([center],
-                                                  return_distance=False)[0]
-            unique[neighbor_idxs] = 0
-            unique[i] = 1  # leave the current point as unique
-    cluster_centers = sorted_centers[unique]
-
-    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
-    nbrs = NearestNeighbors(n_neighbors=1, n_jobs=n_jobs).fit(cluster_centers)
-    labels = np.zeros(n_samples, dtype=np.int)
-    distances, idxs = nbrs.kneighbors(X)
-    if cluster_all:
-        labels = idxs.flatten()
-    else:
-        labels.fill(-1)
-        bool_selector = distances.flatten() <= bandwidth
-        labels[bool_selector] = idxs.flatten()[bool_selector]
-    return cluster_centers, labels
+    model = MeanShift(bandwidth=bandwidth, seeds=seeds,
+                      min_bin_freq=min_bin_freq,
+                      bin_seeding=bin_seeding,
+                      cluster_all=cluster_all, n_jobs=n_jobs,
+                      max_iter=max_iter).fit(X)
+    return model.cluster_centers_, model.labels_
 
 
 def get_bin_seeds(X, bin_size, min_bin_freq=1):
@@ -347,6 +288,12 @@ class MeanShift(ClusterMixin, BaseEstimator):
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
+    max_iter : int, default=300
+        Maximum number of iterations per seed point before the clustering
+        operation terminates (for that seed point) if it has not converged yet.
+
+        .. versionadded:: 0.22
+
     Attributes
     ----------
     cluster_centers_ : array, [n_clusters, n_features]
@@ -355,6 +302,11 @@ class MeanShift(ClusterMixin, BaseEstimator):
     labels_ :
         Labels of each point.
 
+    n_iter_ : int
+        Maximum number of iterations performed on any single seed.
+
+        .. versionadded:: 0.22
+
     Examples
     --------
     >>> from sklearn.cluster import MeanShift
@@ -395,13 +347,14 @@ class MeanShift(ClusterMixin, BaseEstimator):
 
     """
     def __init__(self, bandwidth=None, seeds=None, bin_seeding=False,
-                 min_bin_freq=1, cluster_all=True, n_jobs=None):
+                 min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300):
         self.bandwidth = bandwidth
         self.seeds = seeds
         self.bin_seeding = bin_seeding
         self.cluster_all = cluster_all
         self.min_bin_freq = min_bin_freq
         self.n_jobs = n_jobs
+        self.max_iter = max_iter
 
     def fit(self, X, y=None):
         """Perform clustering.
@@ -415,11 +368,78 @@ def fit(self, X, y=None):
 
         """
         X = check_array(X)
-        self.cluster_centers_, self.labels_ = \
-            mean_shift(X, bandwidth=self.bandwidth, seeds=self.seeds,
-                       min_bin_freq=self.min_bin_freq,
-                       bin_seeding=self.bin_seeding,
-                       cluster_all=self.cluster_all, n_jobs=self.n_jobs)
+        bandwidth = self.bandwidth
+        if bandwidth is None:
+            bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)
+        elif bandwidth <= 0:
+            raise ValueError("bandwidth needs to be greater than zero or None,"
+                             " got %f" % bandwidth)
+
+        seeds = self.seeds
+        if seeds is None:
+            if self.bin_seeding:
+                seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)
+            else:
+                seeds = X
+        n_samples, n_features = X.shape
+        center_intensity_dict = {}
+
+        # We use n_jobs=1 because this will be used in nested calls under
+        # parallel calls to _mean_shift_single_seed so there is no need for
+        # further parallelism.
+        nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)
+
+        # execute iterations on all seeds in parallel
+        all_res = Parallel(n_jobs=self.n_jobs)(
+            delayed(_mean_shift_single_seed)
+            (seed, X, nbrs, self.max_iter) for seed in seeds)
+        # copy results in a dictionary
+        for i in range(len(seeds)):
+            if all_res[i][1]:  # i.e. len(points_within) > 0
+                center_intensity_dict[all_res[i][0]] = all_res[i][1]
+
+        self.n_iter_ = max([x[2] for x in all_res])
+
+        if not center_intensity_dict:
+            # nothing near seeds
+            raise ValueError("No point was within bandwidth=%f of any seed."
+                             " Try a different seeding strategy \
+                             or increase the bandwidth."
+                             % bandwidth)
+
+        # POST PROCESSING: remove near duplicate points
+        # If the distance between two kernels is less than the bandwidth,
+        # then we have to remove one because it is a duplicate. Remove the
+        # one with fewer points.
+
+        sorted_by_intensity = sorted(center_intensity_dict.items(),
+                                     key=lambda tup: (tup[1], tup[0]),
+                                     reverse=True)
+        sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
+        unique = np.ones(len(sorted_centers), dtype=np.bool)
+        nbrs = NearestNeighbors(radius=bandwidth,
+                                n_jobs=self.n_jobs).fit(sorted_centers)
+        for i, center in enumerate(sorted_centers):
+            if unique[i]:
+                neighbor_idxs = nbrs.radius_neighbors([center],
+                                                      return_distance=False)[0]
+                unique[neighbor_idxs] = 0
+                unique[i] = 1  # leave the current point as unique
+        cluster_centers = sorted_centers[unique]
+
+        # ASSIGN LABELS: a point belongs to the cluster that it is closest to
+        nbrs = NearestNeighbors(n_neighbors=1,
+                                n_jobs=self.n_jobs).fit(cluster_centers)
+        labels = np.zeros(n_samples, dtype=np.int)
+        distances, idxs = nbrs.kneighbors(X)
+        if self.cluster_all:
+            labels = idxs.flatten()
+        else:
+            labels.fill(-1)
+            bool_selector = distances.flatten() <= bandwidth
+            labels[bool_selector] = idxs.flatten()[bool_selector]
+
+        self.cluster_centers_, self.labels_ = cluster_centers, labels
         return self
 
     def predict(self, X):
 
@@ -155,3 +155,16 @@ def test_bin_seeds():
                       cluster_std=0.1, random_state=0)
     test_bins = get_bin_seeds(X, 1)
     assert_array_equal(test_bins, [[0, 0], [1, 1]])
+
+
+@pytest.mark.parametrize('max_iter', [1, 100])
+def test_max_iter(max_iter):
+    clusters1, _ = mean_shift(X, max_iter=max_iter)
+    ms = MeanShift(max_iter=max_iter).fit(X)
+    clusters2 = ms.cluster_centers_
+
+    assert ms.n_iter_ <= ms.max_iter
+    assert len(clusters1) == len(clusters2)
+
+    for c1, c2 in zip(clusters1, clusters2):
+        assert np.allclose(c1, c2)