ENH: optimize DBSCAN (~10% faster on my data)

larsmans · larsmans · commit b73eae05e9f1 · 2015-01-23T10:48:26.000+01:00
diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
@@ -141,13 +141,14 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
         candidates = [index]
         while len(candidates) > 0:
             cand_neighbors = np.concatenate(np.take(neighborhoods, candidates,
-                                                    axis=0).tolist())
+                                                    axis=0))
             cand_neighbors = np.unique(cand_neighbors)
             noise = cand_neighbors[labels.take(cand_neighbors) == -1]
             labels[noise] = label_num
             # A candidate is a core point in the current cluster that has
             # not yet been used to expand the current cluster.
-            candidates = np.intersect1d(noise, core_samples)
+            candidates = np.intersect1d(noise, core_samples,
+                                        assume_unique=True)
         # Current cluster finished.
         # Next core point found will start a new cluster.
         label_num += 1
@@ -202,7 +203,7 @@ class DBSCAN(BaseEstimator, ClusterMixin):
 
     Notes
     -----
-    See examples/plot_dbscan.py for an example.
+    See examples/cluster/plot_dbscan.py for an example.
 
     References
     ----------