@@ -136,58 +136,12 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
136
136
DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
137
137
ACM Transactions on Database Systems (TODS), 42(3), 19.
138
138
"""
139
- if not eps > 0.0 :
140
- raise ValueError ("eps must be positive." )
141
-
142
- X = check_array (X , accept_sparse = 'csr' )
143
- if sample_weight is not None :
144
- sample_weight = np .asarray (sample_weight )
145
- check_consistent_length (X , sample_weight )
146
-
147
- # Calculate neighborhood for all samples. This leaves the original point
148
- # in, which needs to be considered later (i.e. point i is in the
149
- # neighborhood of point i. While True, its useless information)
150
- if metric == 'precomputed' and sparse .issparse (X ):
151
- neighborhoods = np .empty (X .shape [0 ], dtype = object )
152
- X .sum_duplicates () # XXX: modifies X's internals in-place
153
-
154
- # set the diagonal to explicit values, as a point is its own neighbor
155
- with warnings .catch_warnings ():
156
- warnings .simplefilter ('ignore' , sparse .SparseEfficiencyWarning )
157
- X .setdiag (X .diagonal ()) # XXX: modifies X's internals in-place
158
-
159
- X_mask = X .data <= eps
160
- masked_indices = X .indices .astype (np .intp , copy = False )[X_mask ]
161
- masked_indptr = np .concatenate (([0 ], np .cumsum (X_mask )))
162
- masked_indptr = masked_indptr [X .indptr [1 :- 1 ]]
163
-
164
- # split into rows
165
- neighborhoods [:] = np .split (masked_indices , masked_indptr )
166
- else :
167
- neighbors_model = NearestNeighbors (radius = eps , algorithm = algorithm ,
168
- leaf_size = leaf_size ,
169
- metric = metric ,
170
- metric_params = metric_params , p = p ,
171
- n_jobs = n_jobs )
172
- neighbors_model .fit (X )
173
- # This has worst case O(n^2) memory complexity
174
- neighborhoods = neighbors_model .radius_neighbors (X , eps ,
175
- return_distance = False )
176
-
177
- if sample_weight is None :
178
- n_neighbors = np .array ([len (neighbors )
179
- for neighbors in neighborhoods ])
180
- else :
181
- n_neighbors = np .array ([np .sum (sample_weight [neighbors ])
182
- for neighbors in neighborhoods ])
183
-
184
- # Initially, all samples are noise.
185
- labels = np .full (X .shape [0 ], - 1 , dtype = np .intp )
186
-
187
- # A list of all core samples found.
188
- core_samples = np .asarray (n_neighbors >= min_samples , dtype = np .uint8 )
189
- dbscan_inner (core_samples , neighborhoods , labels )
190
- return np .where (core_samples )[0 ], labels
139
+
140
+ est = DBSCAN (eps = eps , min_samples = min_samples , metric = metric ,
141
+ metric_params = metric_params , algorithm = algorithm ,
142
+ leaf_size = leaf_size , p = p , n_jobs = n_jobs )
143
+ est .fit (X , sample_weight = sample_weight )
144
+ return est .core_sample_indices_ , est .labels_
191
145
192
146
193
147
class DBSCAN (ClusterMixin , BaseEstimator ):
@@ -353,9 +307,63 @@ def fit(self, X, y=None, sample_weight=None):
353
307
354
308
"""
355
309
X = check_array (X , accept_sparse = 'csr' )
356
- clust = dbscan (X , sample_weight = sample_weight ,
357
- ** self .get_params ())
358
- self .core_sample_indices_ , self .labels_ = clust
310
+
311
+ if not self .eps > 0.0 :
312
+ raise ValueError ("eps must be positive." )
313
+
314
+ if sample_weight is not None :
315
+ sample_weight = np .asarray (sample_weight )
316
+ check_consistent_length (X , sample_weight )
317
+
318
+ # Calculate neighborhood for all samples. This leaves the original
319
+ # point in, which needs to be considered later (i.e. point i is in the
320
+ # neighborhood of point i. While True, its useless information)
321
+ if self .metric == 'precomputed' and sparse .issparse (X ):
322
+ neighborhoods = np .empty (X .shape [0 ], dtype = object )
323
+ X .sum_duplicates () # XXX: modifies X's internals in-place
324
+
325
+ # set the diagonal to explicit values, as a point is its own
326
+ # neighbor
327
+ with warnings .catch_warnings ():
328
+ warnings .simplefilter ('ignore' , sparse .SparseEfficiencyWarning )
329
+ X .setdiag (X .diagonal ()) # XXX: modifies X's internals in-place
330
+
331
+ X_mask = X .data <= self .eps
332
+ masked_indices = X .indices .astype (np .intp , copy = False )[X_mask ]
333
+ masked_indptr = np .concatenate (([0 ], np .cumsum (X_mask )))
334
+ masked_indptr = masked_indptr [X .indptr [1 :- 1 ]]
335
+
336
+ # split into rows
337
+ neighborhoods [:] = np .split (masked_indices , masked_indptr )
338
+ else :
339
+ neighbors_model = NearestNeighbors (
340
+ radius = self .eps , algorithm = self .algorithm ,
341
+ leaf_size = self .leaf_size , metric = self .metric ,
342
+ metric_params = self .metric_params , p = self .p , n_jobs = self .n_jobs
343
+ )
344
+ neighbors_model .fit (X )
345
+ # This has worst case O(n^2) memory complexity
346
+ neighborhoods = neighbors_model .radius_neighbors (
347
+ X , self .eps , return_distance = False )
348
+
349
+ if sample_weight is None :
350
+ n_neighbors = np .array ([len (neighbors )
351
+ for neighbors in neighborhoods ])
352
+ else :
353
+ n_neighbors = np .array ([np .sum (sample_weight [neighbors ])
354
+ for neighbors in neighborhoods ])
355
+
356
+ # Initially, all samples are noise.
357
+ labels = np .full (X .shape [0 ], - 1 , dtype = np .intp )
358
+
359
+ # A list of all core samples found.
360
+ core_samples = np .asarray (n_neighbors >= self .min_samples ,
361
+ dtype = np .uint8 )
362
+ dbscan_inner (core_samples , neighborhoods , labels )
363
+
364
+ self .core_sample_indices_ = np .where (core_samples )[0 ]
365
+ self .labels_ = labels
366
+
359
367
if len (self .core_sample_indices_ ):
360
368
# fix for scipy sparse indexing issue
361
369
self .components_ = X [self .core_sample_indices_ ].copy ()
0 commit comments