24
24
def optics (X , min_samples = 5 , max_eps = np .inf , metric = 'euclidean' ,
25
25
p = 2 , metric_params = None , maxima_ratio = .75 ,
26
26
rejection_ratio = .7 , similarity_threshold = 0.4 ,
27
- significant_min = .003 , min_cluster_size_ratio = .005 ,
27
+ significant_min = .003 , min_cluster_size = .005 ,
28
28
min_maxima_ratio = 0.001 , algorithm = 'ball_tree' ,
29
29
leaf_size = 30 , n_jobs = None ):
30
30
"""Perform OPTICS clustering from vector array
@@ -93,8 +93,10 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
93
93
significant_min : float, optional (default=.003)
94
94
Sets a lower threshold on how small a significant maximum can be.
95
95
96
- min_cluster_size_ratio : float, optional (default=.005)
97
- Minimum percentage of dataset expected for cluster membership.
96
+ min_cluster_size : int > 1 or float between 0 and 1 (default=0.005)
97
+ Minimum number of samples in an OPTICS cluster, expressed as an
98
+ absolute number or a fraction of the number of samples (rounded
99
+ to be at least 2).
98
100
99
101
min_maxima_ratio : float, optional (default=.001)
100
102
Used to determine neighborhood size for minimum cluster membership.
@@ -151,7 +153,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
151
153
clust = OPTICS (min_samples , max_eps , metric , p , metric_params ,
152
154
maxima_ratio , rejection_ratio ,
153
155
similarity_threshold , significant_min ,
154
- min_cluster_size_ratio , min_maxima_ratio ,
156
+ min_cluster_size , min_maxima_ratio ,
155
157
algorithm , leaf_size , n_jobs )
156
158
clust .fit (X )
157
159
return clust .core_sample_indices_ , clust .labels_
@@ -221,8 +223,10 @@ class OPTICS(BaseEstimator, ClusterMixin):
221
223
significant_min : float, optional (default=.003)
222
224
Sets a lower threshold on how small a significant maximum can be.
223
225
224
- min_cluster_size_ratio : float, optional (default=.005)
225
- Minimum percentage of dataset expected for cluster membership.
226
+ min_cluster_size : int > 1 or float between 0 and 1 (default=0.005)
227
+ Minimum number of samples in an OPTICS cluster, expressed as an
228
+ absolute number or a fraction of the number of samples (rounded
229
+ to be at least 2).
226
230
227
231
min_maxima_ratio : float, optional (default=.001)
228
232
Used to determine neighborhood size for minimum cluster membership.
@@ -289,7 +293,7 @@ class OPTICS(BaseEstimator, ClusterMixin):
289
293
def __init__ (self , min_samples = 5 , max_eps = np .inf , metric = 'euclidean' ,
290
294
p = 2 , metric_params = None , maxima_ratio = .75 ,
291
295
rejection_ratio = .7 , similarity_threshold = 0.4 ,
292
- significant_min = .003 ,
685C
min_cluster_size_ratio = .005 ,
296
+ significant_min = .003 , min_cluster_size = .005 ,
293
297
min_maxima_ratio = 0.001 , algorithm = 'ball_tree' ,
294
298
leaf_size = 30 , n_jobs = None ):
295
299
@@ -299,7 +303,7 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean',
299
303
self .rejection_ratio = rejection_ratio
300
304
self .similarity_threshold = similarity_threshold
301
305
self .significant_min = significant_min
302
- self .min_cluster_size_ratio = min_cluster_size_ratio
306
+ self .min_cluster_size = min_cluster_size
303
307
self .min_maxima_ratio = min_maxima_ratio
304
308
self .algorithm = algorithm
305
309
self .metric = metric
@@ -330,6 +334,24 @@ def fit(self, X, y=None):
330
334
X = check_array (X , dtype = np .float )
331
335
332
336
n_samples = len (X )
337
+
338
+ if self .min_samples > n_samples :
339
+ raise ValueError ("Number of training samples (n_samples=%d) must "
340
+ "be greater than min_samples (min_samples=%d) "
341
+ "used for clustering." %
342
+ (n_samples , self .min_samples ))
343
+
344
+ if self .min_cluster_size <= 0 or (self .min_cluster_size !=
345
+ int (self .min_cluster_size )
346
+ and self .min_cluster_size > 1 ):
347
+ raise ValueError ('min_cluster_size must be a positive integer or '
348
+ 'a float between 0 and 1. Got %r' %
349
+ self .min_cluster_size )
350
+ elif self .min_cluster_size > n_samples :
351
+ raise ValueError ('min_cluster_size must be no greater than the '
352
+ 'number of samples (%d). Got %d' %
353
+ (n_samples , self .min_cluster_size ))
354
+
333
355
# Start all points as 'unprocessed' ##
334
356
self .reachability_ = np .empty (n_samples )
335
357
self .reachability_ .fill (np .inf )
@@ -338,13 +360,6 @@ def fit(self, X, y=None):
338
360
# Start all points as noise ##
339
361
self .labels_ = np .full (n_samples , - 1 , dtype = int )
340
362
341
- # Check for valid n_samples relative to min_samples
342
- if self .min_samples > n_samples :
343
- raise ValueError ("Number of training samples (n_samples=%d) must "
344
- "be greater than min_samples (min_samples=%d) "
345
- "used for clustering." %
346
- (n_samples , self .min_samples ))
347
-
348
363
nbrs = NearestNeighbors (n_neighbors = self .min_samples ,
349
364
algorithm = self .algorithm ,
350
365
leaf_size = self .leaf_size , metric = self .metric ,
@@ -363,7 +378,7 @@ def fit(self, X, y=None):
363
378
self .rejection_ratio ,
364
379
self .similarity_threshold ,
365
380
self .significant_min ,
366
- self .min_cluster_size_ratio ,
381
+ self .min_cluster_size ,
367
382
self .min_maxima_ratio )
368
383
self .core_sample_indices_ = indices_
369
384
return self
@@ -492,7 +507,7 @@ def _extract_dbscan(ordering, core_distances, reachability, eps):
492
507
493
508
def _extract_optics (ordering , reachability , maxima_ratio = .75 ,
494
509
rejection_ratio = .7 , similarity_threshold = 0.4 ,
495
- significant_min = .003 , min_cluster_size_ratio = .005 ,
510
+ significant_min = .003 , min_cluster_size = .005 ,
496
511
min_maxima_ratio = 0.001 ):
497
512
"""Performs automatic cluster extraction for variable density data.
498
513
@@ -530,8 +545,10 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75,
530
545
significant_min : float, optional
531
546
Sets a lower threshold on how small a significant maximum can be.
532
547
533
- min_cluster_size_ratio : float, optional
534
- Minimum percentage of dataset expected for cluster membership.
548
+ min_cluster_size : int > 1 or float between 0 and 1
549
+ Minimum number of samples in an OPTICS cluster, expressed as an
550
+ absolute number or a fraction of the number of samples (rounded
551
+ to be at least 2).
535
552
536
553
min_maxima_ratio : float, optional
537
554
Used to determine neighborhood size for minimum cluster membership.
@@ -551,7 +568,7 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75,
551
568
root_node = _automatic_cluster (reachability_plot , ordering ,
552
569
maxima_ratio , rejection_ratio ,
553
570
similarity_threshold , significant_min ,
554
- min_cluster_size_ratio , min_maxima_ratio )
571
+ min_cluster_size , min_maxima_ratio )
555
572
leaves = _get_leaves (root_node , [])
556
573
# Start cluster id's at 0
557
574
clustid = 0
@@ -570,7 +587,7 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75,
570
587
def _automatic_cluster (reachability_plot , ordering ,
571
588
maxima_ratio , rejection_ratio ,
572
589
similarity_threshold , significant_min ,
573
- min_cluster_size_ratio , min_maxima_ratio ):
590
+ min_cluster_size , min_maxima_ratio ):
574
591
"""Converts reachability plot to cluster tree and returns root node.
575
592
576
593
Parameters
@@ -582,13 +599,10 @@ def _automatic_cluster(reachability_plot, ordering,
582
599
"""
583
600
584
601
min_neighborhood_size = 2
585
- min_cluster_size = int (min_cluster_size_ratio * len (ordering ))
602
+ if min_cluster_size <= 1 :
603
+ min_cluster_size = max (2 , min_cluster_size * len (ordering ))
586
604
neighborhood_size = int (min_maxima_ratio * len (ordering ))
587
605
588
- # Should this check for < min_samples? Should this be public?
F42D
589
- if min_cluster_size < 5 :
590
- min_cluster_size = 5
591
-
592
606
# Should this check < min_samples? Should the parameter be public?
593
607
if neighborhood_size < min_neighborhood_size :
594
608
neighborhood_size = min_neighborhood_size
0 commit comments