DOC Fix optics metric issues (DOC and precomputed) (#12028) · scikit-learn/scikit-learn@efe7b8c · GitHub
[go: up one dir, main page]

Skip to content

Commit efe7b8c

Browse files
adrinjalali authored and qinhanmin2014 committed
DOC Fix optics metric issues (DOC and precomputed) (#12028)
1 parent 6de7957 commit efe7b8c

File tree

2 files changed

+66
-12
lines changed

2 files changed

+66
-12
lines changed

sklearn/cluster/optics_.py

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,30 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
5252
shorter run times.
5353
5454
metric : string or callable, optional (default='euclidean')
55-
The distance metric to use for neighborhood lookups. Default is
56-
"euclidean". Other options include "minkowski", "manhattan",
57-
"chebyshev", "haversine", "seuclidean", "hamming", "canberra",
58-
and "braycurtis". The "wminkowski" and "mahalanobis" metrics are
59-
also valid with an additional argument.
55+
metric to use for distance computation. Any metric from scikit-learn
56+
or scipy.spatial.distance can be used.
57+
58+
If metric is a callable function, it is called on each
59+
pair of instances (rows) and the resulting value recorded. The callable
60+
should take two arrays as input and return one value indicating the
61+
distance between them. This works for Scipy's metrics, but is less
62+
efficient than passing the metric name as a string.
63+
64+
Distance matrices are not supported.
65+
66+
Valid values for metric are:
67+
68+
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
69+
'manhattan']
70+
71+
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
72+
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
73+
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
74+
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
75+
'yule']
76+
77+
See the documentation for scipy.spatial.distance for details on these
78+
metrics.
6079
6180
p : integer, optional (default=2)
6281
Parameter for the Minkowski metric from
@@ -182,11 +201,30 @@ class OPTICS(BaseEstimator, ClusterMixin):
182201
shorter run times.
183202
184203
metric : string or callable, optional (default='euclidean')
185-
The distance metric to use for neighborhood lookups. Default is
186-
"euclidean". Other options include "minkowski", "manhattan",
187-
"chebyshev", "haversine", "seuclidean", "hamming", "canberra",
188-
and "braycurtis". The "wminkowski" and "mahalanobis" metrics are
189-
also valid with an additional argument.
204+
metric to use for distance computation. Any metric from scikit-learn
205+
or scipy.spatial.distance can be used.
206+
207+
If metric is a callable function, it is called on each
208+
pair of instances (rows) and the resulting value recorded. The callable
209+
should take two arrays as input and return one value indicating the
210+
distance between them. This works for Scipy's metrics, but is less
211+
efficient than passing the metric name as a string.
212+
213+
Distance matrices are not supported.
214+
215+
Valid values for metric are:
216+
217+
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
218+
'manhattan']
219+
220+
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
221+
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
222+
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
223+
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
224+
'yule']
225+
226+
See the documentation for scipy.spatial.distance for details on these
227+
metrics.
190228
191229
p : integer, optional (default=2)
192230
Parameter for the Minkowski metric from
@@ -419,8 +457,11 @@ def _set_reach_dist(self, point_index, processed, X, nbrs):
419457
# Everything is already processed. Return to main loop
420458
return point_index
421459

422-
dists = pairwise_distances(P, np.take(X, unproc, axis=0),
423-
self.metric, n_jobs=1).ravel()
460+
if self.metric == 'precomputed':
461+
dists = X[point_index, unproc]
462+
else:
463+
dists = pairwise_distances(P, np.take(X, unproc, axis=0),
464+
self.metric, n_jobs=None).ravel()
424465

425466
rdists = np.maximum(dists, self.core_distances_[point_index])
426467
new_reach = np.minimum(np.take(self.reachability_, unproc), rdists)

sklearn/cluster/tests/test_optics.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from sklearn.cluster.optics_ import _TreeNode, _cluster_tree
1212
from sklearn.cluster.optics_ import _find_local_maxima
1313
from sklearn.metrics.cluster import contingency_matrix
14+
from sklearn.metrics.pairwise import pairwise_distances
1415
from sklearn.cluster.dbscan_ import DBSCAN
1516
from sklearn.utils.testing import assert_equal, assert_warns
1617
from sklearn.utils.testing import assert_array_equal
@@ -436,3 +437,15 @@ def test_reach_dists():
436437
else:
437438
# we compare to truncated decimals, so use atol
438439
assert_allclose(clust.reachability_, np.array(v), atol=1e-5)
440+
441+
442+
def test_precomputed_dists():
443+
redX = X[::10]
444+
dists = pairwise_distances(redX, metric='euclidean')
445+
clust1 = OPTICS(min_samples=10, algorithm='brute',
446+
metric='precomputed').fit(dists)
447+
clust2 = OPTICS(min_samples=10, algorithm='brute',
448+
metric='euclidean').fit(redX)
449+
450+
assert_allclose(clust1.reachability_, clust2.reachability_)
451+
assert_array_equal(clust1.labels_, clust2.labels_)

0 commit comments

Comments
 (0)
0