diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 899da518ae796..94ff3935002a6 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -52,11 +52,30 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', shorter run times. metric : string or callable, optional (default='euclidean') - The distance metric to use for neighborhood lookups. Default is - "euclidean". Other options include "minkowski", "manhattan", - "chebyshev", "haversine", "seuclidean", "hamming", "canberra", - and "braycurtis". The "wminkowski" and "mahalanobis" metrics are - also valid with an additional argument. + Metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + If metric is 'precomputed', X is treated as a square distance matrix. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. p : integer, optional (default=2) Parameter for the Minkowski metric from @@ -182,11 +201,30 @@ class OPTICS(BaseEstimator, ClusterMixin): shorter run times. metric : string or callable, optional (default='euclidean') - The distance metric to use for neighborhood lookups. Default is - "euclidean".
Other options include "minkowski", "manhattan", - "chebyshev", "haversine", "seuclidean", "hamming", "canberra", - and "braycurtis". The "wminkowski" and "mahalanobis" metrics are - also valid with an additional argument. + Metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + If metric is 'precomputed', X is treated as a square distance matrix. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. p : integer, optional (default=2) Parameter for the Minkowski metric from @@ -419,8 +457,11 @@ def _set_reach_dist(self, point_index, processed, X, nbrs): # Everything is already processed.
Return to main loop return point_index - dists = pairwise_distances(P, np.take(X, unproc, axis=0), - self.metric, n_jobs=1).ravel() + if self.metric == 'precomputed': + dists = X[point_index, unproc] + else: + dists = pairwise_distances(P, np.take(X, unproc, axis=0), + self.metric, n_jobs=None).ravel() rdists = np.maximum(dists, self.core_distances_[point_index]) new_reach = np.minimum(np.take(self.reachability_, unproc), rdists) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index bddf57ec7b5d1..1215746faa4c3 100755 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -11,6 +11,7 @@ from sklearn.cluster.optics_ import _TreeNode, _cluster_tree from sklearn.cluster.optics_ import _find_local_maxima from sklearn.metrics.cluster import contingency_matrix +from sklearn.metrics.pairwise import pairwise_distances from sklearn.cluster.dbscan_ import DBSCAN from sklearn.utils.testing import assert_equal, assert_warns from sklearn.utils.testing import assert_array_equal @@ -436,3 +437,15 @@ def test_reach_dists(): else: # we compare to truncated decimals, so use atol assert_allclose(clust.reachability_, np.array(v), atol=1e-5) + + +def test_precomputed_dists(): + redX = X[::10] + dists = pairwise_distances(redX, metric='euclidean') + clust1 = OPTICS(min_samples=10, algorithm='brute', + metric='precomputed').fit(dists) + clust2 = OPTICS(min_samples=10, algorithm='brute', + metric='euclidean').fit(redX) + + assert_allclose(clust1.reachability_, clust2.reachability_) + assert_array_equal(clust1.labels_, clust2.labels_)