From b32d70f6ca22638325f55d97b919e1dde509a3bd Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 6 Sep 2018 16:52:53 +0200 Subject: [PATCH 1/6] Support precomputed distance matrix --- sklearn/cluster/optics_.py | 36 ++++++++++++++++++++++------ sklearn/cluster/tests/test_optics.py | 24 +++++++++++++++++++ 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 5c20ddb421845..9686e01299f25 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -52,11 +52,30 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', shorter run times. metric : string or callable, optional (default='euclidean') - The distance metric to use for neighborhood lookups. Default is - "euclidean". Other options include "minkowski", "manhattan", - "chebyshev", "haversine", "seuclidean", "hamming", "canberra", - and "braycurtis". The "wminkowski" and "mahalanobis" metrics are - also valid with an additional argument. + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + If metric is 'precomputed', X is read as a matrix of pairwise distances. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. 
p : integer, optional (default=2) Parameter for the Minkowski metric from @@ -398,8 +417,11 @@ def _set_reach_dist(self, point_index, X, nbrs): indices, axis=0) # Keep n_jobs = 1 in the following lines...please if len(unproc) > 0: - dists = pairwise_distances(P, np.take(X, unproc, axis=0), - self.metric, n_jobs=None).ravel() + if self.metric == 'precomputed': + dists = X[point_index, unproc] + else: + dists = pairwise_distances(P, np.take(X, unproc, axis=0), + self.metric, n_jobs=None).ravel() rdists = np.maximum(dists, self.core_distances_[point_index]) new_reach = np.minimum(np.take(self.reachability_, unproc), rdists) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 5a89cb7a0c439..7bf864f799f45 100755 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -10,6 +10,7 @@ from sklearn.cluster.optics_ import _TreeNode, _cluster_tree from sklearn.cluster.optics_ import _find_local_maxima from sklearn.metrics.cluster import contingency_matrix +from sklearn.metrics.pairwise import pairwise_distances from sklearn.cluster.dbscan_ import DBSCAN from sklearn.utils.testing import assert_equal, assert_warns from sklearn.utils.testing import assert_array_equal @@ -413,3 +414,26 @@ def test_reach_dists(): else: # we compare to truncated decimals, so use atol assert_allclose(clust.reachability_, np.array(v), atol=1e-5) + + +def test_precomputed_dists(): + rng = np.random.RandomState(0) + n_points_per_cluster = 50 + + C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2) + C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2) + C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2) + C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2) + C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2) + C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2) + X = np.vstack((C1, C2, C3, C4, C5, C6)) + + dists = pairwise_distances(X, metric='euclidean') + clust1 = OPTICS(min_samples=10, algorithm='brute', + 
metric='precomputed').fit(dists) + clust2 = OPTICS(min_samples=10, algorithm='brute', + metric='euclidean').fit(X) + + assert_allclose(clust1.reachability_, clust2.reachability_) + assert_array_equal(clust1.labels_, clust2.labels_) + assert_array_equal(clust1.ordering_, clust2.ordering_) From 414be06f268f6e6209d327aacb8bcf4bd8af4a61 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 6 Sep 2018 17:10:13 +0200 Subject: [PATCH 2/6] sync docstrings --- sklearn/cluster/optics_.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 9686e01299f25..2d167a06c1bae 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -199,11 +199,30 @@ class OPTICS(BaseEstimator, ClusterMixin): shorter run times. metric : string or callable, optional (default='euclidean') - The distance metric to use for neighborhood lookups. Default is - "euclidean". Other options include "minkowski", "manhattan", - "chebyshev", "haversine", "seuclidean", "hamming", "canberra", - and "braycurtis". The "wminkowski" and "mahalanobis" metrics are - also valid with an additional argument. + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + If metric is 'precomputed', X is read as a matrix of pairwise distances. 
+ + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. p : integer, optional (default=2) Parameter for the Minkowski metric from @@ -346,7 +365,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - X = check_array(X, dtype=np.float) + X = check_array(X, dtype=np.float, accept_sparse=True) n_samples = len(X) # Start all points as 'unprocessed' ## From 7df42ffcc73a7d062a2088dd3bbcad1682657528 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 6 Sep 2018 17:15:43 +0200 Subject: [PATCH 3/6] remove sparse --- sklearn/cluster/optics_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 2d167a06c1bae..e1d76493f87ca 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -365,7 +365,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. 
""" - X = check_array(X, dtype=np.float, accept_sparse=True) + X = check_array(X, dtype=np.float) n_samples = len(X) # Start all points as 'unprocessed' ## From b1437c001eecdace437b80f121a18ccfc8e0530f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Sep 2018 12:55:22 +0200 Subject: [PATCH 4/6] put back reading from precomputed matrix --- sklearn/cluster/optics_.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index f884ccc59177c..94ff3935002a6 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -457,8 +457,11 @@ def _set_reach_dist(self, point_index, processed, X, nbrs): # Everything is already processed. Return to main loop return point_index - dists = pairwise_distances(P, np.take(X, unproc, axis=0), - self.metric, n_jobs=1).ravel() + if self.metric == 'precomputed': + dists = X[point_index, unproc] + else: + dists = pairwise_distances(P, np.take(X, unproc, axis=0), + self.metric, n_jobs=None).ravel() rdists = np.maximum(dists, self.core_distances_[point_index]) new_reach = np.minimum(np.take(self.reachability_, unproc), rdists) From 5b510adae91ec5b42bbb6bc02721195c8f167104 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Sep 2018 13:04:53 +0200 Subject: [PATCH 5/6] simplify test --- sklearn/cluster/optics_.py | 2 +- sklearn/cluster/tests/test_optics.py | 17 +++-------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 94ff3935002a6..a5e7510e5a03b 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -369,7 +369,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. 
""" - X = check_array(X, dtype=np.float) + X = check_array(X, dtype=np.float64) n_samples = len(X) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 762505ca4acda..1215746faa4c3 100755 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -440,23 +440,12 @@ def test_reach_dists(): def test_precomputed_dists(): - rng = np.random.RandomState(0) - n_points_per_cluster = 50 - - C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2) - C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2) - C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2) - C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2) - C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2) - C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2) - X = np.vstack((C1, C2, C3, C4, C5, C6)) - - dists = pairwise_distances(X, metric='euclidean') + redX = X[::10] + dists = pairwise_distances(redX, metric='euclidean') clust1 = OPTICS(min_samples=10, algorithm='brute', metric='precomputed').fit(dists) clust2 = OPTICS(min_samples=10, algorithm='brute', - metric='euclidean').fit(X) + metric='euclidean').fit(redX) assert_allclose(clust1.reachability_, clust2.reachability_) assert_array_equal(clust1.labels_, clust2.labels_) - assert_array_equal(clust1.ordering_, clust2.ordering_) From 56cba779d9f0ec16e27d96556e73abbba575f634 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Sep 2018 13:05:29 +0200 Subject: [PATCH 6/6] revert to float --- sklearn/cluster/optics_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index a5e7510e5a03b..94ff3935002a6 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -369,7 +369,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - X = check_array(X, dtype=np.float64) + X = check_array(X, dtype=np.float) n_samples = len(X)