From b32d70f6ca22638325f55d97b919e1dde509a3bd Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 6 Sep 2018 16:52:53 +0200
Subject: [PATCH 1/6] Support precomputed distance matrix

---
 sklearn/cluster/optics_.py           | 36 ++++++++++++++++++++++------
 sklearn/cluster/tests/test_optics.py | 24 +++++++++++++++++++
 2 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index 5c20ddb421845..9686e01299f25 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -52,11 +52,30 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
         shorter run times.
 
     metric : string or callable, optional (default='euclidean')
-        The distance metric to use for neighborhood lookups. Default is
-        "euclidean". Other options include "minkowski", "manhattan",
-        "chebyshev", "haversine", "seuclidean", "hamming", "canberra",
-        and "braycurtis". The "wminkowski" and "mahalanobis" metrics are
-        also valid with an additional argument.
+        Metric to use for distance computation. Any metric from scikit-learn
+        or scipy.spatial.distance can be used.
+
+        If metric is a callable function, it is called on each
+        pair of instances (rows) and the resulting value recorded. The callable
+        should take two arrays as input and return one value indicating the
+        distance between them. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+        If metric is "precomputed", X is assumed to be a distance matrix.
+
+        Valid values for metric are:
+
+        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
+          'manhattan']
+
+        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
+          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
+          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
+          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
+          'yule']
+
+        See the documentation for scipy.spatial.distance for details on these
+        metrics.
 
     p : integer, optional (default=2)
         Parameter for the Minkowski metric from
@@ -398,8 +417,11 @@ def _set_reach_dist(self, point_index, X, nbrs):
                              indices, axis=0)
         # Keep n_jobs = 1 in the following lines...please
         if len(unproc) > 0:
-            dists = pairwise_distances(P, np.take(X, unproc, axis=0),
-                                       self.metric, n_jobs=None).ravel()
+            if self.metric == 'precomputed':
+                dists = X[point_index, unproc]
+            else:
+                dists = pairwise_distances(P, np.take(X, unproc, axis=0),
+                                           self.metric, n_jobs=None).ravel()
 
             rdists = np.maximum(dists, self.core_distances_[point_index])
             new_reach = np.minimum(np.take(self.reachability_, unproc), rdists)
diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 5a89cb7a0c439..7bf864f799f45 100755
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -10,6 +10,7 @@
 from sklearn.cluster.optics_ import _TreeNode, _cluster_tree
 from sklearn.cluster.optics_ import _find_local_maxima
 from sklearn.metrics.cluster import contingency_matrix
+from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.cluster.dbscan_ import DBSCAN
 from sklearn.utils.testing import assert_equal, assert_warns
 from sklearn.utils.testing import assert_array_equal
@@ -413,3 +414,26 @@ def test_reach_dists():
     else:
         # we compare to truncated decimals, so use atol
         assert_allclose(clust.reachability_, np.array(v), atol=1e-5)
+
+
+def test_precomputed_dists():
+    rng = np.random.RandomState(0)
+    n_points_per_cluster = 50
+
+    C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2)
+    C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2)
+    C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2)
+    C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2)
+    C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2)
+    C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2)
+    X = np.vstack((C1, C2, C3, C4, C5, C6))
+
+    dists = pairwise_distances(X, metric='euclidean')
+    clust1 = OPTICS(min_samples=10, algorithm='brute',
+                    metric='precomputed').fit(dists)
+    clust2 = OPTICS(min_samples=10, algorithm='brute',
+                    metric='euclidean').fit(X)
+
+    assert_allclose(clust1.reachability_, clust2.reachability_)
+    assert_array_equal(clust1.labels_, clust2.labels_)
+    assert_array_equal(clust1.ordering_, clust2.ordering_)

From 414be06f268f6e6209d327aacb8bcf4bd8af4a61 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 6 Sep 2018 17:10:13 +0200
Subject: [PATCH 2/6] sync docstrings

---
 sklearn/cluster/optics_.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index 9686e01299f25..2d167a06c1bae 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -199,11 +199,30 @@ class OPTICS(BaseEstimator, ClusterMixin):
         shorter run times.
 
     metric : string or callable, optional (default='euclidean')
-        The distance metric to use for neighborhood lookups. Default is
-        "euclidean". Other options include "minkowski", "manhattan",
-        "chebyshev", "haversine", "seuclidean", "hamming", "canberra",
-        and "braycurtis". The "wminkowski" and "mahalanobis" metrics are
-        also valid with an additional argument.
+        Metric to use for distance computation. Any metric from scikit-learn
+        or scipy.spatial.distance can be used.
+
+        If metric is a callable function, it is called on each
+        pair of instances (rows) and the resulting value recorded. The callable
+        should take two arrays as input and return one value indicating the
+        distance between them. This works for Scipy's metrics, but is less
+        efficient than passing the metric name as a string.
+
+        If metric is "precomputed", X is assumed to be a distance matrix.
+
+        Valid values for metric are:
+
+        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
+          'manhattan']
+
+        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
+          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
+          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
+          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
+          'yule']
+
+        See the documentation for scipy.spatial.distance for details on these
+        metrics.
 
     p : integer, optional (default=2)
         Parameter for the Minkowski metric from
@@ -346,7 +365,7 @@ def fit(self, X, y=None):
         self : instance of OPTICS
             The instance.
         """
-        X = check_array(X, dtype=np.float)
+        X = check_array(X, dtype=np.float, accept_sparse=True)
 
         n_samples = len(X)
         # Start all points as 'unprocessed' ##

From 7df42ffcc73a7d062a2088dd3bbcad1682657528 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 6 Sep 2018 17:15:43 +0200
Subject: [PATCH 3/6] remove sparse

---
 sklearn/cluster/optics_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index 2d167a06c1bae..e1d76493f87ca 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -365,7 +365,7 @@ def fit(self, X, y=None):
         self : instance of OPTICS
             The instance.
         """
-        X = check_array(X, dtype=np.float, accept_sparse=True)
+        X = check_array(X, dtype=np.float)
 
         n_samples = len(X)
         # Start all points as 'unprocessed' ##

From b1437c001eecdace437b80f121a18ccfc8e0530f Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 10 Sep 2018 12:55:22 +0200
Subject: [PATCH 4/6] put back reading from precomputed matrix

---
 sklearn/cluster/optics_.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index f884ccc59177c..94ff3935002a6 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -457,8 +457,11 @@ def _set_reach_dist(self, point_index, processed, X, nbrs):
             # Everything is already processed. Return to main loop
             return point_index
 
-        dists = pairwise_distances(P, np.take(X, unproc, axis=0),
-                                   self.metric, n_jobs=1).ravel()
+        if self.metric == 'precomputed':
+            dists = X[point_index, unproc]
+        else:
+            dists = pairwise_distances(P, np.take(X, unproc, axis=0),
+                                       self.metric, n_jobs=None).ravel()
 
         rdists = np.maximum(dists, self.core_distances_[point_index])
         new_reach = np.minimum(np.take(self.reachability_, unproc), rdists)

From 5b510adae91ec5b42bbb6bc02721195c8f167104 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 10 Sep 2018 13:04:53 +0200
Subject: [PATCH 5/6] simplify test

---
 sklearn/cluster/optics_.py           |  2 +-
 sklearn/cluster/tests/test_optics.py | 17 +++--------------
 2 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index 94ff3935002a6..a5e7510e5a03b 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -369,7 +369,7 @@ def fit(self, X, y=None):
         self : instance of OPTICS
             The instance.
         """
-        X = check_array(X, dtype=np.float)
+        X = check_array(X, dtype=np.float64)
 
         n_samples = len(X)
 
diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 762505ca4acda..1215746faa4c3 100755
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -440,23 +440,12 @@ def test_reach_dists():
 
 
 def test_precomputed_dists():
-    rng = np.random.RandomState(0)
-    n_points_per_cluster = 50
-
-    C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2)
-    C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2)
-    C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2)
-    C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2)
-    C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2)
-    C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2)
-    X = np.vstack((C1, C2, C3, C4, C5, C6))
-
-    dists = pairwise_distances(X, metric='euclidean')
+    redX = X[::10]
+    dists = pairwise_distances(redX, metric='euclidean')
     clust1 = OPTICS(min_samples=10, algorithm='brute',
                     metric='precomputed').fit(dists)
     clust2 = OPTICS(min_samples=10, algorithm='brute',
-                    metric='euclidean').fit(X)
+                    metric='euclidean').fit(redX)
 
     assert_allclose(clust1.reachability_, clust2.reachability_)
     assert_array_equal(clust1.labels_, clust2.labels_)
-    assert_array_equal(clust1.ordering_, clust2.ordering_)

From 56cba779d9f0ec16e27d96556e73abbba575f634 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 10 Sep 2018 13:05:29 +0200
Subject: [PATCH 6/6] revert to float

---
 sklearn/cluster/optics_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index a5e7510e5a03b..94ff3935002a6 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -369,7 +369,7 @@ def fit(self, X, y=None):
         self : instance of OPTICS
             The instance.
         """
-        X = check_array(X, dtype=np.float64)
+        X = check_array(X, dtype=np.float)
 
         n_samples = len(X)