scikit-learn · adrinjalali · Jul 26, 2018 · Aug 19, 2018 · Aug 19, 2018 · Aug 19, 2018
diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
@@ -276,6 +276,19 @@ class OPTICS(BaseEstimator, ClusterMixin):
         Distance at which each sample becomes a core point.
         Points which will never be core have a distance of inf.
 
+    Examples
+    --------
+    >>> from sklearn.cluster import OPTICS
+    >>> import numpy as np
+    >>> np.random.seed(0)
+    >>> n_points_per_cluster = 3
+    >>> C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
+    >>> C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
+    >>> X = np.vstack((C1, np.array([[100, 200]]), C2))
+    >>> clust = OPTICS(min_samples=2).fit(X)
+    >>> clust.labels_
+    array([ 0,  0,  0, -1,  1,  1,  1])
+
     See also
     --------
 
@@ -374,6 +387,7 @@ def fit(self, X, y=None):
 
         indices_, self.labels_ = _extract_optics(self.ordering_,
                                                  self.reachability_,
+                                                 self.core_distances_,
                                                  self.maxima_ratio,
                                                  self.rejection_ratio,
                                                  self.similarity_threshold,
@@ -505,7 +519,7 @@ def _extract_dbscan(ordering, core_distances, reachability, eps):
     return np.arange(n_samples)[is_core], labels
 
 
-def _extract_optics(ordering, reachability, maxima_ratio=.75,
+def _extract_optics(ordering, reachability, core_distances, maxima_ratio=.75,
                     rejection_ratio=.7, similarity_threshold=0.4,
                     significant_min=.003, min_cluster_size=.005,
                     min_maxima_ratio=0.001):
@@ -519,6 +533,9 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75,
     reachability : array, shape (n_samples,)
         Reachability distances calculated by OPTICS (`reachability_`)
 
+    core_distances : array, shape (n_samples,)
+        Core distances calculated by OPTICS (`core_distances_`)
+
     maxima_ratio : float, optional
         The maximum ratio we allow of average height of clusters on the
         right and left to the local maxima in question. The higher the
@@ -563,9 +580,18 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75,
     """
 
     # Extraction wrapper
-    reachability = reachability / np.max(reachability[1:])
+    # according to the paper (p. 5), for a small enough generative distance
+    # epsilong, there should be more than one INF.
+    if np.all(np.isinf(reachability)):
+        raise ValueError("All reachability values are inf. Set a larger"
+                         " max_eps.")
+    normalization_factor = np.max(reachability[reachability < np.inf])
+    reachability = reachability / normalization_factor
     reachability_plot = reachability[ordering].tolist()
-    root_node = _automatic_cluster(reachability_plot, ordering,
+    core_distances = core_distances / normalization_factor
+    core_distances_plot = core_distances[ordering].tolist()
+    root_node = _automatic_cluster(reachability_plot, core_distances_plot,
+                                   ordering,
                                    maxima_ratio, rejection_ratio,
                                    similarity_threshold, significant_min,
                                    min_cluster_size, min_maxima_ratio)
@@ -581,10 +607,18 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75,
         labels[index] = clustid
         is_core[index] = 1
         clustid += 1
+
+    # check if the last point in the ordering is a NOISE
+    last_point = ordering[-1]
+    if (core_distances_plot[-1] * 5 >= reachability_plot[-1]
+            and reachability_plot[-1] > reachability_plot[-2] * 5):
+        labels[last_point] = -1
+        is_core[last_point] = 0
+
     return np.arange(n_samples)[is_core], labels
 
 
-def _automatic_cluster(reachability_plot, ordering,
+def _automatic_cluster(reachability_plot, core_distances_plot, ordering,
                        maxima_ratio, rejection_ratio,
                        similarity_threshold, significant_min,
                        min_cluster_size, min_maxima_ratio):
@@ -611,7 +645,8 @@ def _automatic_cluster(reachability_plot, ordering,
                                              neighborhood_size)
     root_node = _TreeNode(ordering, 0, len(ordering), None)
     _cluster_tree(root_node, None, local_maxima_points,
-                  reachability_plot, ordering, min_cluster_size,
+                  reachability_plot, core_distances_plot,
+                  ordering, min_cluster_size,
                   maxima_ratio, rejection_ratio,
                   similarity_threshold, significant_min)
 
@@ -644,7 +679,8 @@ def _find_local_maxima(reachability_plot, neighborhood_size):
         # if the point is a local maxima on the reachability plot with
         # regard to neighborhood_size, insert it into priority queue and
         # maxima list
-        if (reachability_plot[i] > reachability_plot[i - 1] and
+        # all inf values are local maxima.
+        if (reachability_plot[i] >= reachability_plot[i - 1] and
             reachability_plot[i] >= reachability_plot[i + 1] and
             _is_local_maxima(i, np.array(reachability_plot),
                              neighborhood_size) == 1):
@@ -655,7 +691,7 @@ def _find_local_maxima(reachability_plot, neighborhood_size):
 
 
 def _cluster_tree(node, parent_node, local_maxima_points,
-                  reachability_plot, reachability_ordering,
+                  reachability_plot, core_distances_plot, ordering,
                   min_cluster_size, maxima_ratio, rejection_ratio,
                   similarity_threshold, significant_min):
     """Recursively builds cluster tree to hold hierarchical cluster structure
@@ -665,7 +701,6 @@ def _cluster_tree(node, parent_node, local_maxima_points,
     local_maxima_points is list of local maxima points sorted in
     descending order of reachability
     """
-
     if len(local_maxima_points) == 0:
         return  # parent_node is a leaf
 
@@ -675,10 +710,13 @@ def _cluster_tree(node, parent_node, local_maxima_points,
     local_maxima_points = local_maxima_points[1:]
 
     # create two new nodes and add to list of nodes
-    node_1 = _TreeNode(reachability_ordering[node.start:s],
+    node_1 = _TreeNode(ordering[node.start:s],
                        node.start, s, node)
-    node_2 = _TreeNode(reachability_ordering[s + 1:node.end],
-                       s + 1, node.end, node)
+    node_2_start = s + 1
+    if core_distances_plot[s] * 5 < reachability_plot[s]:
+        node_2_start = s
+    node_2 = _TreeNode(ordering[node_2_start:node.end],
+                       node_2_start, node.end, node)
     local_max_1 = []
     local_max_2 = []
 
@@ -700,6 +738,8 @@ def _cluster_tree(node, parent_node, local_maxima_points,
     # only check a certain ratio of points in the child
     # nodes formed to the left and right of the maxima
     # ...should check_ratio be a user settable parameter?
+    # NOTE: based on the description of maxima_ratio I'd guess this value
+    # should be 1, why is it less than 1?
     check_ratio = .8
     check_value_1 = int(np.round(check_ratio * len(node_1.points)))
     check_value_2 = int(np.round(check_ratio * len(node_2.points)))
@@ -708,22 +748,22 @@ def _cluster_tree(node, parent_node, local_maxima_points,
     avg_reach2 = np.mean(reachability_plot[node_2.start:(node_2.start
                                                          + check_value_2)])
 
-    if ((avg_reach1 / reachability_plot[s]) > maxima_ratio or
-            (avg_reach2 / reachability_plot[s]) > maxima_ratio):
+    if ((avg_reach1 / maxima_ratio) > reachability_plot[s] or
+            (avg_reach2 / maxima_ratio) > reachability_plot[s]):
 
-        if (avg_reach1 / reachability_plot[s]) < rejection_ratio:
        if (avg_reach1 / rejection_ratio) < reachability_plot[s]:
             # reject node 2
             node_list.remove((node_2, local_max_2))
-        if (avg_reach2 / reachability_plot[s]) < rejection_ratio:
+        if (avg_reach2 / rejection_ratio) < reachability_plot[s]:
             # reject node 1
             node_list.remove((node_1, local_max_1))
-        if ((avg_reach1 / reachability_plot[s]) >= rejection_ratio and
-                (avg_reach2 / reachability_plot[s]) >= rejection_ratio):
+        if ((avg_reach1 / rejection_ratio) >= reachability_plot[s] and
+                (avg_reach2 / rejection_ratio) >= reachability_plot[s]):
             # since split_point is not significant,
             # ignore this split and continue (reject both child nodes)
             node.split_point = -1
             _cluster_tree(node, parent_node, local_maxima_points,
-                          reachability_plot, reachability_ordering,
+                          reachability_plot, core_distances_plot, ordering,
                           min_cluster_size, maxima_ratio, rejection_ratio,
                           similarity_threshold, significant_min)
             return
@@ -756,13 +796,13 @@ def _cluster_tree(node, parent_node, local_maxima_points,
         if bypass_node == 1:
             parent_node.children.append(nl[0])
             _cluster_tree(nl[0], parent_node, nl[1],
-                          reachability_plot, reachability_ordering,
+                          reachability_plot, core_distances_plot, ordering,
                           min_cluster_size, maxima_ratio, rejection_ratio,
                           similarity_threshold, significant_min)
         else:
             node.children.append(nl[0])
             _cluster_tree(nl[0], node, nl[1], reachability_plot,
-                          reachability_ordering, min_cluster_size,
+                          core_distances_plot, ordering, min_cluster_size,
                           maxima_ratio, rejection_ratio,
                           similarity_threshold, significant_min)
 

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
@@ -87,17 +87,27 @@ def test_empty_extract():
 
 def test_bad_extract():
     # Test an extraction of eps too close to original eps
-    msg = "Specify an epsilon smaller than 0.015. Got 0.3."
+    msg = "Specify an epsilon smaller than 0.15. Got 0.3."
     centers = [[1, 1], [-1, -1], [1, -1]]
     X, labels_true = make_blobs(n_samples=750, centers=centers,
                                 cluster_std=0.4, random_state=0)
 
     # Compute OPTICS
-    clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10)
+    clust = OPTICS(max_eps=5.0 * 0.03, min_samples=10)
     clust2 = clust.fit(X)
     assert_raise_message(ValueError, msg, clust2.extract_dbscan, 0.3)
 
 
+def test_bad_reachability():
+    msg = "All reachability values are inf. Set a larger max_eps."
+    centers = [[1, 1], [-1, -1], [1, -1]]
+    X, labels_true = make_blobs(n_samples=750, centers=centers,
+                                cluster_std=0.4, random_state=0)
+
+    clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10)
+    assert_raise_message(ValueError, msg, clust.fit, X)
+
+
 def test_close_extract():
     # Test extract where extraction eps is close to scaled epsPrime
 
@@ -173,18 +183,76 @@ def test_min_cluster_size_invalid(min_cluster_size):
         clust.fit(X)
 
 
+def test_auto_extract_no_outlier():
+    np.random.seed(0)
+    n_points_per_cluster = 100
+    C1 = [-50, -20] + .08 * np.random.randn(n_points_per_cluster, 2)
+    C2 = [40, -10] + .01 * np.random.randn(n_points_per_cluster, 2)
+    C3 = [10, -20] + .02 * np.random.randn(n_points_per_cluster, 2)
+    C4 = [-20, 30] + .03 * np.random.randn(n_points_per_cluster, 2)
+    C5 = [30, -20] + .06 * np.random.randn(n_points_per_cluster, 2)
+    C6 = [50, 60] + .02 * np.random.randn(n_points_per_cluster, 2)
+    X = np.vstack((C1, C2, C3, C4, C5, C6))
+
+    clust = OPTICS(min_samples=4)
+
+    clust.fit(X)
+
+    # there should be no outliers detected
+    assert_equal(set(clust.labels_), set([0, 1, 2, 3, 4, 5]))
+
+
+def test_multi_inf_in_reachability():
+    np.random.seed(0)
+    n_points_per_cluster = 3
+
+    C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
+    C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
+    X = np.vstack((C1, np.array([[100, 200], [200, 300]]), C2))
+
+    clust = OPTICS(min_samples=2, max_eps=2).fit(X)
+
+    assert_array_equal(clust.labels_, np.r_[[0] * 3, -1, -1, [1] * 3])
+    assert_array_equal(clust.reachability_[[0, 3, 4, 5]], [np.inf] * 4)
+
+
+def test_auto_extract_outlier():
+    np.random.seed(0)
+    n_points_per_cluster = 4
+
+    C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
+    C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
+    X = np.vstack((C1, C2, np.array([[100, 200]])))
+
+    clust = OPTICS(min_samples=3).fit(X)
+
+    assert_array_equal(clust.labels_, np.r_[[0] * 4, [1] * 4, -1])
+
+    C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
+    C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
+    X = np.vstack((C1, np.array([[100, 200], [200, 300]]), C2))
+
+    clust = OPTICS(min_samples=3).fit(X)
+
+    assert_array_equal(clust.labels_, np.r_[[0] * 4, -1, -1, [1] * 4])
+
+
 def test_min_cluster_size_invalid2():
     clust = OPTICS(min_cluster_size=len(X) + 1)
     with pytest.raises(ValueError, match="must be no greater than the "):
         clust.fit(X)
 
 
-@pytest.mark.parametrize("reach, n_child, members", [
+@pytest.mark.parametrize("reach, core_dist, n_child, members", [
     (np.array([np.inf, 0.9, 0.9, 1.0, 0.89, 0.88, 10, .9, .9, .9, 10, 0.9,
-               0.9, 0.89, 0.88, 10, .9, .9, .9, .9]), 2, np.r_[0:6]),
+               0.9, 0.89, 0.88, 10, .9, .9, .9, .9]),
+     np.array([np.inf, 0.9, 0.9, 1.0, 0.89, 0.88, 1, .9, .9, .9, 1, 0.9,
+               0.9, 0.89, 0.88, 1, .9, .9, .9, .9]), 2, np.r_[0:6]),
     (np.array([np.inf, 0.9, 0.9, 0.9, 0.89, 0.88, 10, .9, .9, .9, 10, 0.9,
-               0.9, 0.89, 0.88, 100, .9, .9, .9, .9]), 1, np.r_[0:15])])
-def test_cluster_sigmin_pruning(reach, n_child, members):
+               0.9, 0.89, 0.88, 100, .9, .9, .9, .9]),
+     np.array([np.inf, 0.9, 0.9, 0.9, 0.89, 0.88, 1, .9, .9, .9, 1, 0.9,
+               0.9, 0.89, 0.88, 1, .9, .9, .9, .9]), 1, np.r_[0:15])])
+def test_cluster_sigmin_pruning(reach, core_dist, n_child, members):
     # Tests pruning left and right, insignificant splitpoints, empty nodelists
     # Parameters chosen specifically for this task
 
@@ -198,7 +266,7 @@ def test_cluster_sigmin_pruning(reach, n_child, members):
     root = _TreeNode(ordering, 0, 20, None)
 
     # Build cluster tree inplace on root node
-    _cluster_tree(root, None, cluster_boundaries, reach, ordering,
+    _cluster_tree(root, None, cluster_boundaries, reach, core_dist, ordering,
                   5, .75, .7, .4, .3)
     assert_equal(root.split_point, cluster_boundaries[0])
     assert_equal(n_child, len(root.children))