adding hierarchical extraction function

espg · espg · commit e52cc1edc509 · 2015-10-19T12:18:44.000-06:00
Added hierarchical extraction from scikit-learn#2043. scikit-learn#2043 is BSD licensed and hasn’t had any activity for 10 months, so this seems pretty kosher; authors are cited in the code.
diff --git a/sklearn/cluster/optics.py b/sklearn/cluster/optics.py
@@ -11,6 +11,7 @@
 # Imports #
 
 import scipy as sp
+import numpy as np
 from ..utils import check_array
 from sklearn.neighbors import BallTree
 from sklearn.base import BaseEstimator, ClusterMixin
@@ -284,3 +285,132 @@ def _ExtractDBSCAN(setofobjects, epsilon_prime):
             else:
                 # Zero (i.e., 'False') for non-core, non-noise points #
                 setofobjects._is_core[entry] = 0
+"""
+Algorithms for extracting clusters from a reachability plot.
+"""
+
+# Author: Fredrik Appelros (fredrik.appelros@gmail.com), Carl Ekerot (kalle@implode.se)
+# License: BSD
+
+
+def hierarchical_extraction(ordering, reachability_distances, min_cluster_size,
+        significant_ratio=0.75, similarity_ratio=0.4, min_reach_ratio=0.1):
+    """Extract clusters from an OPTICS reachability plot.
+
+    Constructs a tree structure from an OPTICS ordering and a set of
+    reachability distances and extracts clusters from this structure.
+
+    Parameters
+    ----------
+    ordering : array [n_samples]
+        Indices of the samples in the order generated by OPTICS.
+    reachability_distances : array [n_samples]
+        Reachability distance for each sample, indexed by sample.
+    min_cluster_size : int
+        The minimum size of a cluster in number of samples. Also used as
+        the half-width of the window in which a point must be the
+        (first) largest value to count as a local maximum.
+    significant_ratio : float
+        The ratio for the reachability score of a local maximum
+        compared to its neighbors to be considered significant.
+    similarity_ratio : float
+        The ratio for the reachability score of a split point
+        compared to the parent split point for it to be considered
+        similar.
+    min_reach_ratio : float
+        The ratio of the largest reachability score that a local
+        maximum needs to reach in order to be considered.
+
+    Returns
+    -------
+    labels : array [n_samples]
+        Cluster labels for each point, stored as floats. Noisy samples
+        are given the label -1. An empty input yields an empty array.
+
+    References
+    ----------
+    Sander, Jörg, Xuejie Qin, Zhiyong Lu, Nan Niu, and Alex Kovarsky.
+    "Automatic extraction of clusters from hierarchical clustering
+    representations." Advances in Knowledge Discovery and Data Mining (2003):
+    567-567.
+    """
+    n = len(ordering)
+    labels = -np.ones(n)
+    if n == 0:
+        # Nothing to cluster; np.argmax would raise on an empty window.
+        return labels
+
+    # Reachability values arranged in OPTICS order (the reachability plot).
+    R = np.asarray([reachability_distances[i] for i in ordering])
+
+    # Find local maxima: position i qualifies when it holds the first
+    # largest value of the window [i - min_cluster_size, i + min_cluster_size]
+    # clipped to the plot. The clipped window start is `lo`, so the candidate
+    # sits at offset `i - lo` inside the window; this holds at both borders
+    # and avoids negative slice starts when n < 2 * min_cluster_size.
+    maxima = []
+    for i in range(n):
+        lo = max(0, i - min_cluster_size)
+        hi = min(n, i + min_cluster_size + 1)
+        if np.argmax(R[lo:hi]) == i - lo:
+            maxima.append(i)
+    # Sort local maxima by ascending reachability so that pop() always
+    # yields the largest remaining one (ties keep positional order).
+    maxima.sort(key=lambda x: R[x])
+    R_max = R[maxima[-1]]
+    # A real list (not a lazy filter object) is required: it is popped from
+    # and tested for emptiness below.
+    maxima = [x for x in maxima if R[x] >= min_reach_ratio * R_max]
+
+    class Node:
+        # Half-open interval [left, right) of positions in the ordering.
+        def __init__(self, left, right):
+            self.left = left
+            self.right = right
+            # Position of the most recently tried split point, if any.
+            self.split = None
+            self.children = []
+
+    leaves = []
+
+    def cluster_tree(node, parent, node_maxima):
+        # Try the remaining maxima from largest to smallest until one is a
+        # significant split. Iterating here (instead of recursing on the
+        # same node) keeps the recursion depth bounded by the tree depth.
+        while node_maxima:
+            s = node.split = node_maxima.pop()
+            child_left = Node(node.left, s)
+            child_right = Node(s, node.right)
+            R_left = R[child_left.left:child_left.right]
+            R_right = R[child_right.left:child_right.right]
+
+            avg_reach_left = np.mean(R_left) if R_left.size > 0 else 0
+            avg_reach_right = np.mean(R_right) if R_right.size > 0 else 0
+
+            # The split is significant only if both sides are, on average,
+            # sufficiently lower than the split point itself.
+            threshold = significant_ratio * R[s]
+            if not (avg_reach_left <= threshold
+                    and avg_reach_right <= threshold):
+                continue
+
+            maxima_left = [x for x in node_maxima if x < s]
+            maxima_right = [x for x in node_maxima if x > s]
+
+            # Keep a child if it is large enough, or if it touches the
+            # corresponding border of the reachability plot.
+            children = []
+            left_size = child_left.right - child_left.left
+            if left_size >= min_cluster_size or left_size == child_left.right:
+                children.append((child_left, maxima_left))
+            right_size = child_right.right - child_right.left
+            if (right_size >= min_cluster_size
+                    or right_size == n - child_right.left):
+                children.append((child_right, maxima_right))
+            if not children:
+                leaves.append(node)
+                return
+
+            if (parent is not None
+                    and R[s] / R[parent.split] >= similarity_ratio):
+                # The split is about as high as the parent's split: attach
+                # the children to the parent in place of this node.
+                for child, _ in children:
+                    parent.children.append(child)
+                parent.children.remove(node)
+                p = parent
+            else:
+                for child, _ in children:
+                    node.children.append(child)
+                p = node
+            for child, child_maxima in children:
+                cluster_tree(child, p, child_maxima)
+            return
+
+        # No (significant) split point left: this node is a cluster.
+        leaves.append(node)
+
+    root = Node(0, n)
+    cluster_tree(root, None, maxima)
+
+    # Leaves are numbered in the order they were discovered; positions not
+    # covered by any leaf keep the noise label -1.
+    for (i, leaf) in enumerate(leaves):
+        for j in range(leaf.left, leaf.right):
+            labels[ordering[j]] = i
+
+    return labels
+
+# Registry mapping an extraction method name to the function implementing it.
+EXTRACTION_FUNCTIONS = dict(hierarchical=hierarchical_extraction)
+