8000 adding hierarchical extraction function · espg/scikit-learn@e52cc1e · GitHub
[go: up one dir, main page]

Skip to content

Commit e52cc1e

Browse files
committed
adding hierarchical extraction function
Added hierarchical extraction from scikit-learn#2043 scikit-learn#2043 is BSD licensed and hasn’t had any activity for 10 months, so this seems pretty kosher; authors are cited in the code
1 parent 3dd2d29 commit e52cc1e

File tree

1 file changed

+130
-0
lines changed

1 file changed

+130
-0
lines changed

sklearn/cluster/optics.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# Imports #
1212

1313
import scipy as sp
14+
import numpy as np
1415
from ..utils import check_array
1516
from sklearn.neighbors import BallTree
1617
from sklearn.base import BaseEstimator, ClusterMixin
@@ -284,3 +285,132 @@ def _ExtractDBSCAN(setofobjects, epsilon_prime):
284285
else:
285286
# Zero (i.e., 'False') for non-core, non-noise points #
286287
setofobjects._is_core[entry] = 0
288+
"""
289+
Algorithms for extracting clusters from a reachability plot.
290+
"""
291+
292+
# Author: Fredrik Appelros (fredrik.appelros@gmail.com), Carl Ekerot (kalle@implode.se)
293+
# License: BSD
294+
295+
296+
def hierarchical_extraction(ordering, reachability_distances, min_cluster_size,
297+
significant_ratio=0.75, similarity_ratio=0.4, min_reach_ratio=0.1):
298+
"""
299+
Constructs a tree structure from an OPTICS ordering and a set of
300+
reachability distances and extracts clusters from this structure.
301+
Parameters
302+
----------
303+
ordering : array [n_samples]
304+
Indices of the samples in the order generated by OPTICS.
305+
reachability_distances : array [n_samples]
306+
Reachability distance for each sample.
307+
min_cluster_size : int
308+
The minimum size of a cluster in number of samples.
309+
significant_ratio : float
310+
The ratio for the reachability score of a local maximum
311+
compared to its neighbors to be considered significant.
312+
similarity_ratio : float
313+
The ratio for the reachability score of a split point
314+
compared to the parent split point for it to be considered
315+
similar.
316+
min_reach_ratio : float
317+
The ratio of the largest reachability score that a local
318+
maximum needs to reach in order to be considered.
319+
Returns
320+
-------
321+
labels : array [n_samples]
322+
Cluster labels for each point. Noisy samples are given the label -1.
323+
References
324+
----------
325+
Sander, Jörg, Xuejie Qin, Zhiyong Lu, Nan Niu, and Alex Kovarsky.
326+
"Automatic extraction of clusters from hierarchical clustering
327+
representations." Advances in Knowledge Discovery and Data Mining (2003):
328+
567-567.
329+
"""
330+
R = np.asarray([reachability_distances[i] for i in ordering])
331+
n = len(ordering)
332+
333+
# Find local maximas
334+
L = []
335+
for i in xrange(0, min_cluster_size):
336+
if np.argmax(R[0:i + min_cluster_size + 1]) == i:
337+
L.append(i)
338+
if np.argmax(R[n - 2 * min_cluster_size + i:n]) == i:
339+
L.append(n - min_cluster_size + i)
340+
for i in xrange(min_cluster_size, n - min_cluster_size):
341+
if np.argmax(R[i - min_cluster_size:i + min_cluster_size + 1]) == min_cluster_size:
342+
L.append(i)
343+
# Sort local maximas in order of their reachability
344+
L.sort(key=lambda x: R[x])
345+
R_max = R[L[-1]]
346+
L = filter(lambda x: R[x] >= min_reach_ratio * R_max, L)
347+
348+
class Node:
349+
def __init__(self, left, right):
350+
self.left = left
351+
self.right = right
352+
self.children = []
353+
354+
leaves = []
355+
def cluster_tree(node, parent, L):
356+
if not L:
357+
leaves.append(node)
358+
return
359+
360+
s = node.split = L.pop()
361+
child_left = Node(node.left, s)
362+
child_right = Node(s, node.right)
363+
L_left = [L[i] for i in np.where(np.asarray(L) < s)[0]]
364+
L_right = [L[i] for i in np.where(np.asarray(L) > s)[0]]
365+
R_left = R[child_left.left:child_left.right]
366+
R_right = R[child_right.left:child_right.right]
367+
368+
if R_left.size > 0:
369+
avg_reach_left = np.mean(R_left)
370+
else:
371+
avg_reach_left = 0
372+
if R_right.size > 0:
373+
avg_reach_right = np.mean(R_right)
374+
else:
375+
avg_reach_right = 0
376+
377+
if avg_reach_left <= significant_ratio * R[s] >= avg_reach_right:
378+
children = []
379+
left_size = child_left.right - child_left.left
380+
if left_size >= min_cluster_size or left_size == child_left.right:
381+
children.append((child_left, L_left))
382+
right_size = child_right.right - child_right.left
383+
if right_size >= min_cluster_size or right_size == n - child_right.left:
384+
children.append((child_right, L_right))
385+
if not children:
386+
leaves.append(node)
387+
return
388+
389+
if parent and R[s] / R[parent.split] >= similarity_ratio:
390+
for child, L in children:
391+
parent.children.append(child)
392+
parent.children.remove(node)
393+
p = parent
394+
else:
395+
for child, L in children:
396+
node.children.append(child)
397+
p = node
398+
for (child, L) in children:
399+
cluster_tree(child, p, L)
400+
else:
401+
cluster_tree(node, parent, L)
402+
403+
root = Node(0, n)
404+
cluster_tree(root, None, L)
405+
406+
labels = -np.ones(n)
407+
for (i, leaf) in enumerate(leaves):
408+
for j in xrange(leaf.left, leaf.right):
409+
labels[ordering[j]] = i
410+
411+
return labels
412+
413+
EXTRACTION_FUNCTIONS = {
414+
'hierarchical': hierarchical_extraction,
415+
}
416+

0 commit comments

Comments
 (0)
0