-
-
Notifications
You must be signed in to change notification settings - Fork 26k
[WIP] fixes OPTICS split_points detected as NOISE and last point not detected as outlier. #11857
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7dbea25
a0a2d4e
18d5f9e
6dbea77
53a933e
9728b96
c6dd608
e314b65
3b9cbfe
879178c
d52669b
1505879
67ade15
8ff84fd
078afb3
a2cf913
15d9d7f
29a1f0f
1c683a0
d1ce12d
8bb29cf
748f8c2
c40306f
846979f
e654ea6
7ab9dcd
52edc47
58d278c
9d9a4e4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -276,6 +276,19 @@ class OPTICS(BaseEstimator, ClusterMixin): | |
Distance at which each sample becomes a core point. | ||
Points which will never be core have a distance of inf. | ||
|
||
Examples | ||
-------- | ||
>>> from sklearn.cluster import OPTICS | ||
>>> import numpy as np | ||
>>> np.random.seed(0) | ||
>>> n_points_per_cluster = 3 | ||
>>> C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) | ||
>>> C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) | ||
>>> X = np.vstack((C1, np.array([[100, 200]]), C2)) | ||
>>> clust = OPTICS(min_samples=2).fit(X) | ||
>>> clust.labels_ | ||
array([ 0, 0, 0, -1, 1, 1, 1]) | ||
|
||
See also | ||
-------- | ||
|
||
|
@@ -374,6 +387,7 @@ def fit(self, X, y=None): | |
|
||
indices_, self.labels_ = _extract_optics(self.ordering_, | ||
self.reachability_, | ||
self.core_distances_, | ||
self.maxima_ratio, | ||
self.rejection_ratio, | ||
self.similarity_threshold, | ||
|
@@ -505,7 +519,7 @@ def _extract_dbscan(ordering, core_distances, reachability, eps): | |
return np.arange(n_samples)[is_core], labels | ||
|
||
|
||
def _extract_optics(ordering, reachability, maxima_ratio=.75, | ||
def _extract_optics(ordering, reachability, core_distances, maxima_ratio=.75, | ||
rejection_ratio=.7, similarity_threshold=0.4, | ||
significant_min=.003, min_cluster_size=.005, | ||
min_maxima_ratio=0.001): | ||
|
@@ -519,6 +533,9 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, | |
reachability : array, shape (n_samples,) | ||
Reachability distances calculated by OPTICS (`reachability_`) | ||
|
||
core_distances : array, shape (n_samples,) | ||
Core distances calculated by OPTICS (`core_distances_`) | ||
|
||
maxima_ratio : float, optional | ||
The maximum ratio we allow of average height of clusters on the | ||
right and left to the local maxima in question. The higher the | ||
|
@@ -563,9 +580,18 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, | |
""" | ||
|
||
# Extraction wrapper | ||
reachability = reachability / np.max(reachability[1:]) | ||
# according to the paper (p. 5), for a small enough generative distance | ||
# epsilong, there should be more than one INF. | ||
if np.all(np.isinf(reachability)): | ||
raise ValueError("All reachability values are inf. Set a larger" | ||
" max_eps.") | ||
normalization_factor = np.max(reachability[reachability < np.inf]) | ||
reachability = reachability / normalization_factor | ||
reachability_plot = reachability[ordering].tolist() | ||
root_node = _automatic_cluster(reachability_plot, ordering, | ||
core_distances = core_distances / normalization_factor | ||
core_distances_plot = core_distances[ordering].tolist() | ||
root_node = _automatic_cluster(reachability_plot, core_distances_plot, | ||
ordering, | ||
maxima_ratio, rejection_ratio, | ||
similarity_threshold, significant_min, | ||
min_cluster_size, min_maxima_ratio) | ||
8000
|
@@ -581,10 +607,18 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, | |
labels[index] = clustid | ||
is_core[index] = 1 | ||
clustid += 1 | ||
|
||
# check if the last point in the ordering is a NOISE | ||
last_point = ordering[-1] | ||
if (core_distances_plot[-1] * 5 >= reachability_plot[-1] | ||
and reachability_plot[-1] > reachability_plot[-2] * 5): | ||
labels[last_point] = -1 | ||
is_core[last_point] = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is regarding having an outlier in the last position of |
||
|
||
return np.arange(n_samples)[is_core], labels | ||
|
||
|
||
def _automatic_cluster(reachability_plot, ordering, | ||
def _automatic_cluster(reachability_plot, core_distances_plot, ordering, | ||
maxima_ratio, rejection_ratio, | ||
similarity_threshold, significant_min, | ||
min_cluster_size, min_maxima_ratio): | ||
|
@@ -611,7 +645,8 @@ def _automatic_cluster(reachability_plot, ordering, | |
neighborhood_size) | ||
root_node = _TreeNode(ordering, 0, len(ordering), None) | ||
_cluster_tree(root_node, None, local_maxima_points, | ||
reachability_plot, ordering, min_cluster_size, | ||
reachability_plot, core_distances_plot, | ||
ordering, min_cluster_size, | ||
maxima_ratio, rejection_ratio, | ||
similarity_threshold, significant_min) | ||
|
||
|
@@ -644,7 +679,8 @@ def _find_local_maxima(reachability_plot, neighborhood_size): | |
# if the point is a local maxima on the reachability plot with | ||
# regard to neighborhood_size, insert it into priority queue and | ||
# maxima list | ||
if (reachability_plot[i] > reachability_plot[i - 1] and | ||
# all inf values are local maxima. | ||
if (reachability_plot[i] >= reachability_plot[i - 1] and | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this seems fine to me |
||
reachability_plot[i] >= reachability_plot[i + 1] and | ||
_is_local_maxima(i, np.array(reachability_plot), | ||
neighborhood_size) == 1): | ||
|
@@ -655,7 +691,7 @@ def _find_local_maxima(reachability_plot, neighborhood_size): | |
|
||
|
||
def _cluster_tree(node, parent_node, local_maxima_points, | ||
reachability_plot, reachability_ordering, | ||
reachability_plot, core_distances_plot, ordering, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed |
||
min_cluster_size, maxima_ratio, rejection_ratio, | ||
similarity_threshold, significant_min): | ||
"""Recursively builds cluster tree to hold hierarchical cluster structure | ||
|
@@ -665,7 +701,6 @@ def _cluster_tree(node, parent_node, local_maxima_points, | |
local_maxima_points is list of local maxima points sorted in | ||
descending order of reachability | ||
""" | ||
|
||
if len(local_maxima_points) == 0: | ||
return # parent_node is a leaf | ||
|
||
|
@@ -675,10 +710,13 @@ def _cluster_tree(node, parent_node, local_maxima_points, | |
local_maxima_points = local_maxima_points[1:] | ||
|
||
# create two new nodes and add to list of nodes | ||
node_1 = _TreeNode(reachability_ordering[node.start:s], | ||
node_1 = _TreeNode(ordering[node.start:s], | ||
node.start, s, node) | ||
node_2 = _TreeNode(reachability_ordering[s + 1:node.end], | ||
s + 1, node.end, node) | ||
node_2_start = s + 1 | ||
if core_distances_plot[s] * 5 < reachability_plot[s]: | ||
node_2_start = s | ||
node_2 = _TreeNode(ordering[node_2_start:node.end], | ||
node_2_start, node.end, node) | ||
local_max_1 = [] | ||
local_max_2 = [] | ||
|
||
|
@@ -700,6 +738,8 @@ def _cluster_tree(node, parent_node, local_maxima_points, | |
# only check a certain ratio of points in the child | ||
# nodes formed to the left and right of the maxima | ||
# ...should check_ratio be a user settable parameter? | ||
# NOTE: based on the description of maxima_ratio I'd guess this value | ||
# should be 1, why is it less than 1? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @espg, could you comment here, please? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @espg, we're still not sure why the code doesn't include all the points here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure, but I think it was to allow for outliers (i.e. if you have an inf or split point). The original code was hierarchical, so you could have nested clusters belonging to the same parent cluster. My guess (and it is a guess) is that the check ratio addresses that case for nested clusters. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When I wrote this code, I recall it was because I was dealing with many closely nested candidate clusters, where the higher-level candidates in the cluster tree were being rejected for the lower-level candidates that were more tightly formed. I added this bit so I could tune the algorithm to be more inclusive of slightly further away points in the leaf clusters. For the general purpose of having this in the library, this portion could probably be removed, as I don't think it's part of the original algorithm (@adrinjalali can confirm). This was the only place where I made any additional checks as compared to the paper. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I read the paper yesterday (here for reference), and this is not a part of the main algorithm. However, I wouldn't be opposed to including this as a parameter, with |
||
check_ratio = .8 | ||
check_value_1 = int(np.round(check_ratio * len(node_1.points))) | ||
check_value_2 = int(np.round(check_ratio * len(node_2.points))) | ||
|
@@ -708,22 +748,22 @@ def _cluster_tree(node, parent_node, local_maxima_points, | |
avg_reach2 = np.mean(reachability_plot[node_2.start:(node_2.start | ||
+ check_value_2)]) | ||
|
||
if ((avg_reach1 / reachability_plot[s]) > maxima_ratio or | ||
(avg_reach2 / reachability_plot[s]) > maxima_ratio): | ||
if ((avg_reach1 / maxima_ratio) > reachability_plot[s] or | ||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||
(avg_reach2 / maxima_ratio) > reachability_plot[s]): | ||
|
||
if (avg_reach1 / reachability_plot[s]) < rejection_ratio: | ||
if (avg_reach1 / rejection_ratio) < reachability_plot[s]: | ||
# reject node 2 | ||
node_list.remove((node_2, local_max_2)) | ||
if (avg_reach2 / reachability_plot[s]) < rejection_ratio: | ||
if (avg_reach2 / rejection_ratio) < reachability_plot[s]: | ||
# reject node 1 | ||
node_list.remove((node_1, local_max_1)) | ||
if ((avg_reach1 / reachability_plot[s]) >= rejection_ratio and | ||
(avg_reach2 / reachability_plot[s]) >= rejection_ratio): | ||
if ((avg_reach1 / rejection_ratio) >= reachability_plot[s] and | ||
(avg_reach2 / rejection_ratio) >= reachability_plot[s]): | ||
# since split_point is not significant, | ||
# ignore this split and continue (reject both child nodes) | ||
node.split_point = -1 | ||
_cluster_tree(node, parent_node, local_maxima_points, | ||
reachability_plot, reachability_ordering, | ||
reachability_plot, core_distances_plot, ordering, | ||
min_cluster_size, maxima_ratio, rejection_ratio, | ||
similarity_threshold, significant_min) | ||
return | ||
|
@@ -756,13 +796,13 @@ def _cluster_tree(node, parent_node, local_maxima_points, | |
if bypass_node == 1: | ||
parent_node.children.append(nl[0]) | ||
_cluster_tree(nl[0], parent_node, nl[1], | ||
reachability_plot, reachability_ordering, | ||
reachability_plot, core_distances_plot, ordering, | ||
min_cluster_size, maxima_ratio, rejection_ratio, | ||
similarity_threshold, significant_min) | ||
else: | ||
node.children.append(nl[0]) | ||
_cluster_tree(nl[0], node, nl[1], reachability_plot, | ||
reachability_ordering, min_cluster_size, | ||
core_distances_plot, ordering, min_cluster_size, | ||
maxima_ratio, rejection_ratio, | ||
similarity_threshold, significant_min) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -87,17 +87,27 @@ def test_empty_extract(): | |
|
||
def test_bad_extract(): | ||
# Test an extraction of eps too close to original eps | ||
msg = "Specify an epsilon smaller than 0.015. Got 0.3." | ||
msg = "Specify an epsilon smaller than 0.15. Got 0.3." | ||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||
centers = [[1, 1], [-1, -1], [1, -1]] | ||
X, labels_true = make_blobs(n_samples=750, centers=centers, | ||
cluster_std=0.4, random_state=0) | ||
|
||
# Compute OPTICS | ||
clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10) | ||
clust = OPTICS(max_eps=5.0 * 0.03, min_samples=10) | ||
clust2 = clust.fit(X) | ||
assert_raise_message(ValueError, msg, clust2.extract_dbscan, 0.3) | ||
|
||
|
||
def test_bad_reachability(): | ||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||
msg = "All reachability values are inf. Set a larger max_eps." | ||
centers = [[1, 1], [-1, -1], [1, -1]] | ||
X, labels_true = make_blobs(n_samples=750, centers=centers, | ||
cluster_std=0.4, random_state=0) | ||
|
||
clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10) | ||
assert_raise_message(ValueError, msg, clust.fit, X) | ||
|
||
|
||
def test_close_extract(): | ||
# Test extract where extraction eps is close to scaled epsPrime | ||
|
||
|
@@ -173,18 +183,76 @@ def test_min_cluster_size_invalid(min_cluster_size): | |
clust.fit(X) | ||
|
||
|
||
def test_auto_extract_no_outlier(): | ||
np.random.seed(0) | ||
n_points_per_cluster = 100 | ||
C1 = [-50, -20] + .08 * np.random.randn(n_points_per_cluster, 2) | ||
C2 = [40, -10] + .01 * np.random.randn(n_points_per_cluster, 2) | ||
C3 = [10, -20] + .02 * np.random.randn(n_points_per_cluster, 2) | ||
C4 = [-20, 30] + .03 * np.random.randn(n_points_per_cluster, 2) | ||
C5 = [30, -20] + .06 * np.random.randn(n_points_per_cluster, 2) | ||
C6 = [50, 60] + .02 * np.random.randn(n_points_per_cluster, 2) | ||
X = np.vstack((C1, C2, C3, C4, C5, C6)) | ||
|
||
clust = OPTICS(min_samples=4) | ||
|
||
clust.fit(X) | ||
|
||
# there should be no outliers detected | ||
assert_equal(set(clust.labels_), set([0, 1, 2, 3, 4, 5])) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tests if exactly 6 clusters are detected and no NOISE. |
||
|
||
|
||
def test_multi_inf_in_reachability(): | ||
np.random.seed(0) | ||
n_points_per_cluster = 3 | ||
|
||
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) | ||
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) | ||
X = np.vstack((C1, np.array([[100, 200], [200, 300]]), C2)) | ||
|
||
clust = OPTICS(min_samples=2, max_eps=2).fit(X) | ||
|
||
assert_array_equal(clust.labels_, np.r_[[0] * 3, -1, -1, [1] * 3]) | ||
assert_array_equal(clust.reachability_[[0, 3, 4, 5]], [np.inf] * 4) | ||
|
||
|
||
def test_auto_extract_outlier(): | ||
np.random.seed(0) | ||
n_points_per_cluster = 4 | ||
|
||
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) | ||
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) | ||
X = np.vstack((C1, C2, np.array([[100, 200]]))) | ||
|
||
clust = OPTICS(min_samples=3).fit(X) | ||
|
||
assert_array_equal(clust.labels_, np.r_[[0] * 4, [1] * 4, -1]) | ||
|
||
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) | ||
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) | ||
X = np.vstack((C1, np.array([[100, 200], [200, 300]]), C2)) | ||
|
||
clust = OPTICS(min_samples=3).fit(X) | ||
|
||
assert_array_equal(clust.labels_, np.r_[[0] * 4, -1, -1, [1] * 4]) | ||
|
||
|
||
def test_min_cluster_size_invalid2(): | ||
clust = OPTICS(min_cluster_size=len(X) + 1) | ||
with pytest.raises(ValueError, match="must be no greater than the "): | ||
clust.fit(X) | ||
|
||
|
||
@pytest.mark.parametrize("reach, n_child, members", [ | ||
@pytest.mark.parametrize("reach, core_dist, n_child, members", [ | ||
(np.array([np.inf, 0.9, 0.9, 1.0, 0.89, 0.88, 10, .9, .9, .9, 10, 0.9, | ||
0.9, 0.89, 0.88, 10, .9, .9, .9, .9]), 2, np.r_[0:6]), | ||
0.9, 0.89, 0.88, 10, .9, .9, .9, .9]), | ||
np.array([np.inf, 0.9, 0.9, 1.0, 0.89, 0.88, 1, .9, .9, .9, 1, 0.9, | ||
0.9, 0.89, 0.88, 1, .9, .9, .9, .9]), 2, np.r_[0:6]), | ||
(np.array([np.inf, 0.9, 0.9, 0.9, 0.89, 0.88, 10, .9, .9, .9, 10, 0.9, | ||
0.9, 0.89, 0.88, 100, .9, .9, .9, .9]), 1, np.r_[0:15])]) | ||
def test_cluster_sigmin_pruning(reach, n_child, members): | ||
0.9, 0.89, 0.88, 100, .9, .9, .9, .9]), | ||
np.array([np.inf, 0.9, 0.9, 0.9, 0.89, 0.88, 1, .9, .9, .9, 1, 0.9, | ||
0.9, 0.89, 0.88, 1, .9, .9, .9, .9]), 1, np.r_[0:15])]) | ||
def test_cluster_sigmin_pruning(reach, core_dist, n_child, members): | ||
# Tests pruning left and right, insignificant splitpoints, empty nodelists | ||
# Parameters chosen specifically for this task | ||
|
||
|
@@ -198,7 +266,7 @@ def test_cluster_sigmin_pruning(reach, n_child, members): | |
root = _TreeNode(ordering, 0, 20, None) | ||
|
||
# Build cluster tree inplace on root node | ||
_cluster_tree(root, None, cluster_boundaries, reach, ordering, | ||
_cluster_tree(root, None, cluster_boundaries, reach, core_dist, ordering, | ||
5, .75, .7, .4, .3) | ||
assert_equal(root.split_point, cluster_boundaries[0]) | ||
assert_equal(n_child, len(root.children)) | ||
|
Uh oh!
There was an error while loading. Please reload this page.