8000 [WIP] fixes OPTICS split_points detected as NOISE and last point not detected as outlier. by adrinjalali · Pull Request #11857 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[WIP] fixes OPTICS split_points detected as NOISE and last point not detected as outlier. #11857

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 29 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7dbea25
debugging optics
Jul 26, 2018
a0a2d4e
heavily debugging
Aug 19, 2018
18d5f9e
a threshold based solution
Aug 19, 2018
6dbea77
attempting to fix last data
Aug 19, 2018
53a933e
seems closer to a solution
Aug 19, 2018
9728b96
clean up
Aug 19, 2018
c6dd608
fix indent
Aug 20, 2018
e314b65
fix optics tests
Aug 20, 2018
3b9cbfe
fix PEP8 and add comment
Aug 20, 2018
879178c
add tests and fix last point bug
Aug 20, 2018
d52669b
add example and comments
Aug 21, 2018
1505879
fix inf in np.max line
Aug 21, 2018
67ade15
fix more inf issues in reachability
Aug 21, 2018
8000
8ff84fd
add default values to docstring
Aug 21, 2018
078afb3
comment out irrelevant _cluster_tree call
Aug 21, 2018
a2cf913
test multiple inf in reachability, and the fix
Aug 21, 2018
15d9d7f
reachability_ordering -> ordering
Aug 21, 2018
29a1f0f
more docs
Aug 23, 2018
1c683a0
spaces around *
adrinjalali Aug 31, 2018
d1ce12d
cleanup
adrinjalali Sep 2, 2018
8bb29cf
rename max_bound to max_eps
adrinjalali Sep 3, 2018
748f8c2
rename max_bound to max_eps in tests and docs
adrinjalali Sep 3, 2018
c40306f
make class and function's docstring consistent
adrinjalali Sep 3, 2018
846979f
fix PEP issue in comment
adrinjalali Sep 3, 2018
e654ea6
Merge branch 'master' into bug/optics
adrinjalali Sep 3, 2018
7ab9dcd
Merge remote-tracking branch 'upstream/master' into bug/optics
adrinjalali Sep 4, 2018
52edc47
remove changes addressed in other PRs
adrinjalali Sep 6, 2018
58d278c
merge master
adrinjalali Sep 8, 2018
9d9a4e4
Merge branch 'master' into bug/optics
adrinjalali Sep 9, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 60 additions & 20 deletions sklearn/cluster/optics_.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,19 @@ class OPTICS(BaseEstimator, ClusterMixin):
Distance at which each sample becomes a core point.
Points which will never be core have a distance of inf.

Examples
--------
>>> from sklearn.cluster import OPTICS
>>> import numpy as np
>>> np.random.seed(0)
>>> n_points_per_cluster = 3
>>> C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
>>> C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
>>> X = np.vstack((C1, np.array([[100, 200]]), C2))
>>> clust = OPTICS(min_samples=2).fit(X)
>>> clust.labels_
array([ 0, 0, 0, -1, 1, 1, 1])

See also
--------

Expand Down Expand Up @@ -374,6 +387,7 @@ def fit(self, X, y=None):

indices_, self.labels_ = _extract_optics(self.ordering_,
self.reachability_,
self.core_distances_,
self.maxima_ratio,
self.rejection_ratio,
self.similarity_threshold,
Expand Down Expand Up @@ -505,7 +519,7 @@ def _extract_dbscan(ordering, core_distances, reachability, eps):
return np.arange(n_samples)[is_core], labels


def _extract_optics(ordering, reachability, maxima_ratio=.75,
def _extract_optics(ordering, reachability, core_distances, maxima_ratio=.75,
rejection_ratio=.7, similarity_threshold=0.4,
significant_min=.003, min_cluster_size=.005,
min_maxima_ratio=0.001):
Expand All @@ -519,6 +533,9 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75,
reachability : array, shape (n_samples,)
Reachability distances calculated by OPTICS (`reachability_`)

core_distances : array, shape (n_samples,)
Core distances calculated by OPTICS (`core_distances_`)

maxima_ratio : float, optional
The maximum ratio we allow of average height of clusters on the
right and left to the local maxima in question. The higher the
Expand Down Expand Up @@ -563,9 +580,18 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75,
"""

# Extraction wrapper
reachability = reachability / np.max(reachability[1:])
# according to the paper (p. 5), for a small enough generative distance
# epsilong, there should be more than one INF.
if np.all(np.isinf(reachability)):
raise ValueError("All reachability values are inf. Set a larger"
" max_eps.")
normalization_factor = np.max(reachability[reachability < np.inf])
reachability = reachability / normalization_factor
reachability_plot = reachability[ordering].tolist()
root_node = _automatic_cluster(reachability_plot, ordering,
core_distances = core_distances / normalization_factor
core_distances_plot = core_distances[ordering].tolist()
root_node = _automatic_cluster(reachability_plot, core_distances_plot,
ordering,
maxima_ratio, rejection_ratio,
similarity_threshold, significant_min,
min_cluster_size, min_maxima_ratio)
Expand All @@ -581,10 +607,18 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75,
labels[index] = clustid
is_core[index] = 1
clustid += 1

# check if the last point in the ordering is a NOISE
last_point = ordering[-1]
if (core_distances_plot[-1] * 5 >= reachability_plot[-1]
and reachability_plot[-1] > reachability_plot[-2] * 5):
labels[last_point] = -1
is_core[last_point] = 0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is regarding having an outlier in the last position of ordering_. This checks for that the same way it checks if a split point is an outlier, since it would be a split point if it wasn't the last point in the ordering_.


return np.arange(n_samples)[is_core], labels


def _automatic_cluster(reachability_plot, ordering,
def _automatic_cluster(reachability_plot, core_distances_plot, ordering,
maxima_ratio, rejection_ratio,
similarity_threshold, significant_min,
min_cluster_size, min_maxima_ratio):
Expand All @@ -611,7 +645,8 @@ def _automatic_cluster(reachability_plot, ordering,
neighborhood_size)
root_node = _TreeNode(ordering, 0, len(ordering), None)
_cluster_tree(root_node, None, local_maxima_points,
reachability_plot, ordering, min_cluster_size,
reachability_plot, core_distances_plot,
ordering, min_cluster_size,
maxima_ratio, rejection_ratio,
similarity_threshold, significant_min)

Expand Down Expand Up @@ -644,7 +679,8 @@ def _find_local_maxima(reachability_plot, neighborhood_size):
# if the point is a local maxima on the reachability plot with
# regard to neighborhood_size, insert it into priority queue and
# maxima list
if (reachability_plot[i] > reachability_plot[i - 1] and
# all inf values are local maxima.
if (reachability_plot[i] >= reachability_plot[i - 1] and
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems fine to me

reachability_plot[i] >= reachability_plot[i + 1] and
_is_local_maxima(i, np.array(reachability_plot),
neighborhood_size) == 1):
Expand All @@ -655,7 +691,7 @@ def _find_local_maxima(reachability_plot, neighborhood_size):


def _cluster_tree(node, parent_node, local_maxima_points,
reachability_plot, reachability_ordering,
reachability_plot, core_distances_plot, ordering,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed reachability_ordering to ordering since the same ordering applies (and is calculated) for both arrays (reachability and core_distances).

min_cluster_size, maxima_ratio, rejection_ratio,
similarity_threshold, significant_min):
"""Recursively builds cluster tree to hold hierarchical cluster structure
Expand All @@ -665,7 +701,6 @@ def _cluster_tree(node, parent_node, local_maxima_points,
local_maxima_points is list of local maxima points sorted in
descending order of reachability
"""

if len(local_maxima_points) == 0:
return # parent_node is a leaf

Expand All @@ -675,10 +710,13 @@ def _cluster_tree(node, parent_node, local_maxima_points,
local_maxima_points = local_maxima_points[1:]

# create two new nodes and add to list of nodes
node_1 = _TreeNode(reachability_ordering[node.start:s],
node_1 = _TreeNode(ordering[node.start:s],
node.start, s, node)
node_2 = _TreeNode(reachability_ordering[s + 1:node.end],
s + 1, node.end, node)
node_2_start = s + 1
if core_distances_plot[s] * 5 < reachability_plot[s]:
node_2_start = s
node_2 = _TreeNode(ordering[node_2_start:node.end],
node_2_start, node.end, node)
local_max_1 = []
local_max_2 = []

Expand All @@ -700,6 +738,8 @@ def _cluster_tree(node, parent_node, local_maxima_points,
# only check a certain ratio of points in the child
# nodes formed to the left and right of the maxima
# ...should check_ratio be a user settable parameter?
# NOTE: based on the description of maxima_ratio I'd guess this value
# should be 1, why is it less than 1?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@espg, could you comment here, please?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@espg, we're still not sure why the code doesn't include all the points here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure, but I think it was to allow for outliers (i.e. if you have an inf or split point). The original code was hierarchical, so you could have nested clusters belonging to the same parent cluster. My guess (and it is a guess) is that the check ratio addresses that case for nested clusters.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I wrote this code, I recall it was because I was dealing with many closely nested candidate clusters, where the higher-level candidates in the cluster tree were being rejected for the lower-level candidates that were more tightly formed. I added this bit so I could tune the algorithm to be more inclusive of slightly further away points in the leaf clusters.

For the general purpose of having this in the library, this portion could probably be removed, as I don't think it's part of the original algorithm (@adrinjalali can confirm). This was the only place where I made any additional checks as compared to the paper.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I read the paper yesterday (here for reference), and this is not a part of the main algorithm. However, I wouldn't be opposed to including this as a parameter, with 1 as the default value as is in the paper. We may need to alter the algorithm a bit for the split points issue anyway, and I know the other implementations (at least in R, although not of this algorithm, but the original OPTICS), don't strictly follow the publication and they may include some additions. But that's a different issue and I'm still working on how to articulate my thoughts on it the best I can.

check_ratio = .8
check_value_1 = int(np.round(check_ratio * len(node_1.points)))
check_value_2 = int(np.round(check_ratio * len(node_2.points)))
Expand All @@ -708,22 +748,22 @@ def _cluster_tree(node, parent_node, local_maxima_points,
avg_reach2 = np.mean(reachability_plot[node_2.start:(node_2.start
+ check_value_2)])

if ((avg_reach1 / reachability_plot[s]) > maxima_ratio or
(avg_reach2 / reachability_plot[s]) > maxima_ratio):
if ((avg_reach1 / maxima_ratio) > reachability_plot[s] or
(avg_reach2 / maxima_ratio) > reachability_plot[s]):

if (avg_reach1 / reachability_plot[s]) < rejection_ratio:
if (avg_reach1 / rejection_ratio) < reachability_plot[s]:
# reject node 2
node_list.remove((node_2, local_max_2))
if (avg_reach2 / reachability_plot[s]) < rejection_ratio:
if (avg_reach2 / rejection_ratio) < reachability_plot[s]:
# reject node 1
node_list.remove((node_1, local_max_1))
if ((avg_reach1 / reachability_plot[s]) >= rejection_ratio and
(avg_reach2 / reachability_plot[s]) >= rejection_ratio):
if ((avg_reach1 / rejection_ratio) >= reachability_plot[s] and
(avg_reach2 / rejection_ratio) >= reachability_plot[s]):
# since split_point is not significant,
# ignore this split and continue (reject both child nodes)
node.split_point = -1
_cluster_tree(node, parent_node, local_maxima_points,
reachability_plot, reachability_ordering,
reachability_plot, core_distances_plot, ordering,
min_cluster_size, maxima_ratio, rejection_ratio,
similarity_threshold, significant_min)
return
Expand Down Expand Up @@ -756,13 +796,13 @@ def _cluster_tree(node, parent_node, local_maxima_points,
if bypass_node == 1:
parent_node.children.append(nl[0])
_cluster_tree(nl[0], parent_node, nl[1],
reachability_plot, reachability_ordering,
reachability_plot, core_distances_plot, ordering,
min_cluster_size, maxima_ratio, rejection_ratio,
similarity_threshold, significant_min)
else:
node.children.append(nl[0])
_cluster_tree(nl[0], node, nl[1], reachability_plot,
reachability_ordering, min_cluster_size,
core_distances_plot, ordering, min_cluster_size,
maxima_ratio, rejection_ratio,
similarity_threshold, significant_min)

Expand Down
82 changes: 75 additions & 7 deletions sklearn/cluster/tests/test_optics.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,27 @@ def test_empty_extract():

def test_bad_extract():
# Test an extraction of eps too close to original eps
msg = "Specify an epsilon smaller than 0.015. Got 0.3."
msg = "Specify an epsilon smaller than 0.15. Got 0.3."
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)

# Compute OPTICS
clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10)
clust = OPTICS(max_eps=5.0 * 0.03, min_samples=10)
clust2 = clust.fit(X)
assert_raise_message(ValueError, msg, clust2.extract_dbscan, 0.3)


def test_bad_reachability():
msg = "All reachability values are inf. Set a larger max_eps."
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)

clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10)
assert_raise_message(ValueError, msg, clust.fit, X)


def test_close_extract():
# Test extract where extraction eps is close to scaled epsPrime

Expand Down Expand Up @@ -173,18 +183,76 @@ def test_min_cluster_size_invalid(min_cluster_size):
clust.fit(X)


def test_auto_extract_no_outlier():
np.random.seed(0)
n_points_per_cluster = 100
C1 = [-50, -20] + .08 * np.random.randn(n_points_per_cluster, 2)
C2 = [40, -10] + .01 * np.random.randn(n_points_per_cluster, 2)
C3 = [10, -20] + .02 * np.random.randn(n_points_per_cluster, 2)
C4 = [-20, 30] + .03 * np.random.randn(n_points_per_cluster, 2)
C5 = [30, -20] + .06 * np.random.randn(n_points_per_cluster, 2)
C6 = [50, 60] + .02 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=4)

clust.fit(X)

# there should be no outliers detected
assert_equal(set(clust.labels_), set([0, 1, 2, 3, 4, 5]))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tests if exactly 6 clusters are detected and no NOISE.



def test_multi_inf_in_reachability():
np.random.seed(0)
n_points_per_cluster = 3

C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, np.array([[100, 200], [200, 300]]), C2))

clust = OPTICS(min_samples=2, max_eps=2).fit(X)

assert_array_equal(clust.labels_, np.r_[[0] * 3, -1, -1, [1] * 3])
assert_array_equal(clust.reachability_[[0, 3, 4, 5]], [np.inf] * 4)


def test_auto_extract_outlier():
np.random.seed(0)
n_points_per_cluster = 4

C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, np.array([[100, 200]])))

clust = OPTICS(min_samples=3).fit(X)

assert_array_equal(clust.labels_, np.r_[[0] * 4, [1] * 4, -1])

C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, np.array([[100, 200], [200, 300]]), C2))

clust = OPTICS(min_samples=3).fit(X)

assert_array_equal(clust.labels_, np.r_[[0] * 4, -1, -1, [1] * 4])


def test_min_cluster_size_invalid2():
clust = OPTICS(min_cluster_size=len(X) + 1)
with pytest.raises(ValueError, match="must be no greater than the "):
clust.fit(X)


@pytest.mark.parametrize("reach, n_child, members", [
@pytest.mark.parametrize("reach, core_dist, n_child, members", [
(np.array([np.inf, 0.9, 0.9, 1.0, 0.89, 0.88, 10, .9, .9, .9, 10, 0.9,
0.9, 0.89, 0.88, 10, .9, .9, .9, .9]), 2, np.r_[0:6]),
0.9, 0.89, 0.88, 10, .9, .9, .9, .9]),
np.array([np.inf, 0.9, 0.9, 1.0, 0.89, 0.88, 1, .9, .9, .9, 1, 0.9,
0.9, 0.89, 0.88, 1, .9, .9, .9, .9]), 2, np.r_[0:6]),
(np.array([np.inf, 0.9, 0.9, 0.9, 0.89, 0.88, 10, .9, .9, .9, 10, 0.9,
0.9, 0.89, 0.88, 100, .9, .9, .9, .9]), 1, np.r_[0:15])])
def test_cluster_sigmin_pruning(reach, n_child, members):
0.9, 0.89, 0.88, 100, .9, .9, .9, .9]),
np.array([np.inf, 0.9, 0.9, 0.9, 0.89, 0.88, 1, .9, .9, .9, 1, 0.9,
0.9, 0.89, 0.88, 1, .9, .9, .9, .9]), 1, np.r_[0:15])])
def test_cluster_sigmin_pruning(reach, core_dist, n_child, members):
# Tests pruning left and right, insignificant splitpoints, empty nodelists
# Parameters chosen specifically for this task

Expand All @@ -198,7 +266,7 @@ def test_cluster_sigmin_pruning(reach, n_child, members):
root = _TreeNode(ordering, 0, 20, None)

# Build cluster tree inplace on root node
_cluster_tree(root, None, cluster_boundaries, reach, ordering,
_cluster_tree(root, None, cluster_boundaries, reach, core_dist, ordering,
5, .75, .7, .4, .3)
assert_equal(root.split_point, cluster_boundaries[0])
assert_equal(n_child, len(root.children))
Expand Down
0