From 46cd69d9bbe159edfe83a41614cfae184e751842 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Sep 2018 10:08:19 +0200 Subject: [PATCH 1/5] add Xi steep method for split points --- sklearn/cluster/optics_.py | 45 +++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 899da518ae796..ba5e4dcbea32f 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -25,7 +25,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, - min_maxima_ratio=0.001, algorithm='ball_tree', + min_maxima_ratio=0.001, xi=0.8, algorithm='ball_tree', leaf_size=30, n_jobs=None): """Perform OPTICS clustering from vector array @@ -153,7 +153,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', clust = OPTICS(min_samples, max_eps, metric, p, metric_params, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, - min_cluster_size, min_maxima_ratio, + min_cluster_size, min_maxima_ratio, xi, algorithm, leaf_size, n_jobs) clust.fit(X) return clust.core_sample_indices_, clust.labels_ @@ -294,7 +294,7 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, - min_maxima_ratio=0.001, algorithm='ball_tree', + min_maxima_ratio=0.001, xi=.8, algorithm='ball_tree', leaf_size=30, n_jobs=None): self.max_eps = max_eps @@ -305,6 +305,7 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', self.significant_min = significant_min self.min_cluster_size = min_cluster_size self.min_maxima_ratio = min_maxima_ratio + self.xi = xi self.algorithm = algorithm self.metric = metric self.metric_params = metric_params @@ -379,7 +380,8 @@ def fit(self, 
X, y=None): self.similarity_threshold, self.significant_min, self.min_cluster_size, - self.min_maxima_ratio) + self.min_maxima_ratio, + self.xi) self.core_sample_indices_ = indices_ return self @@ -508,7 +510,7 @@ def _extract_dbscan(ordering, core_distances, reachability, eps): def _extract_optics(ordering, reachability, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, - min_maxima_ratio=0.001): + min_maxima_ratio=0.001, xi=0.8): """Performs automatic cluster extraction for variable density data. Parameters @@ -568,7 +570,7 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, root_node = _automatic_cluster(reachability_plot, ordering, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, - min_cluster_size, min_maxima_ratio) + min_cluster_size, min_maxima_ratio, xi) leaves = _get_leaves(root_node, []) # Start cluster id's at 0 clustid = 0 @@ -581,13 +583,20 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, labels[index] = clustid is_core[index] = 1 clustid += 1 + + # check if the last point is xi-steep upward + last_point = ordering[-1] + if reachability_plot[-2] <= reachability_plot[-1] * (1 - xi): + labels[last_point] = -1 + is_core[last_point] = 0 + return np.arange(n_samples)[is_core], labels def _automatic_cluster(reachability_plot, ordering, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, - min_cluster_size, min_maxima_ratio): + min_cluster_size, min_maxima_ratio, xi): """Converts reachability plot to cluster tree and returns root node. 
Parameters @@ -613,7 +622,7 @@ def _automatic_cluster(reachability_plot, ordering, _cluster_tree(root_node, None, local_maxima_points, reachability_plot, ordering, min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min) + similarity_threshold, significant_min, xi) return root_node @@ -657,7 +666,7 @@ def _find_local_maxima(reachability_plot, neighborhood_size): def _cluster_tree(node, parent_node, local_maxima_points, reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min): + similarity_threshold, significant_min, xi): """Recursively builds cluster tree to hold hierarchical cluster structure node is a node or the root of the tree in the first call @@ -677,15 +686,21 @@ def _cluster_tree(node, parent_node, local_maxima_points, # create two new nodes and add to list of nodes node_1 = _TreeNode(reachability_ordering[node.start:s], node.start, s, node) - node_2 = _TreeNode(reachability_ordering[s + 1:node.end], - s + 1, node.end, node) + + # check if s is xi-steep downward + if reachability_plot[s] * (1 - xi) >= reachability_plot[s + 1]: + node_2_start = s + else: + node_2_start = s + 1 + node_2 = _TreeNode(reachability_ordering[node_2_start:node.end], + node_2_start, node.end, node) local_max_1 = [] local_max_2 = [] for i in local_maxima_points: if i < s: local_max_1.append(i) - if i > s: + if i >= node_2_start: local_max_2.append(i) node_list = [] @@ -725,7 +740,7 @@ def _cluster_tree(node, parent_node, local_maxima_points, _cluster_tree(node, parent_node, local_maxima_points, reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min) + similarity_threshold, significant_min, xi) return # remove clusters that are too small @@ -758,13 +773,13 @@ def _cluster_tree(node, parent_node, local_maxima_points, _cluster_tree(nl[0], parent_node, nl[1], reachability_plot, reachability_ordering, 
min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min) + similarity_threshold, significant_min, xi) else: node.children.append(nl[0]) _cluster_tree(nl[0], node, nl[1], reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min) + similarity_threshold, significant_min, xi) def _get_leaves(node, arr): From 960ddcf01ffa29e851e1f24261723b393b0fdc52 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Sep 2018 10:14:33 +0200 Subject: [PATCH 2/5] add the test --- sklearn/cluster/tests/test_optics.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index bddf57ec7b5d1..4a35977f4645e 100755 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -179,6 +179,26 @@ def test_min_cluster_size_invalid2(): clust.fit(X) +def test_auto_extract_outlier(): + np.random.seed(0) + + n_points_per_cluster = 4 + + C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) + C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) + X = np.vstack((C1, C2, np.array([[100, 200]]))) + clust = OPTICS(min_samples=3).fit(X) + + assert_array_equal(clust.labels_, np.r_[[0] * 4, [1] * 4, -1]) + + C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) + C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) + X = np.vstack((C1, np.array([[100, 200], [200, 300]]), C2)) + clust = OPTICS(min_samples=3).fit(X) + + assert_array_equal(clust.labels_, np.r_[[0] * 4, -1, -1, [1] * 4]) + + @pytest.mark.parametrize("reach, n_child, members", [ (np.array([np.inf, 0.9, 0.9, 1.0, 0.89, 0.88, 10, .9, .9, .9, 10, 0.9, 0.9, 0.89, 0.88, 10, .9, .9, .9, .9]), 2, np.r_[0:6]), @@ -199,7 +219,7 @@ def test_cluster_sigmin_pruning(reach, n_child, members): # Build cluster tree inplace on root node _cluster_tree(root, None, cluster_boundaries, 
reach, ordering, - 5, .75, .7, .4, .3) + 5, .75, .7, .4, .3, 1) assert_equal(root.split_point, cluster_boundaries[0]) assert_equal(n_child, len(root.children)) assert_array_equal(members, root.children[0].points) From e8d3c935414a57e7b70dbc65a1df8a111bbda20a Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Sep 2018 10:43:37 +0200 Subject: [PATCH 3/5] fix last point issue --- sklearn/cluster/optics_.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index ba5e4dcbea32f..91b9ba1502278 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -25,7 +25,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, - min_maxima_ratio=0.001, xi=0.8, algorithm='ball_tree', + min_maxima_ratio=0.001, xi=0.9, algorithm='ball_tree', leaf_size=30, n_jobs=None): """Perform OPTICS clustering from vector array @@ -294,7 +294,7 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, - min_maxima_ratio=0.001, xi=.8, algorithm='ball_tree', + min_maxima_ratio=0.001, xi=.9, algorithm='ball_tree', leaf_size=30, n_jobs=None): self.max_eps = max_eps @@ -510,7 +510,7 @@ def _extract_dbscan(ordering, core_distances, reachability, eps): def _extract_optics(ordering, reachability, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, - min_maxima_ratio=0.001, xi=0.8): + min_maxima_ratio=0.001, xi=0.9): """Performs automatic cluster extraction for variable density data. 
Parameters @@ -584,12 +584,6 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, is_core[index] = 1 clustid += 1 - # check if the last point is xi-steep upward - last_point = ordering[-1] - if reachability_plot[-2] <= reachability_plot[-1] * (1 - xi): - labels[last_point] = -1 - is_core[last_point] = 0 - return np.arange(n_samples)[is_core], labels @@ -692,8 +686,15 @@ def _cluster_tree(node, parent_node, local_maxima_points, node_2_start = s else: node_2_start = s + 1 - node_2 = _TreeNode(reachability_ordering[node_2_start:node.end], - node_2_start, node.end, node) + + # check if the last point is xi-steep upward + node_2_end = node.end + if (reachability_plot[node.end - 1] * (1 - xi) + >= reachability_plot[node.end - 2]): + node_2_end = node.end - 1 + + node_2 = _TreeNode(reachability_ordering[node_2_start:node_2_end], + node_2_start, node_2_end, node) local_max_1 = [] local_max_2 = [] From 62c4ef88fd730403ff150005b1d1bd624e404a64 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Sep 2018 10:52:18 +0200 Subject: [PATCH 4/5] add docstring for xi --- sklearn/cluster/optics_.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 91b9ba1502278..f37ac41ef593f 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -103,6 +103,11 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', Each local maxima should be a largest value in a neighborhood of the `size min_maxima_ratio * len(X)` from left and right. + xi : float between 0 and 1, optional (default=.9) + Defines the steepness used to include/exclude a split point and + the last point of a split in that cluster. Setting `xi` to 1 + would always exclude those split points. 
+ algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: @@ -233,6 +238,11 @@ class OPTICS(BaseEstimator, ClusterMixin): Each local maxima should be a largest value in a neighborhood of the `size min_maxima_ratio * len(X)` from left and right. + xi : float between 0 and 1, optional (default=.9) + Defines the steepness used to include/exclude a split point and + the last point of a split in that cluster. Setting `xi` to 1 + would always exclude those split points. + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: From 1957020039d67da1282a4f4a92009cdb37762147 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Sep 2018 11:47:47 +0200 Subject: [PATCH 5/5] pep8 --- sklearn/cluster/optics_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index f37ac41ef593f..5c5157d9ccf6f 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -106,7 +106,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', xi : float between 0 and 1, optional (default=.9) Defines the steepness used to include/exclude a split point and the last point of a split in that cluster. Setting `xi` to 1 - would always exclude those split points. + would always exclude those split points. algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: @@ -700,7 +700,7 @@ def _cluster_tree(node, parent_node, local_maxima_points, # check if the last point is xi-steep upward node_2_end = node.end if (reachability_plot[node.end - 1] * (1 - xi) - >= reachability_plot[node.end - 2]): + >= reachability_plot[node.end - 2]): node_2_end = node.end - 1 node_2 = _TreeNode(reachability_ordering[node_2_start:node_2_end],