diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 899da518ae796..5c5157d9ccf6f 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -25,7 +25,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, - min_maxima_ratio=0.001, algorithm='ball_tree', + min_maxima_ratio=0.001, xi=0.9, algorithm='ball_tree', leaf_size=30, n_jobs=None): """Perform OPTICS clustering from vector array @@ -103,6 +103,11 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', Each local maxima should be a largest value in a neighborhood of the `size min_maxima_ratio * len(X)` from left and right. + xi : float between 0 and 1, optional (default=.9) + Defines the steepness used to include/exclude a split point and + the last point of a split in that cluster. Setting `xi` to 1 + would always exclude those split points. + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: @@ -153,7 +158,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', clust = OPTICS(min_samples, max_eps, metric, p, metric_params, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, - min_cluster_size, min_maxima_ratio, + min_cluster_size, min_maxima_ratio, xi, algorithm, leaf_size, n_jobs) clust.fit(X) return clust.core_sample_indices_, clust.labels_ @@ -233,6 +238,11 @@ class OPTICS(BaseEstimator, ClusterMixin): Each local maxima should be a largest value in a neighborhood of the `size min_maxima_ratio * len(X)` from left and right. + xi : float between 0 and 1, optional (default=.9) + Defines the steepness used to include/exclude a split point and + the last point of a split in that cluster. Setting `xi` to 1 + would always exclude those split points. 
+ algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: @@ -294,7 +304,7 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, - min_maxima_ratio=0.001, algorithm='ball_tree', + min_maxima_ratio=0.001, xi=.9, algorithm='ball_tree', leaf_size=30, n_jobs=None): self.max_eps = max_eps @@ -305,6 +315,7 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', self.significant_min = significant_min self.min_cluster_size = min_cluster_size self.min_maxima_ratio = min_maxima_ratio + self.xi = xi self.algorithm = algorithm self.metric = metric self.metric_params = metric_params @@ -379,7 +390,8 @@ def fit(self, X, y=None): self.similarity_threshold, self.significant_min, self.min_cluster_size, - self.min_maxima_ratio) + self.min_maxima_ratio, + self.xi) self.core_sample_indices_ = indices_ return self @@ -508,7 +520,7 @@ def _extract_dbscan(ordering, core_distances, reachability, eps): def _extract_optics(ordering, reachability, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size=.005, - min_maxima_ratio=0.001): + min_maxima_ratio=0.001, xi=0.9): """Performs automatic cluster extraction for variable density data. 
Parameters @@ -568,7 +580,7 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, root_node = _automatic_cluster(reachability_plot, ordering, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, - min_cluster_size, min_maxima_ratio) + min_cluster_size, min_maxima_ratio, xi) leaves = _get_leaves(root_node, []) # Start cluster id's at 0 clustid = 0 @@ -581,13 +593,14 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, labels[index] = clustid is_core[index] = 1 clustid += 1 + return np.arange(n_samples)[is_core], labels def _automatic_cluster(reachability_plot, ordering, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, - min_cluster_size, min_maxima_ratio): + min_cluster_size, min_maxima_ratio, xi): """Converts reachability plot to cluster tree and returns root node. Parameters @@ -613,7 +626,7 @@ def _automatic_cluster(reachability_plot, ordering, _cluster_tree(root_node, None, local_maxima_points, reachability_plot, ordering, min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min) + similarity_threshold, significant_min, xi) return root_node @@ -657,7 +670,7 @@ def _find_local_maxima(reachability_plot, neighborhood_size): def _cluster_tree(node, parent_node, local_maxima_points, reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min): + similarity_threshold, significant_min, xi): """Recursively builds cluster tree to hold hierarchical cluster structure node is a node or the root of the tree in the first call @@ -677,15 +690,28 @@ def _cluster_tree(node, parent_node, local_maxima_points, # create two new nodes and add to list of nodes node_1 = _TreeNode(reachability_ordering[node.start:s], node.start, s, node) - node_2 = _TreeNode(reachability_ordering[s + 1:node.end], - s + 1, node.end, node) + + # check if s is xi-steep downward + if reachability_plot[s] * (1 - xi) >= reachability_plot[s + 1]: + 
node_2_start = s + else: + node_2_start = s + 1 + + # check if the last point is xi-steep upward + node_2_end = node.end + if (reachability_plot[node.end - 1] * (1 - xi) + >= reachability_plot[node.end - 2]): + node_2_end = node.end - 1 + + node_2 = _TreeNode(reachability_ordering[node_2_start:node_2_end], + node_2_start, node_2_end, node) local_max_1 = [] local_max_2 = [] for i in local_maxima_points: if i < s: local_max_1.append(i) - if i > s: + if i >= node_2_start: local_max_2.append(i) node_list = [] @@ -725,7 +751,7 @@ def _cluster_tree(node, parent_node, local_maxima_points, _cluster_tree(node, parent_node, local_maxima_points, reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min) + similarity_threshold, significant_min, xi) return # remove clusters that are too small @@ -758,13 +784,13 @@ def _cluster_tree(node, parent_node, local_maxima_points, _cluster_tree(nl[0], parent_node, nl[1], reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min) + similarity_threshold, significant_min, xi) else: node.children.append(nl[0]) _cluster_tree(nl[0], node, nl[1], reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min) + similarity_threshold, significant_min, xi) def _get_leaves(node, arr): diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index bddf57ec7b5d1..4a35977f4645e 100755 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -179,6 +179,26 @@ def test_min_cluster_size_invalid2(): clust.fit(X) +def test_auto_extract_outlier(): + np.random.seed(0) + + n_points_per_cluster = 4 + + C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) + C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) + X = np.vstack((C1, C2, np.array([[100, 200]]))) + clust = 
OPTICS(min_samples=3).fit(X) + + assert_array_equal(clust.labels_, np.r_[[0] * 4, [1] * 4, -1]) + + C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) + C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) + X = np.vstack((C1, np.array([[100, 200], [200, 300]]), C2)) + clust = OPTICS(min_samples=3).fit(X) + + assert_array_equal(clust.labels_, np.r_[[0] * 4, -1, -1, [1] * 4]) + + @pytest.mark.parametrize("reach, n_child, members", [ (np.array([np.inf, 0.9, 0.9, 1.0, 0.89, 0.88, 10, .9, .9, .9, 10, 0.9, 0.9, 0.89, 0.88, 10, .9, .9, .9, .9]), 2, np.r_[0:6]), @@ -199,7 +219,7 @@ def test_cluster_sigmin_pruning(reach, n_child, members): # Build cluster tree inplace on root node _cluster_tree(root, None, cluster_boundaries, reach, ordering, - 5, .75, .7, .4, .3) + 5, .75, .7, .4, .3, 1) assert_equal(root.split_point, cluster_boundaries[0]) assert_equal(n_child, len(root.children)) assert_array_equal(members, root.children[0].points)