diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx
index 3d6bed3e8df34..0e493f28379eb 100644
--- a/sklearn/cluster/_hdbscan/_tree.pyx
+++ b/sklearn/cluster/_hdbscan/_tree.pyx
@@ -4,45 +4,51 @@
 import numpy as np
-cimport numpy as np
+cimport numpy as cnp
 import cython

-cdef np.double_t INFTY = np.inf
+cdef cnp.float64_t INFTY = np.inf
+cdef cnp.intp_t NOISE = -1


-cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim=2] hierarchy,
-                             np.intp_t bfs_root):
+cdef list bfs_from_hierarchy(
+    cnp.ndarray[cnp.float64_t, ndim=2] hierarchy,
+    cnp.intp_t bfs_root
+):
     """
     Perform a breadth first search on a tree in scipy hclust format.
     """
-    cdef list to_process
-    cdef np.intp_t max_node
-    cdef np.intp_t num_points
-    cdef np.intp_t dim
-
-    dim = hierarchy.shape[0]
-    max_node = 2 * dim
-    num_points = max_node - dim + 1
-
-    to_process = [bfs_root]
+    cdef list process_queue
+    cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1
+    process_queue = [bfs_root]
     result = []

-    while to_process:
-        result.extend(to_process)
-        to_process = [x - num_points for x in
-                      to_process if x >= num_points]
-        if to_process:
-            to_process = hierarchy[to_process,
-                                   :2].flatten().astype(np.intp).tolist()
+    while process_queue:
+        result.extend(process_queue)
+        process_queue = [
+            x - n_samples
+            for x in process_queue
+            if x >= n_samples
+        ]
+        if process_queue:
+            process_queue = (
+                hierarchy[process_queue, :2]
+                .flatten()
+                .astype(np.intp)
+                .tolist()
+            )

     return result


-cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
-                               np.intp_t min_cluster_size=10):
+cpdef cnp.ndarray condense_tree(
+    cnp.ndarray[cnp.float64_t, ndim=2] hierarchy,
+    cnp.intp_t min_cluster_size=10
+):
     """Condense a tree according to a minimum cluster size. This is akin
     to the runt pruning procedure of Stuetzle. The result is a much simpler
     tree that is easier to visualize. We include extra information on the
@@ -51,12 +57,12 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,

     Parameters
     ----------
-    hierarchy : ndarray (n_samples, 4)
+    hierarchy : ndarray (n_samples - 1, 4)
         A single linkage hierarchy in scipy.cluster.hierarchy format.

     min_cluster_size : int, optional (default 10)
-        The minimum size of clusters to consider. Smaller "runt"
-        clusters are pruned from the tree.
+        The minimum size of clusters to consider. Clusters smaller than this
+        are pruned from the tree.

     Returns
     -------
@@ -65,217 +71,191 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
         and child_size in each row providing a tree structure.
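+
+    Examples
+    --------
+    An illustrative sketch only (this is a private module, so the import
+    below is an assumption and may change; scipy's ``linkage`` supplies the
+    expected input format):
+
+    >>> import numpy as np
+    >>> from scipy.cluster.hierarchy import linkage
+    >>> from sklearn.cluster._hdbscan._tree import condense_tree
+    >>> X = np.random.RandomState(0).uniform(size=(30, 2))
+    >>> slt = linkage(X, method='single')  # shape (n_samples - 1, 4)
+    >>> condensed = condense_tree(slt, min_cluster_size=5)
+    >>> condensed.dtype.names
+    ('parent', 'child', 'lambda_val', 'child_size')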
""" - cdef np.intp_t root - cdef np.intp_t num_points - cdef np.intp_t next_label - cdef list node_list - cdef list result_list - - cdef np.ndarray[np.intp_t, ndim=1] relabel - cdef np.ndarray[np.int_t, ndim=1] ignore - cdef np.ndarray[np.double_t, ndim=1] children + cdef: + cnp.intp_t root = 2 * hierarchy.shape[0] + cnp.intp_t n_samples = hierarchy.shape[0] + 1 + cnp.intp_t next_label = n_samples + 1 + list result_list, node_list = bfs_from_hierarchy(hierarchy, root) - cdef np.intp_t node - cdef np.intp_t sub_node - cdef np.intp_t left - cdef np.intp_t right - cdef double lambda_value - cdef np.intp_t left_count - cdef np.intp_t right_count - - root = 2 * hierarchy.shape[0] - num_points = root // 2 + 1 - next_label = num_points + 1 - - node_list = bfs_from_hierarchy(hierarchy, root) + cnp.intp_t[::1] relabel + cnp.uint8_t[::1] ignore + cnp.intp_t node, sub_node, left, right + cnp.float64_t lambda_value, distance + cnp.intp_t left_count, right_count relabel = np.empty(root + 1, dtype=np.intp) - relabel[root] = num_points + relabel[root] = n_samples result_list = [] - ignore = np.zeros(len(node_list), dtype=int) + ignore = np.zeros(len(node_list), dtype=bool) for node in node_list: - if ignore[node] or node < num_points: + if ignore[node] or node < n_samples: continue - children = hierarchy[node - num_points] - left = children[0] - right = children[1] - if children[2] > 0.0: - lambda_value = 1.0 / children[2] + children = hierarchy[node - n_samples] + left = children[0] + right = children[1] + distance = children[2] + if distance > 0.0: + lambda_value = 1.0 / distance else: lambda_value = INFTY - if left >= num_points: - left_count = hierarchy[left - num_points][3] + if left >= n_samples: + left_count = hierarchy[left - n_samples][3] else: left_count = 1 - if right >= num_points: - right_count = hierarchy[right - num_points][3] + if right >= n_samples: + right_count = hierarchy[right - n_samples][3] else: right_count = 1 if left_count >= min_cluster_size and right_count >= min_cluster_size: relabel[left] = next_label next_label += 1 - result_list.append((relabel[node], relabel[left], lambda_value, - left_count)) + result_list.append( + (relabel[node], relabel[left], lambda_value, left_count) + ) relabel[right] = next_label next_label += 1 - result_list.append((relabel[node], relabel[right], lambda_value, - right_count)) + result_list.append( + (relabel[node], relabel[right], lambda_value, right_count) + ) elif left_count < min_cluster_size and right_count < min_cluster_size: for sub_node in bfs_from_hierarchy(hierarchy, left): - if sub_node < num_points: - result_list.append((relabel[node], sub_node, - lambda_value, 1)) + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) ignore[sub_node] = True for sub_node in bfs_from_hierarchy(hierarchy, right): - if sub_node < num_points: - result_list.append((relabel[node], sub_node, - lambda_value, 1)) + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) ignore[sub_node] = True elif left_count < min_cluster_size: relabel[right] = relabel[node] for sub_node in bfs_from_hierarchy(hierarchy, left): - if sub_node < num_points: - result_list.append((relabel[node], sub_node, - lambda_value, 1)) + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) ignore[sub_node] = True else: relabel[left] = relabel[node] for sub_node in bfs_from_hierarchy(hierarchy, right): - if sub_node < num_points: - result_list.append((relabel[node], 
sub_node, - lambda_value, 1)) + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) ignore[sub_node] = True return np.array(result_list, dtype=[('parent', np.intp), ('child', np.intp), - ('lambda_val', float), + ('lambda_val', np.float64), ('child_size', np.intp)]) -cpdef dict compute_stability(np.ndarray condensed_tree): - - cdef np.ndarray[np.double_t, ndim=1] result_arr - cdef np.ndarray sorted_child_data - cdef np.ndarray[np.intp_t, ndim=1] sorted_children - cdef np.ndarray[np.double_t, ndim=1] sorted_lambdas +cpdef dict compute_stability(cnp.ndarray condensed_tree): - cdef np.ndarray[np.intp_t, ndim=1] parents - cdef np.ndarray[np.intp_t, ndim=1] sizes - cdef np.ndarray[np.double_t, ndim=1] lambdas + cdef: + cnp.float64_t[::1] result, births + cnp.ndarray condensed_node + cnp.intp_t[:] parents = condensed_tree['parent'] + cnp.float64_t[:] lambdas = condensed_tree['lambda_val'] + cnp.intp_t[:] sizes = condensed_tree['child_size'] - cdef np.intp_t child - cdef np.intp_t parent - cdef np.intp_t child_size - cdef np.intp_t result_index - cdef np.intp_t current_child - cdef np.float64_t lambda_ - cdef np.float64_t min_lambda + cnp.intp_t parent, cluster_size, result_index + cnp.float64_t lambda_val + cnp.float64_t[:, :] result_pre_dict + cnp.intp_t largest_child = condensed_tree['child'].max() + cnp.intp_t smallest_cluster = np.min(parents) + cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1 + cnp.ndarray sorted_child_data = np.sort(condensed_tree[['child', 'lambda_val']], axis=0) + cnp.intp_t[:] sorted_children = sorted_child_data['child'].copy() + cnp.float64_t[:] sorted_lambdas = sorted_child_data['lambda_val'].copy() - cdef np.ndarray[np.double_t, ndim=1] births_arr - cdef np.double_t *births - - cdef np.intp_t largest_child = condensed_tree['child'].max() - cdef np.intp_t smallest_cluster = condensed_tree['parent'].min() - cdef np.intp_t num_clusters = (condensed_tree['parent'].max() - - smallest_cluster + 1) + largest_child = max(largest_child, smallest_cluster) + births = np.full(largest_child + 1, np.nan, dtype=np.float64) if largest_child < smallest_cluster: largest_child = smallest_cluster - sorted_child_data = np.sort(condensed_tree[['child', 'lambda_val']], - axis=0) - births_arr = np.nan * np.ones(largest_child + 1, dtype=np.double) - births = ( births_arr.data) - sorted_children = sorted_child_data['child'].copy() - sorted_lambdas = sorted_child_data['lambda_val'].copy() - - parents = condensed_tree['parent'] - sizes = condensed_tree['child_size'] - lambdas = condensed_tree['lambda_val'] - + births = np.nan * np.ones(largest_child + 1, dtype=np.float64) current_child = -1 min_lambda = 0 - - for row in range(sorted_child_data.shape[0]): - child = sorted_children[row] - lambda_ = sorted_lambdas[row] + for idx in range(condensed_tree.shape[0]): + child = sorted_children[idx] + lambda_val = sorted_lambdas[idx] if child == current_child: - min_lambda = min(min_lambda, lambda_) + min_lambda = min(min_lambda, lambda_val) elif current_child != -1: births[current_child] = min_lambda current_child = child - min_lambda = lambda_ + min_lambda = lambda_val else: # Initialize current_child = child - min_lambda = lambda_ + min_lambda = lambda_val if current_child != -1: births[current_child] = min_lambda births[smallest_cluster] = 0.0 - result_arr = np.zeros(num_clusters, dtype=np.double) + result = np.zeros(num_clusters, dtype=np.float64) + for idx in range(condensed_tree.shape[0]): + parent = parents[idx] + lambda_val = 
lambdas[idx] + child_size = sizes[idx] - for i in range(condensed_tree.shape[0]): - parent = parents[i] - lambda_ = lambdas[i] - child_size = sizes[i] result_index = parent - smallest_cluster + result[result_index] += (lambda_val - births[parent]) * child_size - result_arr[result_index] += (lambda_ - births[parent]) * child_size - - result_pre_dict = np.vstack((np.arange(smallest_cluster, - condensed_tree['parent'].max() + 1), - result_arr)).T + result_pre_dict = np.vstack( + ( + np.arange(smallest_cluster, np.max(parents) + 1), + result + ) + ).T return dict(result_pre_dict) -cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root): +cdef list bfs_from_cluster_tree(cnp.ndarray hierarchy, cnp.intp_t bfs_root): cdef list result - cdef np.ndarray[np.intp_t, ndim=1] to_process + cdef cnp.ndarray[cnp.intp_t, ndim=1] to_process result = [] to_process = np.array([bfs_root], dtype=np.intp) while to_process.shape[0] > 0: result.extend(to_process.tolist()) - to_process = tree['child'][np.in1d(tree['parent'], to_process)] + to_process = hierarchy['child'][np.in1d(hierarchy['parent'], to_process)] return result -cdef max_lambdas(np.ndarray tree): - - cdef np.ndarray sorted_parent_data - cdef np.ndarray[np.intp_t, ndim=1] sorted_parents - cdef np.ndarray[np.double_t, ndim=1] sorted_lambdas +cdef max_lambdas(cnp.ndarray hierarchy): - cdef np.intp_t parent - cdef np.intp_t current_parent - cdef np.float64_t lambda_ - cdef np.float64_t max_lambda + cdef: + cnp.ndarray sorted_parent_data + cnp.intp_t[:] sorted_parents + cnp.float64_t[:] sorted_lambdas, deaths + cnp.intp_t parent, current_parent + cnp.float64_t lambda_val, max_lambda + cnp.intp_t largest_parent = hierarchy['parent'].max() - cdef np.ndarray[np.double_t, ndim=1] deaths_arr - cdef np.double_t *deaths - - cdef np.intp_t largest_parent = tree['parent'].max() - - sorted_parent_data = np.sort(tree[['parent', 'lambda_val']], axis=0) - deaths_arr = np.zeros(largest_parent + 1, dtype=np.double) - deaths = ( deaths_arr.data) + sorted_parent_data = np.sort(hierarchy[['parent', 'lambda_val']], axis=0) + deaths = np.zeros(largest_parent + 1, dtype=np.float64) sorted_parents = sorted_parent_data['parent'] sorted_lambdas = sorted_parent_data['lambda_val'] @@ -283,41 +263,40 @@ cdef max_lambdas(np.ndarray tree): max_lambda = 0 for row in range(sorted_parent_data.shape[0]): - parent = sorted_parents[row] - lambda_ = sorted_lambdas[row] + parent = sorted_parents[row] + lambda_val = sorted_lambdas[row] if parent == current_parent: - max_lambda = max(max_lambda, lambda_) + max_lambda = max(max_lambda, lambda_val) elif current_parent != -1: deaths[current_parent] = max_lambda current_parent = parent - max_lambda = lambda_ + max_lambda = lambda_val else: # Initialize current_parent = parent - max_lambda = lambda_ + max_lambda = lambda_val deaths[current_parent] = max_lambda # value for last parent - return deaths_arr + return deaths cdef class TreeUnionFind (object): - cdef np.ndarray _data_arr - cdef np.intp_t[:, ::1] _data - cdef np.ndarray is_component + cdef cnp.ndarray _data_arr + cdef cnp.intp_t[:, ::1] _data + cdef cnp.ndarray is_component def __init__(self, size): self._data_arr = np.zeros((size, 2), dtype=np.intp) self._data_arr.T[0] = np.arange(size) - self._data = ( ( - self._data_arr.data)) + self._data = self._data_arr self.is_component = np.ones(size, dtype=bool) - cdef union_(self, np.intp_t x, np.intp_t y): - cdef np.intp_t x_root = self.find(x) - cdef np.intp_t y_root = self.find(y) + cdef union_(self, cnp.intp_t x, cnp.intp_t y): + 
cdef cnp.intp_t x_root = self.find(x) + cdef cnp.intp_t y_root = self.find(y) if self._data[x_root, 1] < self._data[y_root, 1]: self._data[x_root, 0] = y_root @@ -329,20 +308,21 @@ cdef class TreeUnionFind (object): return - cdef find(self, np.intp_t x): + cdef find(self, cnp.intp_t x): if self._data[x, 0] != x: self._data[x, 0] = self.find(self._data[x, 0]) self.is_component[x] = False return self._data[x, 0] - cdef np.ndarray[np.intp_t, ndim=1] components(self): + cdef cnp.ndarray[cnp.intp_t, ndim=1] components(self): return self.is_component.nonzero()[0] -cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut( - np.ndarray linkage, - np.double_t cut, - np.intp_t min_cluster_size): +cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( + cnp.ndarray linkage, + cnp.float64_t cut, + cnp.intp_t min_cluster_size +): """Given a single linkage tree and a cut value, return the vector of cluster labels at that cut value. This is useful for Robust Single Linkage, and extracting DBSCAN results @@ -350,7 +330,7 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut( Parameters ---------- - linkage : ndarray (n_samples, 4) + linkage : ndarray (n_samples - 1, 4) The single linkage tree in scipy.cluster.hierarchy format. cut : double @@ -362,90 +342,77 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut( Returns ------- - labels : ndarray (n_samples,) + labels : ndarray of shape (n_samples,) The cluster labels for each point in the data set; a label of -1 denotes a noise assignment. """ - cdef np.intp_t root - cdef np.intp_t num_points - cdef np.ndarray[np.intp_t, ndim=1] result_arr - cdef np.ndarray[np.intp_t, ndim=1] unique_labels - cdef np.ndarray[np.intp_t, ndim=1] cluster_size - cdef np.intp_t *result - cdef TreeUnionFind union_find - cdef np.intp_t n - cdef np.intp_t cluster - cdef np.intp_t cluster_id + cdef: + cnp.intp_t n, cluster, cluster_id, root, n_samples + cnp.ndarray[cnp.intp_t, ndim=1] result + cnp.intp_t[:] unique_labels, cluster_size + TreeUnionFind union_find root = 2 * linkage.shape[0] - num_points = root // 2 + 1 - - result_arr = np.empty(num_points, dtype=np.intp) - result = ( result_arr.data) + n_samples = root // 2 + 1 + result = np.empty(n_samples, dtype=np.intp) + union_find = TreeUnionFind( root + 1) - union_find = TreeUnionFind( root + 1) - - cluster = num_points + cluster = n_samples for row in linkage: if row[2] < cut: - union_find.union_( row[0], cluster) - union_find.union_( row[1], cluster) + union_find.union_( row[0], cluster) + union_find.union_( row[1], cluster) cluster += 1 cluster_size = np.zeros(cluster, dtype=np.intp) - for n in range(num_points): + for n in range(n_samples): cluster = union_find.find(n) cluster_size[cluster] += 1 result[n] = cluster - cluster_label_map = {-1: -1} + cluster_label_map = {-1: NOISE} cluster_label = 0 - unique_labels = np.unique(result_arr) + unique_labels = np.unique(result) for cluster in unique_labels: if cluster_size[cluster] < min_cluster_size: - cluster_label_map[cluster] = -1 + cluster_label_map[cluster] = NOISE else: cluster_label_map[cluster] = cluster_label cluster_label += 1 - for n in range(num_points): + for n in range(n_samples): result[n] = cluster_label_map[result[n]] - return result_arr + return result -cdef np.ndarray[np.intp_t, ndim=1] do_labelling( - np.ndarray tree, +cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( + cnp.ndarray hierarchy, set clusters, dict cluster_label_map, - np.intp_t allow_single_cluster, - np.double_t cluster_selection_epsilon): - - cdef np.intp_t root_cluster - cdef 
np.ndarray[np.intp_t, ndim=1] result_arr - cdef np.ndarray[np.intp_t, ndim=1] parent_array - cdef np.ndarray[np.intp_t, ndim=1] child_array - cdef np.ndarray[np.double_t, ndim=1] lambda_array - cdef np.intp_t *result - cdef TreeUnionFind union_find - cdef np.intp_t parent - cdef np.intp_t child - cdef np.intp_t n - cdef np.intp_t cluster - - child_array = tree['child'] - parent_array = tree['parent'] - lambda_array = tree['lambda_val'] - - root_cluster = parent_array.min() - result_arr = np.empty(root_cluster, dtype=np.intp) - result = ( result_arr.data) - - union_find = TreeUnionFind(parent_array.max() + 1) - - for n in range(tree.shape[0]): + cnp.intp_t allow_single_cluster, + cnp.float64_t cluster_selection_epsilon +): + + cdef: + cnp.intp_t root_cluster + cnp.ndarray[cnp.intp_t, ndim=1] result + cnp.intp_t[:] parent_array, child_array + cnp.float64_t[:] lambda_array + TreeUnionFind union_find + cnp.intp_t n, parent, child, cluster + + child_array = hierarchy['child'] + parent_array = hierarchy['parent'] + lambda_array = hierarchy['lambda_val'] + + root_cluster = np.min(parent_array) + result = np.empty(root_cluster, dtype=np.intp) + union_find = TreeUnionFind(np.max(parent_array) + 1) + + for n in range(hierarchy.shape[0]): child = child_array[n] parent = parent_array[n] if child not in clusters: @@ -454,57 +421,51 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling( for n in range(root_cluster): cluster = union_find.find(n) if cluster < root_cluster: - result[n] = -1 + result[n] = NOISE elif cluster == root_cluster: if len(clusters) == 1 and allow_single_cluster: if cluster_selection_epsilon != 0.0: - if tree['lambda_val'][tree['child'] == n] >= 1 / cluster_selection_epsilon : + if hierarchy['lambda_val'][hierarchy['child'] == n] >= 1 / cluster_selection_epsilon : result[n] = cluster_label_map[cluster] else: - result[n] = -1 - elif tree['lambda_val'][tree['child'] == n] >= \ - tree['lambda_val'][tree['parent'] == cluster].max(): + result[n] = NOISE + elif hierarchy['lambda_val'][hierarchy['child'] == n] >= \ + hierarchy['lambda_val'][hierarchy['parent'] == cluster].max(): result[n] = cluster_label_map[cluster] else: - result[n] = -1 + result[n] = NOISE else: - result[n] = -1 + result[n] = NOISE else: result[n] = cluster_label_map[cluster] - return result_arr + return result -cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels): +cdef get_probabilities(cnp.ndarray hierarchy, dict cluster_map, cnp.ndarray labels): - cdef np.ndarray[np.double_t, ndim=1] result - cdef np.ndarray[np.double_t, ndim=1] deaths - cdef np.ndarray[np.double_t, ndim=1] lambda_array - cdef np.ndarray[np.intp_t, ndim=1] child_array - cdef np.ndarray[np.intp_t, ndim=1] parent_array - cdef np.intp_t root_cluster - cdef np.intp_t n - cdef np.intp_t point - cdef np.intp_t cluster_num - cdef np.intp_t cluster - cdef np.double_t max_lambda - cdef np.double_t lambda_ + cdef: + cnp.ndarray[cnp.float64_t, ndim=1] result + cnp.float64_t[:] lambda_array + cnp.float64_t[::1] deaths + cnp.intp_t[:] child_array, parent_array + cnp.intp_t root_cluster, n, point, cluster_num, cluster + cnp.float64_t max_lambda, lambda_val - child_array = tree['child'] - parent_array = tree['parent'] - lambda_array = tree['lambda_val'] + child_array = hierarchy['child'] + parent_array = hierarchy['parent'] + lambda_array = hierarchy['lambda_val'] result = np.zeros(labels.shape[0]) - deaths = max_lambdas(tree) - root_cluster = parent_array.min() + deaths = max_lambdas(hierarchy) + root_cluster = np.min(parent_array) - for n in 
range(tree.shape[0]): + for n in range(hierarchy.shape[0]): point = child_array[n] if point >= root_cluster: continue cluster_num = labels[point] - if cluster_num == -1: continue @@ -513,13 +474,16 @@ cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels): if max_lambda == 0.0 or not np.isfinite(lambda_array[n]): result[point] = 1.0 else: - lambda_ = min(lambda_array[n], max_lambda) - result[point] = lambda_ / max_lambda + lambda_val = min(lambda_array[n], max_lambda) + result[point] = lambda_val / max_lambda return result -cpdef list recurse_leaf_dfs(np.ndarray cluster_tree, np.intp_t current_node): +cpdef list recurse_leaf_dfs(cnp.ndarray cluster_tree, cnp.intp_t current_node): + cdef cnp.intp_t[:] children + cdef cnp.intp_t child + children = cluster_tree[cluster_tree['parent'] == current_node]['child'] if len(children) == 0: return [current_node,] @@ -527,13 +491,21 @@ cpdef list recurse_leaf_dfs(np.ndarray cluster_tree, np.intp_t current_node): return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], []) -cpdef list get_cluster_tree_leaves(np.ndarray cluster_tree): +cpdef list get_cluster_tree_leaves(cnp.ndarray cluster_tree): + cdef cnp.intp_t root if cluster_tree.shape[0] == 0: return [] root = cluster_tree['parent'].min() return recurse_leaf_dfs(cluster_tree, root) -cpdef np.intp_t traverse_upwards(np.ndarray cluster_tree, np.double_t cluster_selection_epsilon, np.intp_t leaf, np.intp_t allow_single_cluster): +cdef cnp.intp_t traverse_upwards( + cnp.ndarray cluster_tree, + cnp.float64_t cluster_selection_epsilon, + cnp.intp_t leaf, + cnp.intp_t allow_single_cluster +): + cdef cnp.intp_t root, parent + cdef cnp.float64_t parent_eps root = cluster_tree['parent'].min() parent = cluster_tree[cluster_tree['child'] == leaf]['parent'] @@ -547,18 +519,35 @@ cpdef np.intp_t traverse_upwards(np.ndarray cluster_tree, np.double_t cluster_se if parent_eps > cluster_selection_epsilon: return parent else: - return traverse_upwards(cluster_tree, cluster_selection_epsilon, parent, allow_single_cluster) - -cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cluster_selection_epsilon, np.intp_t allow_single_cluster): - - selected_clusters = list() - processed = list() + return traverse_upwards( + cluster_tree, + cluster_selection_epsilon, + parent, + allow_single_cluster + ) + +cdef set epsilon_search( + set leaves, + cnp.ndarray cluster_tree, + cnp.float64_t cluster_selection_epsilon, + cnp.intp_t allow_single_cluster +): + cdef: + list selected_clusters = list() + list processed = list() + cnp.intp_t leaf, epsilon_child, sub_node + cnp.float64_t eps for leaf in leaves: eps = 1/cluster_tree['lambda_val'][cluster_tree['child'] == leaf][0] if eps < cluster_selection_epsilon: if leaf not in processed: - epsilon_child = traverse_upwards(cluster_tree, cluster_selection_epsilon, leaf, allow_single_cluster) + epsilon_child = traverse_upwards( + cluster_tree, + cluster_selection_epsilon, + leaf, + allow_single_cluster + ) selected_clusters.append(epsilon_child) for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child): @@ -570,11 +559,14 @@ cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cluste return set(selected_clusters) @cython.wraparound(True) -cpdef tuple get_clusters(np.ndarray tree, dict stability, - cluster_selection_method='eom', - allow_single_cluster=False, - cluster_selection_epsilon=0.0, - max_cluster_size=None): +cpdef tuple get_clusters( + cnp.ndarray hierarchy, + dict stability, + 
cluster_selection_method='eom', + cnp.uint8_t allow_single_cluster=False, + cnp.float64_t cluster_selection_epsilon=0.0, + max_cluster_size=None +): """Given a tree and stability dict, produce the cluster labels (and probabilities) for a flat clustering based on the chosen cluster selection method. @@ -596,7 +588,7 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, Whether to allow a single cluster to be selected by the Excess of Mass algorithm. - cluster_selection_epsilon: float, optional (default 0.0) + cluster_selection_epsilon: double, optional (default 0.0) A distance threshold for cluster splits. max_cluster_size: int, default=None @@ -606,7 +598,7 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, Returns ------- - labels : ndarray (n_samples,) + labels : ndarray of shape (n_samples,) An integer array of cluster labels, with -1 denoting noise. probabilities : ndarray (n_samples,) @@ -615,18 +607,15 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, stabilities : ndarray (n_clusters,) The cluster coherence strengths of each cluster. """ - cdef list node_list - cdef np.ndarray cluster_tree - cdef np.ndarray child_selection - cdef dict is_cluster - cdef dict cluster_sizes - cdef float subtree_stability - cdef np.intp_t node - cdef np.intp_t sub_node - cdef np.intp_t cluster - cdef np.intp_t num_points - cdef np.ndarray labels - cdef np.double_t max_lambda + cdef: + list node_list + cnp.ndarray cluster_tree + cnp.uint8_t[:] child_selection + cnp.ndarray[cnp.intp_t, ndim=1] labels + dict is_cluster, cluster_sizes + cnp.float64_t subtree_stability, max_lambda + cnp.intp_t node, sub_node, cluster, n_samples + cnp.ndarray[cnp.float64_t, ndim=1] probs # Assume clusters are ordered by numeric id equivalent to # a topological sort of the tree; This is valid given the @@ -638,13 +627,13 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, node_list = sorted(stability.keys(), reverse=True)[:-1] # (exclude root) - cluster_tree = tree[tree['child_size'] > 1] + cluster_tree = hierarchy[hierarchy['child_size'] > 1] is_cluster = {cluster: True for cluster in node_list} - num_points = np.max(tree[tree['child_size'] == 1]['child']) + 1 - max_lambda = np.max(tree['lambda_val']) + n_samples = np.max(hierarchy[hierarchy['child_size'] == 1]['child']) + 1 + max_lambda = np.max(hierarchy['lambda_val']) if max_cluster_size is None: - max_cluster_size = num_points + 1 # Set to a value that will never be triggered + max_cluster_size = n_samples + 1 # Set to a value that will never be triggered cluster_sizes = {child: child_size for child, child_size in zip(cluster_tree['child'], cluster_tree['child_size'])} if allow_single_cluster: @@ -674,7 +663,12 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, if allow_single_cluster: selected_clusters = eom_clusters else: - selected_clusters = epsilon_search(set(eom_clusters), cluster_tree, cluster_selection_epsilon, allow_single_cluster) + selected_clusters = epsilon_search( + set(eom_clusters), + cluster_tree, + cluster_selection_epsilon, + allow_single_cluster + ) for c in is_cluster: if c in selected_clusters: is_cluster[c] = True @@ -686,10 +680,15 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, if len(leaves) == 0: for c in is_cluster: is_cluster[c] = False - is_cluster[tree['parent'].min()] = True + is_cluster[hierarchy['parent'].min()] = True if cluster_selection_epsilon != 0.0: - selected_clusters = epsilon_search(leaves, cluster_tree, cluster_selection_epsilon, allow_single_cluster) + 
selected_clusters = epsilon_search( + leaves, + cluster_tree, + cluster_selection_epsilon, + allow_single_cluster + ) else: selected_clusters = leaves @@ -698,16 +697,18 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, is_cluster[c] = True else: is_cluster[c] = False - else: - raise ValueError('Invalid Cluster Selection Method: %s\n' - 'Should be one of: "eom", "leaf"\n') clusters = set([c for c in is_cluster if is_cluster[c]]) cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))} reverse_cluster_map = {n: c for c, n in cluster_map.items()} - labels = do_labelling(tree, clusters, cluster_map, - allow_single_cluster, cluster_selection_epsilon) - probs = get_probabilities(tree, reverse_cluster_map, labels) + labels = do_labelling( + hierarchy, + clusters, + cluster_map, + allow_single_cluster, + cluster_selection_epsilon + ) + probs = get_probabilities(hierarchy, reverse_cluster_map, labels) return (labels, probs)
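
Reviewer note: an end-to-end sketch of how the refactored helpers chain
together. This is illustrative only -- the module is private, the synthetic
data and variable names below are assumptions, and `labels` uses -1 (NOISE)
for unclustered samples.

    import numpy as np
    from scipy.cluster.hierarchy import linkage
    from sklearn.cluster._hdbscan._tree import (
        condense_tree,
        compute_stability,
        get_clusters,
        labelling_at_cut,
    )

    # Two well-separated Gaussian blobs.
    rng = np.random.RandomState(42)
    X = np.vstack([
        rng.normal(0.0, 0.1, size=(25, 2)),
        rng.normal(1.0, 0.1, size=(25, 2)),
    ])

    # Single-linkage tree in scipy format, shape (n_samples - 1, 4).
    slt = linkage(X, method='single')

    # HDBSCAN-style flat clustering: condense, score stability, select.
    condensed = condense_tree(slt, min_cluster_size=5)
    stability = compute_stability(condensed)
    labels, probabilities = get_clusters(
        condensed,
        stability,
        cluster_selection_method='eom',
        allow_single_cluster=False,
        cluster_selection_epsilon=0.0,
    )

    # DBSCAN-style flat labelling from the same tree at a fixed cut value.
    dbscan_labels = labelling_at_cut(slt, 0.3, 5)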