CLN `_hdbscan/_tree.pyx` algorithms refactor (#26011) · scikit-learn/scikit-learn@3041b48 · GitHub
[go: up one dir, main page]

Skip to content

Commit 3041b48

Browse files
authored
CLN _hdbscan/_tree.pyx algorithms refactor (#26011)
1 parent 7325d62 commit 3041b48

File tree

1 file changed

+19
-54
lines changed

1 file changed

+19
-54
lines changed

sklearn/cluster/_hdbscan/_tree.pyx

Lines changed: 19 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -212,55 +212,32 @@ cdef dict _compute_stability(
212212
cdef:
213213
cnp.float64_t[::1] result, births
214214
cnp.intp_t[:] parents = condensed_tree['parent']
215-
cnp.float64_t[:] lambdas = condensed_tree['value']
216-
cnp.intp_t[:] sizes = condensed_tree['cluster_size']
217215

218216
cnp.intp_t parent, cluster_size, result_index
219-
cnp.float64_t lambda_val, child_size
217+
cnp.float64_t lambda_val
218+
CONDENSED_t condensed_node
220219
cnp.float64_t[:, :] result_pre_dict
221220
cnp.intp_t largest_child = condensed_tree['child'].max()
222221
cnp.intp_t smallest_cluster = np.min(parents)
223222
cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1
224-
cnp.ndarray sorted_child_data = np.sort(condensed_tree[['child', 'value']], axis=0)
225-
cnp.intp_t[:] sorted_children = sorted_child_data['child'].copy()
226-
cnp.float64_t[:] sorted_lambdas = sorted_child_data['value'].copy()
227-
cnp.intp_t child, current_child = -1
228-
cnp.float64_t min_lambda = 0
229223

230224
largest_child = max(largest_child, smallest_cluster)
231225
births = np.full(largest_child + 1, np.nan, dtype=np.float64)
232226

233-
if largest_child < smallest_cluster:
234-
largest_child = smallest_cluster
235-
236227
births = np.full(largest_child + 1, np.nan, dtype=np.float64)
237-
for idx in range(condensed_tree.shape[0]):
238-
child = sorted_children[idx]
239-
lambda_val = sorted_lambdas[idx]
240-
241-
if child == current_child:
242-
min_lambda = min(min_lambda, lambda_val)
243-
elif current_child != -1:
244-
births[current_child] = min_lambda
245-
current_child = child
246-
min_lambda = lambda_val
247-
else:
248-
# Initialize
249-
current_child = child
250-
min_lambda = lambda_val
228+
for condensed_node in condensed_tree:
229+
births[condensed_node.child] = condensed_node.value
251230

252-
if current_child != -1:
253-
births[current_child] = min_lambda
254231
births[smallest_cluster] = 0.0
255232

256233
result = np.zeros(num_clusters, dtype=np.float64)
257-
for idx in range(condensed_tree.shape[0]):
258-
parent = parents[idx]
259-
lambda_val = lambdas[idx]
260-
child_size = sizes[idx]
234+
for condensed_node in condensed_tree:
235+
parent = condensed_node.parent
236+
lambda_val = condensed_node.value
237+
cluster_size = condensed_node.cluster_size
261238

262239
result_index = parent - smallest_cluster
263-
result[result_index] += (lambda_val - births[parent]) * child_size
240+
result[result_index] += (lambda_val - births[parent]) * cluster_size
264241

265242
result_pre_dict = np.vstack(
266243
(
@@ -293,42 +270,30 @@ cdef list bfs_from_cluster_tree(
293270
return result
294271

295272

296-
cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy):
273+
cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree):
297274

298275
cdef:
299-
cnp.ndarray sorted_parent_data
300-
cnp.intp_t[:] sorted_parents
301-
cnp.float64_t[:] sorted_lambdas
302-
cnp.float64_t[::1] deaths
303-
cnp.intp_t parent, current_parent
276+
cnp.intp_t parent, current_parent, idx
304277
cnp.float64_t lambda_val, max_lambda
305-
cnp.intp_t largest_parent = hierarchy['parent'].max()
278+
cnp.float64_t[::1] deaths
279+
cnp.intp_t largest_parent = condensed_tree['parent'].max()
306280

307-
sorted_parent_data = np.sort(hierarchy[['parent', 'value']], axis=0)
308281
deaths = np.zeros(largest_parent + 1, dtype=np.float64)
309-
sorted_parents = sorted_parent_data['parent']
310-
sorted_lambdas = sorted_parent_data['value']
311-
312-
current_parent = -1
313-
max_lambda = 0
282+
current_parent = condensed_tree[0].parent
283+
max_lambda = condensed_tree[0].value
314284

315-
for row in range(sorted_parent_data.shape[0]):
316-
parent = sorted_parents[row]
317-
lambda_val = sorted_lambdas[row]
285+
for idx in range(1, condensed_tree.shape[0]):
286+
parent = condensed_tree[idx].parent
287+
lambda_val = condensed_tree[idx].value
318288

319289
if parent == current_parent:
320290
max_lambda = max(max_lambda, lambda_val)
321-
elif current_parent != -1:
322-
deaths[current_parent] = max_lambda
323-
current_parent = parent
324-
max_lambda = lambda_val
325291
else:
326-
# Initialize
292+
deaths[current_parent] = max_lambda
327293
current_parent = parent
328294
max_lambda = lambda_val
329295

330296
deaths[current_parent] = max_lambda # value for last parent
331-
332297
return deaths
333298

334299

0 commit comments

Comments
 (0)
0