ENH Add `np.float32` data support for `HDBSCAN` by Micky774 · Pull Request #26888 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH Add np.float32 data support for HDBSCAN #26888

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
sklearn/cluster/_hdbscan/_linkage.pyx

# Default JupyterLite content
jupyterlite_contents
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def check_package_status(package, min_version):
{"sources": ["_k_means_minibatch.pyx"], "include_np": True},
],
"cluster._hdbscan": [
{"sources": ["_linkage.pyx"], "include_np": True},
{"sources": ["_linkage.pyx.tp"], "include_np": True},
{"sources": ["_reachability.pyx"], "include_np": True},
{"sources": ["_tree.pyx"], "include_np": True},
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,28 @@
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
{{py:

implementation_specific_values = [
# Values are the following ones:
#
# name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
('64', 'float64_t', 'np.float64'),
('32', 'float32_t', 'np.float32')
]

}}

cimport numpy as cnp
from libc.float cimport DBL_MAX
from cython cimport floating

import numpy as np
from ...metrics._dist_metrics cimport DistanceMetric64
from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric32, DistanceMetric64
from ...cluster._hierarchical_fast cimport UnionFind
from ...cluster._hdbscan._tree cimport HIERARCHY_t
from ...cluster._hdbscan._tree import HIERARCHY_dtype
from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t
from ...utils._typedefs cimport float32_t, float64_t, intp_t, int64_t, uint8_t

cdef extern from "numpy/arrayobject.h":
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
Expand Down Expand Up @@ -107,11 +119,23 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(

return mst


cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
const float64_t[:, ::1] raw_data,
const float64_t[::1] core_distances,
DistanceMetric64 dist_metric,
const floating[:, ::1] raw_data,
const floating[::1] core_distances,
DistanceMetric dist_metric,
float64_t alpha=1.0
):
if floating is double:
return mst_from_data_matrix64(raw_data, core_distances, dist_metric, alpha)
else:
return mst_from_data_matrix32(raw_data, core_distances, dist_metric, alpha)

{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}

cdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix{{name_suffix}}(
const {{INPUT_DTYPE_t}}[:, ::1] raw_data,
const {{INPUT_DTYPE_t}}[::1] core_distances,
DistanceMetric{{name_suffix}} dist_metric,
float64_t alpha=1.0
):
"""Compute the Minimum Spanning Tree (MST) representation of the mutual-
Expand Down Expand Up @@ -218,6 +242,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
current_node = new_node

return mst
{{endfor}}

cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst):
"""Construct a single-linkage tree from an MST.
Expand Down
12 changes: 7 additions & 5 deletions sklearn/cluster/_hdbscan/hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,10 +337,10 @@ def _hdbscan_prims(
n_jobs=n_jobs,
p=None,
).fit(X)

# TODO: Resume when {KD, Ball}Tree support 32-bit
neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True)
core_distances = np.ascontiguousarray(neighbors_distances[:, -1])
dist_metric = DistanceMetric.get_metric(metric, **metric_params)
dist_metric = DistanceMetric.get_metric(metric, dtype=X.dtype, **metric_params)

# Mutual reachability distance is implicit in mst_from_data_matrix
min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha)
Expand Down Expand Up @@ -701,7 +701,7 @@ def fit(self, X, y=None):
X,
accept_sparse=["csr", "lil"],
force_all_finite=False,
dtype=np.float64,
dtype=(np.float64, np.float32),
)
self._raw_data = X
all_finite = True
Expand Down Expand Up @@ -735,15 +735,17 @@ def fit(self, X, y=None):
X = self._validate_data(
X,
accept_sparse=["csr", "lil"],
dtype=np.float64,
dtype=(np.float64, np.float32),
)
else:
# Only non-sparse, precomputed distance matrices are handled here
# and thereby allowed to contain numpy.inf for missing distances

# Perform data validation after removing infinite values (numpy.inf)
# from the given distance matrix.
X = self._validate_data(X, force_all_finite=False, dtype=np.float64)
X = self._validate_data(
X, force_all_finite=False, dtype=(np.float64, np.float32)
)
if np.isnan(X).any():
# TODO: Support np.nan in Cython implementation for precomputed
# dense HDBSCAN
Expand Down
8 changes: 4 additions & 4 deletions sklearn/cluster/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,13 @@ def test_outlier_data(outlier_type):
assert_array_equal(clean_model.labels_, model.labels_[clean_indices])


def test_hdbscan_distance_matrix():
def test_hdbscan_distance_matrix(global_dtype):
"""
Tests that HDBSCAN works with precomputed distance matrices, and throws the
appropriate errors when needed.
"""
D = euclidean_distances(X)
D_original = D.copy()
D_original = D.copy().astype(global_dtype)
labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(D)

assert_allclose(D, D_original)
Expand Down Expand Up @@ -118,12 +118,12 @@ def test_hdbscan_sparse_distance_matrix(sparse_constructor):
assert n_clusters == n_clusters_true


def test_hdbscan_feature_array():
def test_hdbscan_feature_array(global_dtype):
"""
Tests that HDBSCAN works with feature array, including an arbitrary
goodness of fit check. Note that the check is a simple heuristic.
"""
labels = HDBSCAN().fit_predict(X)
labels = HDBSCAN().fit_predict(X.astype(global_dtype))
n_clusters = len(set(labels) - OUTLIER_SET)
assert n_clusters == n_clusters_true

Expand Down
0