diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index e2b18cd0149a2..c31d818786d32 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -196,6 +196,9 @@ Changelog
   :class:`cluster.AgglomerativeClustering` and will be renamed to `metric` in
   v1.4. :pr:`23470` by :user:`Meekail Zain `.
 
+- |Enhancement| :class:`cluster.FeatureAgglomeration` preserves dtype for
+  `numpy.float32`. :pr:`24346` by :user:`LinTiong Lau `.
+
 - |Fix| :class:`cluster.KMeans` now supports readonly attributes when predicting.
   :pr:`24258` by `Thomas Fan`_
 
diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py
index ec54a915cc17a..1712988baf513 100644
--- a/sklearn/cluster/_agglomerative.py
+++ b/sklearn/cluster/_agglomerative.py
@@ -1322,7 +1322,9 @@ def fit(self, X, y=None):
             Returns the transformer.
         """
         self._validate_params()
-        X = self._validate_data(X, ensure_min_features=2)
+        X = self._validate_data(
+            X, ensure_min_features=2, dtype=[np.float64, np.float32]
+        )
         super()._fit(X.T)
         self._n_features_out = self.n_clusters_
         return self
@@ -1331,3 +1333,6 @@ def fit_predict(self):
     def fit_predict(self):
         """Fit and return the result of each sample's clustering assignment."""
         raise AttributeError
+
+    def _more_tags(self):
+        return {"preserves_dtype": [np.float64, np.float32]}
diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py
index 457a83dd41e71..553d865ec8cf1 100644
--- a/sklearn/cluster/_feature_agglomeration.py
+++ b/sklearn/cluster/_feature_agglomeration.py
@@ -38,20 +38,21 @@ def transform(self, X):
         """
         check_is_fitted(self)
 
-        X = self._validate_data(X, reset=False)
+        X = self._validate_data(X, reset=False, dtype=[np.float64, np.float32])
         if self.pooling_func == np.mean and not issparse(X):
             size = np.bincount(self.labels_)
             n_samples = X.shape[0]
             # a fast way to compute the mean of grouped features
             nX = np.array(
-                [np.bincount(self.labels_, X[i, :]) / size for i in range(n_samples)]
+                [np.bincount(self.labels_, X[i, :]) / size for i in range(n_samples)],
+                dtype=X.dtype,
             )
         else:
             nX = [
                 self.pooling_func(X[:, self.labels_ == l], axis=1)
                 for l in np.unique(self.labels_)
             ]
-            nX = np.array(nX).T
+            nX = np.array(nX, dtype=X.dtype).T
         return nX
 
     def inverse_transform(self, Xred):
diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py
index 3e4aa816b79c0..e6b6843164898 100644
--- a/sklearn/cluster/tests/test_feature_agglomeration.py
+++ b/sklearn/cluster/tests/test_feature_agglomeration.py
@@ -6,7 +6,7 @@
 from numpy.testing import assert_array_equal
 
 from sklearn.cluster import FeatureAgglomeration
-from sklearn.utils._testing import assert_array_almost_equal
+from sklearn.utils._testing import assert_allclose, assert_array_almost_equal
 from sklearn.datasets import make_blobs
 
 
@@ -53,3 +53,18 @@ def test_feature_agglomeration_feature_names_out():
     assert_array_equal(
         [f"featureagglomeration{i}" for i in range(n_clusters)], names_out
     )
+
+
+def test_feature_agglomeration_numerical_consistency(global_random_seed):
+    """Ensure numerical consistency among np.float32 and np.float64"""
+    rng = np.random.RandomState(global_random_seed)
+    X_64, _ = make_blobs(n_features=12, random_state=rng)
+    X_32 = X_64.astype(np.float32)
+
+    agglo_32 = FeatureAgglomeration(n_clusters=3)
+    agglo_64 = FeatureAgglomeration(n_clusters=3)
+
+    X_trans_64 = agglo_64.fit_transform(X_64)
+    X_trans_32 = agglo_32.fit_transform(X_32)
+
+    assert_allclose(X_trans_32, X_trans_64)