diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 61e074e56a657..ef2308997f898 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -95,6 +95,10 @@ Changelog
   See :func:`cluster.spectral_clustering` for more details.
   :pr:`21148` by :user:`Andrew Knyazev <lobpcg>`
 
+- |Enhancement| Adds :term:`get_feature_names_out` to :class:`cluster.Birch`,
+  :class:`cluster.FeatureAgglomeration`, :class:`cluster.KMeans`,
+  :class:`cluster.MiniBatchKMeans`. :pr:`22255` by `Thomas Fan`_.
+
 - |Efficiency| In :class:`cluster.KMeans`, the default ``algorithm`` is now
   ``"lloyd"`` which is the full classical EM-style algorithm. Both ``"auto"``
   and ``"full"`` are deprecated and will be removed in version 1.3. They are
diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py
index 4bc49ea2301e6..68ab834202753 100644
--- a/sklearn/cluster/_agglomerative.py
+++ b/sklearn/cluster/_agglomerative.py
@@ -14,7 +14,7 @@
 from scipy import sparse
 from scipy.sparse.csgraph import connected_components
 
-from ..base import BaseEstimator, ClusterMixin
+from ..base import BaseEstimator, ClusterMixin, _ClassNamePrefixFeaturesOutMixin
 from ..metrics.pairwise import paired_distances
 from ..metrics import DistanceMetric
 from ..metrics._dist_metrics import METRIC_MAPPING
@@ -1054,7 +1054,9 @@ def fit_predict(self, X, y=None):
         return super().fit_predict(X, y)
 
 
-class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
+class FeatureAgglomeration(
+    _ClassNamePrefixFeaturesOutMixin, AgglomerativeClustering, AgglomerationTransform
+):
     """Agglomerate features.
 
     Recursively merges pair of clusters of features.
@@ -1236,6 +1238,7 @@ def fit(self, X, y=None):
         """
         X = self._validate_data(X, ensure_min_features=2)
         super()._fit(X.T)
+        self._n_features_out = self.n_clusters_
         return self
 
     @property
diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py
index 8e86d8dd6ba08..3e47cc7b74492 100644
--- a/sklearn/cluster/_birch.py
+++ b/sklearn/cluster/_birch.py
@@ -11,7 +11,12 @@
 
 from ..metrics import pairwise_distances_argmin
 from ..metrics.pairwise import euclidean_distances
-from ..base import TransformerMixin, ClusterMixin, BaseEstimator
+from ..base import (
+    TransformerMixin,
+    ClusterMixin,
+    BaseEstimator,
+    _ClassNamePrefixFeaturesOutMixin,
+)
 from ..utils.extmath import row_norms
 from ..utils import check_scalar, deprecated
 from ..utils.validation import check_is_fitted
@@ -342,7 +347,9 @@ def radius(self):
         return sqrt(max(0, sq_radius))
 
 
-class Birch(ClusterMixin, TransformerMixin, BaseEstimator):
+class Birch(
+    _ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
+):
     """Implements the BIRCH clustering algorithm.
 
     It is a memory-efficient, online-learning algorithm provided as an
@@ -599,6 +606,7 @@ def _fit(self, X, partial):
 
         centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
         self.subcluster_centers_ = centroids
+        self._n_features_out = self.subcluster_centers_.shape[0]
 
         self._global_clustering(X)
         return self
diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
index b631b1f77b26a..51d87044f5496 100644
--- a/sklearn/cluster/_kmeans.py
+++ b/sklearn/cluster/_kmeans.py
@@ -16,7 +16,12 @@
 import numpy as np
 import scipy.sparse as sp
 
-from ..base import BaseEstimator, ClusterMixin, TransformerMixin
+from ..base import (
+    BaseEstimator,
+    ClusterMixin,
+    TransformerMixin,
+    _ClassNamePrefixFeaturesOutMixin,
+)
 from ..metrics.pairwise import euclidean_distances
 from ..metrics.pairwise import _euclidean_distances
 from ..utils.extmath import row_norms, stable_cumsum
@@ -767,7 +772,9 @@ def _labels_inertia_threadpool_limit(
     return labels, inertia
 
 
-class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
+class KMeans(
+    _ClassNamePrefixFeaturesOutMixin, TransformerMixin, ClusterMixin, BaseEstimator
+):
     """K-Means clustering.
 
     Read more in the :ref:`User Guide <k_means>`.
@@ -1240,6 +1247,7 @@ def fit(self, X, y=None, sample_weight=None):
             )
 
         self.cluster_centers_ = best_centers
+        self._n_features_out = self.cluster_centers_.shape[0]
         self.labels_ = best_labels
         self.inertia_ = best_inertia
         self.n_iter_ = best_n_iter
@@ -2020,6 +2028,7 @@ def fit(self, X, y=None, sample_weight=None):
                 break
 
         self.cluster_centers_ = centers
+        self._n_features_out = self.cluster_centers_.shape[0]
         self.n_steps_ = i + 1
         self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples))
 
@@ -2134,6 +2143,7 @@ def partial_fit(self, X, y=None, sample_weight=None):
         )
 
         self.n_steps_ += 1
+        self._n_features_out = self.cluster_centers_.shape[0]
 
         return self
diff --git a/sklearn/cluster/tests/test_birch.py b/sklearn/cluster/tests/test_birch.py
index 5d8a3222ef156..4e64524e2cb11 100644
--- a/sklearn/cluster/tests/test_birch.py
+++ b/sklearn/cluster/tests/test_birch.py
@@ -219,3 +219,14 @@ def test_birch_params_validation(params, err_type, err_msg):
     X, _ = make_blobs(n_samples=80, centers=4)
     with pytest.raises(err_type, match=err_msg):
         Birch(**params).fit(X)
+
+
+def test_feature_names_out():
+    """Check `get_feature_names_out` for `Birch`."""
+    X, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
+    brc = Birch(n_clusters=4)
+    brc.fit(X)
+    n_clusters = brc.subcluster_centers_.shape[0]
+
+    names_out = brc.get_feature_names_out()
+    assert_array_equal([f"birch{i}" for i in range(n_clusters)], names_out)
diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py
index 6d9a942e3dcfe..1f61093a9568d 100644
--- a/sklearn/cluster/tests/test_feature_agglomeration.py
+++ b/sklearn/cluster/tests/test_feature_agglomeration.py
@@ -4,8 +4,11 @@
 # Authors: Sergul Aydore 2017
 import numpy as np
 import pytest
+
+from numpy.testing import assert_array_equal
 from sklearn.cluster import FeatureAgglomeration
 from sklearn.utils._testing import assert_array_almost_equal
+from sklearn.datasets import make_blobs
 
 
 def test_feature_agglomeration():
@@ -41,3 +44,16 @@
 
     assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
     assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
+
+
+def test_feature_agglomeration_feature_names_out():
+    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
+    X, _ = make_blobs(n_features=6, random_state=0)
+    agglo = FeatureAgglomeration(n_clusters=3)
+    agglo.fit(X)
+    n_clusters = agglo.n_clusters_
+
+    names_out = agglo.get_feature_names_out()
+    assert_array_equal(
+        [f"featureagglomeration{i}" for i in range(n_clusters)], names_out
+    )
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 6e395778418c8..2d62aaaba96e9 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -1205,3 +1205,18 @@ def test_is_same_clustering():
     # mapped to a same value
     labels3 = np.array([1, 0, 0, 2, 2, 0, 2, 1], dtype=np.int32)
     assert not _is_same_clustering(labels1, labels3, 3)
+
+
+@pytest.mark.parametrize(
+    "Klass, method",
+    [(KMeans, "fit"), (MiniBatchKMeans, "fit"), (MiniBatchKMeans, "partial_fit")],
+)
+def test_feature_names_out(Klass, method):
+    """Check `feature_names_out` for `KMeans` and `MiniBatchKMeans`."""
+    class_name = Klass.__name__.lower()
+    kmeans = Klass()
+    getattr(kmeans, method)(X)
+    n_clusters = kmeans.cluster_centers_.shape[0]
+
+    names_out = kmeans.get_feature_names_out()
+    assert_array_equal([f"{class_name}{i}" for i in range(n_clusters)], names_out)
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index a8178a4219485..7f6f99dcc8b67 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -380,7 +380,6 @@ def test_pandas_column_name_consistency(estimator):
 # TODO: As more modules support get_feature_names_out they should be removed
 # from this list to be tested
 GET_FEATURES_OUT_MODULES_TO_IGNORE = [
-    "cluster",
     "ensemble",
     "isotonic",
     "kernel_approximation",
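
Taken together, the new tests pin down the public behavior. A minimal usage sketch, not part of the diff, assuming a scikit-learn build that includes this branch:

```python
from sklearn.cluster import FeatureAgglomeration, KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, n_features=6, random_state=0)

# KMeans.transform() returns one distance column per cluster center, so there
# is one generated name per cluster, prefixed with the lowercased class name.
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
print(kmeans.get_feature_names_out())
# ['kmeans0' 'kmeans1' 'kmeans2']

# FeatureAgglomeration clusters features rather than samples; each output
# name stands for one merged group of input features.
agglo = FeatureAgglomeration(n_clusters=2).fit(X)
print(agglo.get_feature_names_out())
# ['featureagglomeration0' 'featureagglomeration1']
```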
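The pattern is the same in every touched estimator: inherit `_ClassNamePrefixFeaturesOutMixin` ahead of the other mixins, then set `self._n_features_out` once `fit` knows how many output columns `transform` will produce; the mixin derives the names from that count. A sketch of the pattern with a hypothetical transformer (`TopKVariance` is invented for illustration; only the private mixin and the `_n_features_out` contract come from this diff):

```python
import numpy as np

from sklearn.base import (
    BaseEstimator,
    TransformerMixin,
    _ClassNamePrefixFeaturesOutMixin,
)


class TopKVariance(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Keep the k highest-variance columns (illustrative only)."""

    def __init__(self, k=2):
        self.k = k

    def fit(self, X, y=None):
        X = self._validate_data(X)
        # The mixin only requires `_n_features_out`; it generates names of the
        # form f"{classname.lower()}{i}" for i in range(_n_features_out).
        self._n_features_out = min(self.k, X.shape[1])
        self.columns_ = np.argsort(X.var(axis=0))[::-1][: self._n_features_out]
        return self

    def transform(self, X):
        return np.asarray(X)[:, self.columns_]


rng = np.random.RandomState(0)
est = TopKVariance(k=2).fit(rng.rand(10, 5))
print(est.get_feature_names_out())  # ['topkvariance0' 'topkvariance1']
```

Listing the mixin first mirrors the diff: it ensures the mixin's `get_feature_names_out` takes precedence in the MRO over any other base class that might define one.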