ENH Adds get_feature_names to cluster module by thomasjpfan · Pull Request #22255 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH Adds get_feature_names to cluster module #22255

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
8000
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ Changelog
See :func:`cluster.spectral_clustering` for more details.
:pr:`21148` by :user:`Andrew Knyazev <lobpcg>`

- |Enhancement| Adds :term:`get_feature_names_out` to :class:`cluster.Birch`,
:class:`cluster.FeatureAgglomeration`, :class:`cluster.KMeans`,
:class:`cluster.MiniBatchKMeans`. :pr:`22255` by `Thomas Fan`_.

- |Efficiency| In :class:`cluster.KMeans`, the default ``algorithm`` is now
``"lloyd"`` which is the full classical EM-style algorithm. Both ``"auto"``
and ``"full"`` are deprecated and will be removed in version 1.3. They are
Expand Down
7 changes: 5 additions & 2 deletions sklearn/cluster/_agglomerative.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from scipy import sparse
from scipy.sparse.csgraph import connected_components

from ..base import BaseEstimator, ClusterMixin
from ..base import BaseEstimator, ClusterMixin, _ClassNamePrefixFeaturesOutMixin
from ..metrics.pairwise import paired_distances
from ..metrics import DistanceMetric
from ..metrics._dist_metrics import METRIC_MAPPING
Expand Down Expand Up @@ -1054,7 +1054,9 @@ def fit_predict(self, X, y=None):
return super().fit_predict(X, y)


class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
class FeatureAgglomeration(
_ClassNamePrefixFeaturesOutMixin, AgglomerativeClustering, AgglomerationTransform
):
"""Agglomerate features.

Recursively merges pair of clusters of features.
Expand Down Expand Up @@ -1236,6 +1238,7 @@ def fit(self, X, y=None):
"""
X = self._validate_data(X, ensure_min_features=2)
super()._fit(X.T)
self._n_features_out = self.n_clusters_
return self

@property
Expand Down
12 changes: 10 additions & 2 deletions sklearn/cluster/_birch.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@

from ..metrics import pairwise_distances_argmin
from ..metrics.pairwise import euclidean_distances
from ..base import TransformerMixin, ClusterMixin, BaseEstimator
from ..base import (
TransformerMixin,
ClusterMixin,
BaseEstimator,
_ClassNamePrefixFeaturesOutMixin,
)
from ..utils.extmath import row_norms
from ..utils import check_scalar, deprecated
from ..utils.validation import check_is_fitted
Expand Down Expand Up @@ -342,7 +347,9 @@ def radius(self):
return sqrt(max(0, sq_radius))


class Birch(ClusterMixin, TransformerMixin, BaseEstimator):
class Birch(
_ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
):
"""Implements the BIRCH clustering algorithm.

It is a memory-efficient, online-learning algorithm provided as an
Expand Down Expand Up @@ -599,6 +606,7 @@ def _fit(self, X, partial):

centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
self.subcluster_centers_ = centroids
self._n_features_out = self.subcluster_centers_.shape[0]

self._global_clustering(X)
return self
Expand Down
14 changes: 12 additions & 2 deletions sklearn/cluster/_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
import numpy as np
import scipy.sparse as sp

from ..base import BaseEstimator, ClusterMixin, TransformerMixin
from ..base import (
BaseEstimator,
ClusterMixin,
TransformerMixin,
_ClassNamePrefixFeaturesOutMixin,
)
from ..metrics.pairwise import euclidean_distances
from ..metrics.pairwise import _euclidean_distances
from ..utils.extmath import row_norms, stable_cumsum
Expand Down Expand Up @@ -767,7 +772,9 @@ def _labels_inertia_threadpool_limit(
return labels, inertia


class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
class KMeans(
_ClassNamePrefixFeaturesOutMixin, TransformerMixin, ClusterMixin, BaseEstimator
):
"""K-Means clustering.

Read more in the :ref:`User Guide <k_means>`.
Expand Down Expand Up @@ -1240,6 +1247,7 @@ def fit(self, X, y=None, sample_weight=None):
)

self.cluster_centers_ = best_centers
self._n_features_out = self.cluster_centers_.shape[0]
self.labels_ = best_labels
self.inertia_ = best_inertia
self.n_iter_ = best_n_iter
Expand Down Expand Up @@ -2020,6 +2028,7 @@ def fit(self, X, y=None, sample_weight=None):
break

self.cluster_centers_ = centers
self._n_features_out = self.cluster_centers_.shape[0]

self.n_steps_ = i + 1
self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples))
Expand Down Expand Up @@ -2134,6 +2143,7 @@ def partial_fit(self, X, y=None, sample_weight=None):
)

self.n_steps_ += 1
self._n_features_out = self.cluster_centers_.shape[0]

return self

Expand Down
11 changes: 11 additions & 0 deletions sklearn/cluster/tests/test_birch.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,14 @@ def test_birch_params_validation(params, err_type, err_msg):
X, _ = make_blobs(n_samples=80, centers=4)
with pytest.raises(err_type, match=err_msg):
Birch(**params).fit(X)


def test_feature_names_out():
    """Check `get_feature_names_out` for `Birch`."""
    # Birch derives its output feature names from the fitted subcluster
    # centers, so the estimator must be fit before querying them.
    data, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    model = Birch(n_clusters=4)
    model.fit(data)

    n_subclusters = model.subcluster_centers_.shape[0]
    expected_names = [f"birch{idx}" for idx in range(n_subclusters)]
    assert_array_equal(expected_names, model.get_feature_names_out())
16 changes: 16 additions & 0 deletions sklearn/cluster/tests/test_feature_agglomeration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
# Authors: Sergul Aydore 2017
import numpy as np
import pytest

from numpy.testing import assert_array_equal
from sklearn.cluster import FeatureAgglomeration
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.datasets import make_blobs


def test_feature_agglomeration():
Expand Down Expand Up @@ -41,3 +44,16 @@ def test_feature_agglomeration():

assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)


def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    # Output names are prefixed with the lowercased class name, one per
    # cluster of merged features found during fit.
    data, _ = make_blobs(n_features=6, random_state=0)
    estimator = FeatureAgglomeration(n_clusters=3)
    estimator.fit(data)

    expected_names = [
        f"featureagglomeration{idx}" for idx in range(estimator.n_clusters_)
    ]
    assert_array_equal(expected_names, estimator.get_feature_names_out())
15 changes: 15 additions & 0 deletions sklearn/cluster/tests/test_k_means.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,3 +1205,18 @@ def test_is_same_clustering():
# mapped to a same value
labels3 = np.array([1, 0, 0, 2, 2, 0, 2, 1], dtype=np.int32)
assert not _is_same_clustering(labels1, labels3, 3)


@pytest.mark.parametrize(
    "Klass, method",
    [(KMeans, "fit"), (MiniBatchKMeans, "fit"), (MiniBatchKMeans, "partial_fit")],
)
def test_feature_names_out(Klass, method):
    """Check `feature_names_out` for `KMeans` and `MiniBatchKMeans`."""
    estimator = Klass()
    # Train via the requested method; `partial_fit` must also populate the
    # attributes that `get_feature_names_out` relies on.
    getattr(estimator, method)(X)

    prefix = Klass.__name__.lower()
    n_centers = estimator.cluster_centers_.shape[0]
    assert_array_equal(
        [f"{prefix}{i}" for i in range(n_centers)],
        estimator.get_feature_names_out(),
    )
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,6 @@ def test_pandas_column_name_consistency(estimator):
# TODO: As more modules support get_feature_names_out they should be removed
# from this list to be tested
GET_FEATURES_OUT_MODULES_TO_IGNORE = [
"cluster",
"ensemble",
"isotonic",
"kernel_approximation",
Expand Down
0