diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 7141080afbc06..261b6f8eac6bf 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -58,6 +58,22 @@ Changelog - |Fix| :class:`decomposition.FastICA` now validates input parameters in `fit` instead of `__init__`. :pr:`21432` by :user:`Hannah Bohle <hhnnhh>` and :user:`Maren Westermann <marenwestermann>`. +- |API| Adds :term:`get_feature_names_out` to all transformers in the + :mod:`~sklearn.decomposition` module: + :class:`~sklearn.decomposition.DictionaryLearning`, + :class:`~sklearn.decomposition.FactorAnalysis`, + :class:`~sklearn.decomposition.FastICA`, + :class:`~sklearn.decomposition.IncrementalPCA`, + :class:`~sklearn.decomposition.KernelPCA`, + :class:`~sklearn.decomposition.LatentDirichletAllocation`, + :class:`~sklearn.decomposition.MiniBatchDictionaryLearning`, + :class:`~sklearn.decomposition.MiniBatchSparsePCA`, + :class:`~sklearn.decomposition.NMF`, + :class:`~sklearn.decomposition.PCA`, + :class:`~sklearn.decomposition.SparsePCA`, + and :class:`~sklearn.decomposition.TruncatedSVD`. :pr:`21334` by + `Thomas Fan`_. + :mod:`sklearn.impute` ..................... diff --git a/sklearn/base.py b/sklearn/base.py index 241dac26dd3ca..557b2c25b2691 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -24,6 +24,8 @@ from .utils.validation import _check_y from .utils.validation import _num_features from .utils.validation import _check_feature_names_in +from .utils.validation import _generate_get_feature_names_out +from .utils.validation import check_is_fitted from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _get_feature_names @@ -879,6 +881,31 @@ def get_feature_names_out(self, input_features=None): return _check_feature_names_in(self, input_features) +class _ClassNamePrefixFeaturesOutMixin: + """Mixin class for transformers that generate their own names by prefixing. + + Assumes that `_n_features_out` is defined for the estimator. 
+ """ + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self, "_n_features_out") + return _generate_get_feature_names_out( + self, self._n_features_out, input_features=input_features + ) + + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index e503a52ee1f92..7904ce17f7212 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -11,12 +11,14 @@ import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils.validation import check_is_fitted from abc import ABCMeta, abstractmethod -class _BasePCA(TransformerMixin, BaseEstimator, metaclass=ABCMeta): +class _BasePCA( + _ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, metaclass=ABCMeta +): """Base class for PCA methods. Warning: This class should not be used directly. 
@@ -154,3 +156,8 @@ def inverse_transform(self, X): ) else: return np.dot(X, self.components_) + self.mean_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 3ade7727a8b7e..e4edaf31c9c32 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -14,7 +14,7 @@ from scipy import linalg from joblib import Parallel, effective_n_jobs -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import deprecated from ..utils import check_array, check_random_state, gen_even_slices, gen_batches from ..utils.extmath import randomized_svd, row_norms, svd_flip @@ -1014,7 +1014,7 @@ def dict_learning_online( return dictionary -class _BaseSparseCoding(TransformerMixin): +class _BaseSparseCoding(_ClassNamePrefixFeaturesOutMixin, TransformerMixin): """Base class from SparseCoder and DictionaryLearning algorithms.""" def __init__( @@ -1315,6 +1315,11 @@ def n_features_in_(self): """Number of features seen during `fit`.""" return self.dictionary.shape[1] + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.n_components_ + class DictionaryLearning(_BaseSparseCoding, BaseEstimator): """Dictionary learning. @@ -1587,6 +1592,11 @@ def fit(self, X, y=None): self.error_ = E return self + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): """Mini-batch dictionary learning. 
@@ -1926,3 +1936,8 @@ def partial_fit(self, X, y=None, iter_offset=None): self.inner_stats_ = (A, B) self.iter_offset_ = iter_offset + 1 return self + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index fcf96cb0eb532..8ff5b54d4e839 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -25,14 +25,14 @@ from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning -class FactorAnalysis(TransformerMixin, BaseEstimator): +class FactorAnalysis(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Factor Analysis (FA). A simple linear generative model with Gaussian latent variables. 
@@ -426,6 +426,11 @@ def _rotate(self, components, n_components=None, tol=1e-6): else: raise ValueError("'method' must be in %s, not %s" % (implemented, method)) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + def _ortho_rotation(components, method="varimax", tol=1e-6, max_iter=100): """Return rotated components.""" diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 9d4bcc9026926..6eb10ce59505b 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -14,7 +14,7 @@ import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, as_float_array, check_random_state @@ -319,7 +319,7 @@ def my_g(x): return None, est._unmixing, sources -class FastICA(TransformerMixin, BaseEstimator): +class FastICA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """FastICA: a fast algorithm for Independent Component Analysis. The implementation is based on [1]_. 
@@ -689,3 +689,8 @@ def inverse_transform(self, X, copy=True): X += self.mean_ return X + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index aee9b46899cd6..ff0fd223512c1 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -10,15 +10,18 @@ from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import svd_flip, _randomized_eigsh -from ..utils.validation import check_is_fitted, _check_psd_eigenvalues +from ..utils.validation import ( + check_is_fitted, + _check_psd_eigenvalues, +) from ..utils.deprecation import deprecated from ..exceptions import NotFittedError -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels -class KernelPCA(TransformerMixin, BaseEstimator): +class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Kernel Principal component analysis (KPCA). 
Non-linear dimensionality reduction through the use of kernels (see @@ -546,3 +549,8 @@ def _more_tags(self): "preserves_dtype": [np.float64, np.float32], "pairwise": self.kernel == "precomputed", } + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.eigenvalues_.shape[0] diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index a723e3451e24f..6db9d900566eb 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -16,7 +16,7 @@ from scipy.special import gammaln, logsumexp from joblib import Parallel, effective_n_jobs -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted @@ -138,7 +138,9 @@ def _update_doc_distribution( return (doc_topic_distr, suff_stats) -class LatentDirichletAllocation(TransformerMixin, BaseEstimator): +class LatentDirichletAllocation( + _ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator +): """Latent Dirichlet Allocation with online variational Bayes algorithm. The implementation is based on [1]_ and [2]_. 
@@ -887,3 +889,8 @@ def perplexity(self, X, sub_sampling=False): X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity" ) return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index d914bd5b6126d..e388fdd45c201 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -15,11 +15,14 @@ from ._cdnmf_fast import _update_cdnmf_fast from .._config import config_context -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm -from ..utils.validation import check_is_fitted, check_non_negative +from ..utils.validation import ( + check_is_fitted, + check_non_negative, +) EPSILON = np.finfo(np.float32).eps @@ -1109,7 +1112,7 @@ def non_negative_factorization( return W, H, n_iter -class NMF(TransformerMixin, BaseEstimator): +class NMF(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Non-Negative Matrix Factorization (NMF). 
Find two non-negative matrices (W, H) whose product approximates the non- @@ -1708,3 +1711,8 @@ def inverse_transform(self, W): """ check_is_fitted(self) return np.dot(W, self.components_) + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 1d7b4d6dfc063..31c8d2168a3e6 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -7,11 +7,11 @@ from ..utils import check_random_state from ..utils.validation import check_is_fitted from ..linear_model import ridge_regression -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ._dict_learning import dict_learning, dict_learning_online -class SparsePCA(TransformerMixin, BaseEstimator): +class SparsePCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Sparse Principal Components Analysis (SparsePCA). Finds the set of sparse components that can optimally reconstruct @@ -236,6 +236,11 @@ def transform(self, X): return U + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + class MiniBatchSparsePCA(SparsePCA): """Mini-batch Sparse Principal Components Analysis. 
diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 21ed87eca5fd1..01d79f742302f 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -10,7 +10,7 @@ import scipy.sparse as sp from scipy.sparse.linalg import svds -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import check_array, check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip @@ -21,7 +21,7 @@ __all__ = ["TruncatedSVD"] -class TruncatedSVD(TransformerMixin, BaseEstimator): +class TruncatedSVD(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Dimensionality reduction using truncated SVD (aka LSA). This transformer performs linear dimensionality reduction by means of @@ -273,3 +273,8 @@ def inverse_transform(self, X): def _more_tags(self): return {"preserves_dtype": [np.float64, np.float32]} + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 1270287ec844a..9ce477fffcd9d 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -664,3 +664,21 @@ def test_warning_default_transform_alpha(Estimator): dl = Estimator(alpha=0.1) with pytest.warns(FutureWarning, match="default transform_alpha"): dl.fit_transform(X) + + +@pytest.mark.parametrize( + "estimator", + [SparseCoder(X.T), DictionaryLearning(), MiniBatchDictionaryLearning()], + ids=lambda x: x.__class__.__name__, +) +def test_get_feature_names_out(estimator): + """Check feature names for dict learning estimators.""" + estimator.fit(X) + n_components = X.shape[1] + + feature_names_out = 
estimator.get_feature_names_out() + estimator_name = estimator.__class__.__name__.lower() + assert_array_equal( + feature_names_out, + [f"{estimator_name}{i}" for i in range(n_components)], + ) diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index 756300d970072..2ae2187452eee 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -5,6 +5,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose_dense_sparse +from numpy.testing import assert_array_equal from sklearn import datasets from sklearn.decomposition import PCA, IncrementalPCA @@ -427,3 +428,11 @@ def test_incremental_pca_fit_overflow_error(): pca.fit(A) np.testing.assert_allclose(ipca.singular_values_, pca.singular_values_) + + +def test_incremental_pca_feature_names_out(): + """Check feature names out for IncrementalPCA.""" + ipca = IncrementalPCA(n_components=2).fit(iris.data) + + names = ipca.get_feature_names_out() + assert_array_equal([f"incrementalpca{i}" for i in range(2)], names) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index e7ae53fd5188b..72b40ec83e308 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -559,3 +559,12 @@ def test_kernel_pca_alphas_deprecated(): msg = r"Attribute `alphas_` was deprecated in version 1\.0" with pytest.warns(FutureWarning, match=msg): kp.alphas_ + + +def test_kernel_pca_feature_names_out(): + """Check feature names out for KernelPCA.""" + X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0) + kpca = KernelPCA(n_components=2).fit(X) + + names = kpca.get_feature_names_out() + assert_array_equal([f"kernelpca{i}" for i in range(2)], names) diff --git 
a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 3b056bf9ee0b1..19eecdbba99e0 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -741,3 +741,13 @@ def test_init_default_deprecation(): NMF().fit(A) with pytest.warns(FutureWarning, match=msg): non_negative_factorization(A) + + +def test_feature_names_out(): + """Check feature names out for NMF.""" + random_state = np.random.RandomState(0) + X = np.abs(random_state.randn(10, 4)) + nmf = NMF(n_components=3, init="nndsvda").fit(X) + + names = nmf.get_feature_names_out() + assert_array_equal([f"nmf{i}" for i in range(3)], names) diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index 811f3186ce503..e3ce951f7b6da 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -4,6 +4,7 @@ from scipy.linalg import block_diag from scipy.sparse import csr_matrix from scipy.special import psi +from numpy.testing import assert_array_equal import pytest @@ -427,3 +428,14 @@ def check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexiti ) def test_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities): check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities) + + +def test_lda_feature_names_out(): + """Check feature names out for LatentDirichletAllocation.""" + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components).fit(X) + + names = lda.get_feature_names_out() + assert_array_equal( + [f"latentdirichletallocation{i}" for i in range(n_components)], names + ) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index e7973fd8aa3af..95b790616c02e 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -1,5 +1,6 @@ import numpy as np import scipy 
as sp +from numpy.testing import assert_array_equal import pytest @@ -656,3 +657,11 @@ def test_assess_dimesion_rank_one(): assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples)) for rank in range(2, n_features): assert _assess_dimension(s, rank, n_samples) == -np.inf + + +def test_feature_names_out(): + """Check feature names out for PCA.""" + pca = PCA(n_components=2).fit(iris.data) + + names = pca.get_feature_names_out() + assert_array_equal([f"pca{i}" for i in range(2)], names) diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py index 79ad3d0e6006f..c77aabf9c182c 100644 --- a/sklearn/decomposition/tests/test_sparse_pca.py +++ b/sklearn/decomposition/tests/test_sparse_pca.py @@ -5,6 +5,7 @@ import pytest import numpy as np +from numpy.testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose @@ -203,3 +204,17 @@ def test_spca_n_components_(SPCA, n_components): assert model.n_components_ == n_components else: assert model.n_components_ == n_features + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +def test_spca_feature_names_out(SPCA): + """Check feature names out for *SparsePCA.""" + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=4).fit(X) + names = model.get_feature_names_out() + + estimator_name = SPCA.__name__.lower() + assert_array_equal([f"{estimator_name}{i}" for i in range(4)], names) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index e06a060546713..01fc98deeea91 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -363,7 +363,6 @@ def test_pandas_column_name_consistency(estimator): GET_FEATURES_OUT_MODULES_TO_IGNORE = [ "cluster", "cross_decomposition", - "decomposition", "discriminant_analysis", "ensemble", "isotonic", diff --git 
a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 0380af76f5140..7662f03f85ce5 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1702,7 +1702,7 @@ def _get_feature_names(X): return feature_names -def _check_feature_names_in(estimator, input_features=None): +def _check_feature_names_in(estimator, input_features=None, *, generate_names=True): """Get output feature names for transformation. Parameters @@ -1716,9 +1716,13 @@ def _check_feature_names_in(estimator, input_features=None): - If `input_features` is an array-like, then `input_features` must match `feature_names_in_` if `feature_names_in_` is defined. + generate_names : bool, default=True + Whether to generate names when `input_features` is `None` and + `estimator.feature_names_in_` is not defined. + Returns ------- - feature_names_in : ndarray of str + feature_names_in : ndarray of str or `None` Feature names in. """ @@ -1742,8 +1746,40 @@ def _check_feature_names_in(estimator, input_features=None): if feature_names_in_ is not None: return feature_names_in_ + if not generate_names: + return + # Generates feature names if `n_features_in_` is defined if n_features_in_ is None: raise ValueError("Unable to generate feature names without n_features_in_") return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object) + + +def _generate_get_feature_names_out(estimator, n_features_out, input_features=None): + """Generate feature names out for estimator using the estimator name as the prefix. + + The `input_features` names are validated but not used. This function is useful + for estimators that generate their own names based on `n_features_out`, e.g. PCA. + + Parameters + ---------- + estimator : estimator instance + Estimator producing output feature names. + + n_features_out : int + Number of feature names out. + + input_features : array-like of str or None, default=None + Only used to validate feature names with `estimator.feature_names_in_`. 
+ + Returns + ------- + feature_names_out : ndarray of str objects + Generated output feature names. + """ + _check_feature_names_in(estimator, input_features, generate_names=False) + estimator_name = estimator.__class__.__name__.lower() + return np.asarray( + [f"{estimator_name}{i}" for i in range(n_features_out)], dtype=object + )