From eb3e5ae30878af83d2ab939f49f60fedd3ff3abf Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 14 Oct 2021 16:02:41 -0400 Subject: [PATCH 1/5] ENH Adds feature names out to decomposition module --- sklearn/decomposition/_base.py | 18 +++++++ sklearn/decomposition/_dict_learning.py | 50 +++++++++++++++++++ sklearn/decomposition/_factor_analysis.py | 19 ++++++- sklearn/decomposition/_fastica.py | 18 +++++++ sklearn/decomposition/_kernel_pca.py | 23 ++++++++- sklearn/decomposition/_lda.py | 18 +++++++ sklearn/decomposition/_nmf.py | 23 ++++++++- sklearn/decomposition/_sparse_pca.py | 18 +++++++ sklearn/decomposition/_truncated_svd.py | 19 ++++++- .../decomposition/tests/test_dict_learning.py | 18 +++++++ .../tests/test_incremental_pca.py | 9 ++++ .../decomposition/tests/test_kernel_pca.py | 9 ++++ sklearn/decomposition/tests/test_nmf.py | 10 ++++ .../decomposition/tests/test_online_lda.py | 12 +++++ sklearn/decomposition/tests/test_pca.py | 9 ++++ .../decomposition/tests/test_sparse_pca.py | 15 ++++++ sklearn/feature_extraction/_hash.py | 16 ++++++ .../tests/test_feature_hasher.py | 8 +++ sklearn/tests/test_common.py | 1 - sklearn/utils/validation.py | 40 ++++++++++++++- 20 files changed, 346 insertions(+), 7 deletions(-) diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index e503a52ee1f92..07ca5e2d8ef17 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -13,6 +13,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import check_is_fitted +from ..utils.validation import _generate_get_feature_names_out from abc import ABCMeta, abstractmethod @@ -154,3 +155,20 @@ def inverse_transform(self, X): ) else: return np.dot(X, self.components_) + self.mean_ + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. 
+ + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + return _generate_get_feature_names_out( + self, self.components_.shape[0], input_features + ) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index a18adb6f1e3bc..ca46595a3e04e 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -19,6 +19,7 @@ from ..utils import check_array, check_random_state, gen_even_slices, gen_batches from ..utils.extmath import randomized_svd, row_norms, svd_flip from ..utils.validation import check_is_fitted +from ..utils.validation import _generate_get_feature_names_out from ..utils.fixes import delayed from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars @@ -1314,6 +1315,21 @@ def n_features_in_(self): """Number of features seen during `fit`.""" return self.dictionary.shape[1] + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + return _generate_get_feature_names_out(self, self.n_components_, input_features) + class DictionaryLearning(_BaseSparseCoding, BaseEstimator): """Dictionary learning. @@ -1585,6 +1601,23 @@ def fit(self, X, y=None): self.error_ = E return self + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. 
+ + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + return _generate_get_feature_names_out( + self, self.components_.shape[0], input_features + ) + class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): """Mini-batch dictionary learning. @@ -1924,3 +1957,20 @@ def partial_fit(self, X, y=None, iter_offset=None): self.inner_stats_ = (A, B) self.iter_offset_ = iter_offset + 1 return self + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + return _generate_get_feature_names_out( + self, self.components_.shape[0], input_features + ) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index fcf96cb0eb532..3953c3cb5b8e4 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -28,7 +28,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _generate_get_feature_names_out from ..exceptions import ConvergenceWarning @@ -426,6 +426,23 @@ def _rotate(self, components, n_components=None, tol=1e-6): else: raise ValueError("'method' must be in %s, not %s" % (implemented, method)) + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. 
+ + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + return _generate_get_feature_names_out( + self, self.components_.shape[0], input_features + ) + def _ortho_rotation(components, method="varimax", tol=1e-6, max_iter=100): """Return rotated components.""" diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 97a84d56cd212..0efbab48c9041 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -20,6 +20,7 @@ from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _generate_get_feature_names_out __all__ = ["fastica", "FastICA"] @@ -686,3 +687,20 @@ def inverse_transform(self, X, copy=True): X += self.mean_ return X + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + return _generate_get_feature_names_out( + self, self.components_.shape[0], input_features + ) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index aee9b46899cd6..17cede1280072 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -10,7 +10,11 @@ from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import svd_flip, _randomized_eigsh -from ..utils.validation import check_is_fitted, _check_psd_eigenvalues +from ..utils.validation import ( + check_is_fitted, + _check_psd_eigenvalues, + _generate_get_feature_names_out, +) from ..utils.deprecation import deprecated from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin @@ -546,3 +550,20 @@ def _more_tags(self): "preserves_dtype": [np.float64, np.float32], "pairwise": self.kernel == "precomputed", } + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + return _generate_get_feature_names_out( + self, self.eigenvalues_.shape[0], input_features + ) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 99ecdd0317e89..3413e6770225a 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -20,6 +20,7 @@ from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted +from ..utils.validation import _generate_get_feature_names_out from ..utils.fixes import delayed from ._online_lda_fast import ( @@ -903,3 +904,20 @@ def perplexity(self, X, sub_sampling=False): Perplexity score. 
""" return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + return _generate_get_feature_names_out( + self, self.components_.shape[0], input_features + ) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index d914bd5b6126d..46f4aaa5ff82c 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -19,7 +19,11 @@ from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm -from ..utils.validation import check_is_fitted, check_non_negative +from ..utils.validation import ( + check_is_fitted, + check_non_negative, + _generate_get_feature_names_out, +) EPSILON = np.finfo(np.float32).eps @@ -1708,3 +1712,20 @@ def inverse_transform(self, W): """ check_is_fitted(self) return np.dot(W, self.components_) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + return _generate_get_feature_names_out( + self, self.components_.shape[0], input_features + ) diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 37ef6b556bd25..fc44bb7fdaf66 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -6,6 +6,7 @@ from ..utils import check_random_state from ..utils.validation import check_is_fitted +from ..utils.validation import _generate_get_feature_names_out from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin from ._dict_learning import dict_learning, dict_learning_online @@ -235,6 +236,23 @@ def transform(self, X): return U + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + return _generate_get_feature_names_out( + self, self.components_.shape[0], input_features + ) + class MiniBatchSparsePCA(SparsePCA): """Mini-batch Sparse Principal Components Analysis. 
diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 21ed87eca5fd1..0abef61ace0ca 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -15,7 +15,7 @@ from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _generate_get_feature_names_out __all__ = ["TruncatedSVD"] @@ -273,3 +273,20 @@ def inverse_transform(self, X): def _more_tags(self): return {"preserves_dtype": [np.float64, np.float32]} + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + return _generate_get_feature_names_out( + self, self.components_.shape[0], input_features + ) diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 1270287ec844a..9ce477fffcd9d 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -664,3 +664,21 @@ def test_warning_default_transform_alpha(Estimator): dl = Estimator(alpha=0.1) with pytest.warns(FutureWarning, match="default transform_alpha"): dl.fit_transform(X) + + +@pytest.mark.parametrize( + "estimator", + [SparseCoder(X.T), DictionaryLearning(), MiniBatchDictionaryLearning()], + ids=lambda x: x.__class__.__name__, +) +def test_get_feature_names_out(estimator): + """Check feature names for dict learning estimators.""" + estimator.fit(X) + n_components = X.shape[1] + + feature_names_out = estimator.get_feature_names_out() + estimator_name = estimator.__class__.__name__.lower() + assert_array_equal( + feature_names_out, + [f"{estimator_name}{i}" for i in range(n_components)], + ) diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index 756300d970072..2ae2187452eee 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -5,6 +5,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose_dense_sparse +from numpy.testing import assert_array_equal from sklearn import datasets from sklearn.decomposition import PCA, IncrementalPCA @@ -427,3 +428,11 @@ def test_incremental_pca_fit_overflow_error(): pca.fit(A) np.testing.assert_allclose(ipca.singular_values_, pca.singular_values_) + + +def test_incremental_pca_feature_names_out(): + """Check feature names out for IncrementalPCA.""" + ipca = IncrementalPCA(n_components=2).fit(iris.data) + 
+ names = ipca.get_feature_names_out() + assert_array_equal([f"incrementalpca{i}" for i in range(2)], names) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index e7ae53fd5188b..72b40ec83e308 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -559,3 +559,12 @@ def test_kernel_pca_alphas_deprecated(): msg = r"Attribute `alphas_` was deprecated in version 1\.0" with pytest.warns(FutureWarning, match=msg): kp.alphas_ + + +def test_kernel_pca_feature_names_out(): + """Check feature names out for KernelPCA.""" + X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0) + kpca = KernelPCA(n_components=2).fit(X) + + names = kpca.get_feature_names_out() + assert_array_equal([f"kernelpca{i}" for i in range(2)], names) diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 3b056bf9ee0b1..19eecdbba99e0 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -741,3 +741,13 @@ def test_init_default_deprecation(): NMF().fit(A) with pytest.warns(FutureWarning, match=msg): non_negative_factorization(A) + + +def test_feature_names_out(): + """Check feature names out for NMF.""" + random_state = np.random.RandomState(0) + X = np.abs(random_state.randn(10, 4)) + nmf = NMF(n_components=3, init="nndsvda").fit(X) + + names = nmf.get_feature_names_out() + assert_array_equal([f"nmf{i}" for i in range(3)], names) diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index 811f3186ce503..e3ce951f7b6da 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -4,6 +4,7 @@ from scipy.linalg import block_diag from scipy.sparse import csr_matrix from scipy.special import psi +from numpy.testing import assert_array_equal import pytest @@ -427,3 +428,14 @@ def 
check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexiti ) def test_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities): check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities) + + +def test_lda_feature_names_out(): + """Check feature names out for LatentDirichletAllocation.""" + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components).fit(X) + + names = lda.get_feature_names_out() + assert_array_equal( + [f"latentdirichletallocation{i}" for i in range(n_components)], names + ) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index e7973fd8aa3af..95b790616c02e 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -1,5 +1,6 @@ import numpy as np import scipy as sp +from numpy.testing import assert_array_equal import pytest @@ -656,3 +657,11 @@ def test_assess_dimesion_rank_one(): assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples)) for rank in range(2, n_features): assert _assess_dimension(s, rank, n_samples) == -np.inf + + +def test_feature_names_out(): + """Check feature names out for PCA.""" + pca = PCA(n_components=2).fit(iris.data) + + names = pca.get_feature_names_out() + assert_array_equal([f"pca{i}" for i in range(2)], names) diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py index 79ad3d0e6006f..c77aabf9c182c 100644 --- a/sklearn/decomposition/tests/test_sparse_pca.py +++ b/sklearn/decomposition/tests/test_sparse_pca.py @@ -5,6 +5,7 @@ import pytest import numpy as np +from numpy.testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose @@ -203,3 +204,17 @@ def test_spca_n_components_(SPCA, n_components): assert model.n_components_ == n_components else: assert model.n_components_ == n_features + + 
+@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +def test_spca_feature_names_out(SPCA): + """Check feature names out for *SparsePCA.""" + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=4).fit(X) + names = model.get_feature_names_out() + + estimator_name = SPCA.__name__.lower() + assert_array_equal([f"{estimator_name}{i}" for i in range(4)], names) diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index bd7cb0f371244..99aa9c2b9b503 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -7,6 +7,7 @@ import scipy.sparse as sp from ..utils import IS_PYPY +from ..utils.validation import _generate_get_feature_names_out from ..base import BaseEstimator, TransformerMixin if not IS_PYPY: @@ -188,3 +189,18 @@ def transform(self, raw_X): def _more_tags(self): return {"X_types": [self.input_type]} + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + return _generate_get_feature_names_out(self, self.n_features) diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index debc65ec925b8..edae0bb519d8e 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -171,3 +171,11 @@ def test_hash_collisions(): alternate_sign=False, n_features=1, input_type="string" ).fit_transform(X) assert Xt.data[0] == len(X[0]) + + +def test_feature_names_out(): + """Check feature names out for FeatureHasher.""" + hasher = FeatureHasher(n_features=16) + + names = hasher.get_feature_names_out() + assert_array_equal([f"featurehasher{i}" for i in range(16)], names) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 4f6818081c67d..920c21e56e6af 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -362,7 +362,6 @@ def test_pandas_column_name_consistency(estimator): GET_FEATURES_OUT_MODULES_TO_IGNORE = [ "cluster", "cross_decomposition", - "decomposition", "discriminant_analysis", "ensemble", "impute", diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index a2693a44a9f8b..4a916ec6d9259 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1695,7 +1695,7 @@ def _get_feature_names(X): return feature_names -def _check_feature_names_in(estimator, input_features=None): +def _check_feature_names_in(estimator, input_features=None, *, generate_names=True): """Get output feature names for transformation. Parameters @@ -1709,9 +1709,13 @@ def _check_feature_names_in(estimator, input_features=None): - If `input_features` is an array-like, then `input_features` must match `feature_names_in_` if `feature_names_in_` is defined. + generate_names : bool, default=True + Whether to generate names when `input_features` is `None` and + `estimator.feature_names_in_` is not defined. 
+ Returns ------- - feature_names_in : ndarray of str + feature_names_in : ndarray of str or `None` Feature names in. """ @@ -1735,8 +1739,40 @@ def _check_feature_names_in(estimator, input_features=None): if feature_names_in_ is not None: return feature_names_in_ + if not generate_names: + return + # Generates feature names if `n_features_in_` is defined if n_features_in_ is None: raise ValueError("Unable to generate feature names without n_features_in_") return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object) + + +def _generate_get_feature_names_out(estimator, n_features_out, input_features=None): + """Generate feature names out for estimator using the estimator name as the prefix. + + The `input_features` names are validated but not used. This function is useful + for estimators that generate their own names based on `n_features_out`, e.g. PCA. + + Parameters + ---------- + estimator : estimator instance + Estimator producing output feature names. + + n_features_out : int + Number of feature names out. + + input_features : array-like of str or None, default=None + Only used to validate feature names with `estimator.feature_names_in_`. + + Returns + ------- + feature_names_out : ndarray of str objects + Generated feature names out. + """ + _check_feature_names_in(estimator, input_features, generate_names=False) + estimator_name = estimator.__class__.__name__.lower() + return np.asarray( + [f"{estimator_name}{i}" for i in range(n_features_out)], dtype=object + ) From 04bcacb4e6790e4ce0a54ca67e2c8ee2b3e711e6 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 14 Oct 2021 16:22:53 -0400 Subject: [PATCH 2/5] DOC Adds whats enw --- doc/whats_new/v1.1.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index d75ef4f388110..c77596f24aa69 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -52,6 +52,25 @@ Changelog reconstruction of a `X` target when a `Y` parameter is given. 
:pr:`19680` by :user:`Robin Thibaut `. +:mod:`sklearn.decomposition` +.................................. + +- |API| Adds :term:`get_feature_names_out` to all transformers in the + :mod:`~sklearn.decomposition` module: + :class:`~sklearn.decomposition.DictionaryLearning`, + :class:`~sklearn.decomposition.FactorAnalysis`, + :class:`~sklearn.decomposition.FastICA`, + :class:`~sklearn.decomposition.IncrementalPCA`, + :class:`~sklearn.decomposition.KernelPCA`, + :class:`~sklearn.decomposition.LatentDirichletAllocation`, + :class:`~sklearn.decomposition.MiniBatchDictionaryLearning`, + :class:`~sklearn.decomposition.MiniBatchSparsePCA`, + :class:`~sklearn.decomposition.NMF`, + :class:`~sklearn.decomposition.PCA`, + :class:`~sklearn.decomposition.SparsePCA`, + and :class:`~sklearn.decomposition.TruncatedSVD`. :pr:`21334` by + `Thomas Fan`_. + :mod:`sklearn.ensemble` ....................... From 92c41204d3cca6a96a5cfb1daf72040881cdc264 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 14 Oct 2021 16:23:41 -0400 Subject: [PATCH 3/5] REV Removes module not in decomposition --- sklearn/feature_extraction/_hash.py | 16 ---------------- .../tests/test_feature_hasher.py | 8 -------- 2 files changed, 24 deletions(-) diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 99aa9c2b9b503..bd7cb0f371244 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -7,7 +7,6 @@ import scipy.sparse as sp from ..utils import IS_PYPY -from ..utils.validation import _generate_get_feature_names_out from ..base import BaseEstimator, TransformerMixin if not IS_PYPY: @@ -189,18 +188,3 @@ def transform(self, raw_X): def _more_tags(self): return {"X_types": [self.input_type]} - - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. 
- - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. - """ - return _generate_get_feature_names_out(self, self.n_features) diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index edae0bb519d8e..debc65ec925b8 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -171,11 +171,3 @@ def test_hash_collisions(): alternate_sign=False, n_features=1, input_type="string" ).fit_transform(X) assert Xt.data[0] == len(X[0]) - - -def test_feature_names_out(): - """Check feature names out for FeatureHasher.""" - hasher = FeatureHasher(n_features=16) - - names = hasher.get_feature_names_out() - assert_array_equal([f"featurehasher{i}" for i in range(16)], names) From e3377422845de02793c54d32cb5f5836e301d1ce Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 20 Oct 2021 15:49:04 -0400 Subject: [PATCH 4/5] ENH Uses a mixin --- sklearn/base.py | 27 ++++++++++ sklearn/decomposition/_base.py | 27 +++------- sklearn/decomposition/_dict_learning.py | 63 +++++------------------ sklearn/decomposition/_factor_analysis.py | 26 +++------- sklearn/decomposition/_fastica.py | 25 +++------ sklearn/decomposition/_kernel_pca.py | 25 +++------ sklearn/decomposition/_lda.py | 27 +++------- sklearn/decomposition/_nmf.py | 25 +++------ sklearn/decomposition/_sparse_pca.py | 25 +++------ sklearn/decomposition/_truncated_svd.py | 26 +++------- 10 files changed, 95 insertions(+), 201 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 60fc82eff6088..58c7952e0291b 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -24,6 +24,8 @@ from .utils.validation import _check_y from .utils.validation import _num_features from .utils.validation import _check_feature_names_in +from .utils.validation import _generate_get_feature_names_out +from .utils.validation import check_is_fitted from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _get_feature_names @@ -876,6 +878,31 @@ def get_feature_names_out(self, input_features=None): return _check_feature_names_in(self, input_features) +class _ClassNamePrefixFeaturesOutMixin: + """Mixin class for transformers that generate their own names by prefixing. + + Assumes that `_n_features_out` is defined for the estimator. + """ + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "_n_features_out") + return _generate_get_feature_names_out( + self, self._n_features_out, input_features=input_features + ) + + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index 07ca5e2d8ef17..7904ce17f7212 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -11,13 +11,14 @@ import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils.validation import check_is_fitted -from ..utils.validation import _generate_get_feature_names_out from abc import ABCMeta, abstractmethod -class _BasePCA(TransformerMixin, BaseEstimator, metaclass=ABCMeta): +class _BasePCA( + _ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, metaclass=ABCMeta +): """Base class for PCA methods. Warning: This class should not be used directly. @@ -156,19 +157,7 @@ def inverse_transform(self, X): else: return np.dot(X, self.components_) + self.mean_ - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out( - self, self.components_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index ca46595a3e04e..1133820f04979 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -14,12 +14,11 @@ from scipy import linalg from joblib import Parallel, effective_n_jobs -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import deprecated from ..utils import check_array, check_random_state, gen_even_slices, gen_batches from ..utils.extmath import randomized_svd, row_norms, svd_flip from ..utils.validation import check_is_fitted -from ..utils.validation import _generate_get_feature_names_out from ..utils.fixes import delayed from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars @@ -1014,7 +1013,7 @@ def dict_learning_online( return dictionary -class _BaseSparseCoding(TransformerMixin): +class _BaseSparseCoding(_ClassNamePrefixFeaturesOutMixin, TransformerMixin): """Base class from SparseCoder and DictionaryLearning algorithms.""" def __init__( @@ -1315,20 +1314,10 @@ def n_features_in_(self): """Number of features seen during `fit`.""" return self.dictionary.shape[1] - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out(self, self.n_components_, input_features) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.n_components_ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): @@ -1601,22 +1590,10 @@ def fit(self, X, y=None): self.error_ = E return self - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. - """ - return _generate_get_feature_names_out( - self, self.components_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): @@ -1958,19 +1935,7 @@ def partial_fit(self, X, y=None, iter_offset=None): self.iter_offset_ = iter_offset + 1 return self - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out( - self, self.components_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 3953c3cb5b8e4..8ff5b54d4e839 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -25,14 +25,14 @@ from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm -from ..utils.validation import check_is_fitted, _generate_get_feature_names_out +from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning -class FactorAnalysis(TransformerMixin, BaseEstimator): +class FactorAnalysis(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Factor Analysis (FA). A simple linear generative model with Gaussian latent variables. @@ -426,22 +426,10 @@ def _rotate(self, components, n_components=None, tol=1e-6): else: raise ValueError("'method' must be in %s, not %s" % (implemented, method)) - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out( - self, self.components_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] def _ortho_rotation(components, method="varimax", tol=1e-6, max_iter=100): diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 0efbab48c9041..2167e54d0f443 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -14,13 +14,12 @@ import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _generate_get_feature_names_out __all__ = ["fastica", "FastICA"] @@ -320,7 +319,7 @@ def my_g(x): return None, est._unmixing, sources -class FastICA(TransformerMixin, BaseEstimator): +class FastICA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """FastICA: a fast algorithm for Independent Component Analysis. The implementation is based on [1]_. @@ -688,19 +687,7 @@ def inverse_transform(self, X, copy=True): return X - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out( - self, self.components_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 17cede1280072..ff0fd223512c1 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -13,16 +13,15 @@ from ..utils.validation import ( check_is_fitted, _check_psd_eigenvalues, - _generate_get_feature_names_out, ) from ..utils.deprecation import deprecated from ..exceptions import NotFittedError -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels -class KernelPCA(TransformerMixin, BaseEstimator): +class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Kernel Principal component analysis (KPCA). Non-linear dimensionality reduction through the use of kernels (see @@ -551,19 +550,7 @@ def _more_tags(self): "pairwise": self.kernel == "precomputed", } - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out( - self, self.eigenvalues_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.eigenvalues_.shape[0] diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 3413e6770225a..a760d4139ee62 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -16,11 +16,10 @@ from scipy.special import gammaln, logsumexp from joblib import Parallel, effective_n_jobs -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted -from ..utils.validation import _generate_get_feature_names_out from ..utils.fixes import delayed from ._online_lda_fast import ( @@ -139,7 +138,9 @@ def _update_doc_distribution( return (doc_topic_distr, suff_stats) -class LatentDirichletAllocation(TransformerMixin, BaseEstimator): +class LatentDirichletAllocation( + _ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator +): """Latent Dirichlet Allocation with online variational Bayes algorithm. The implementation is based on [1]_ and [2]_. @@ -905,19 +906,7 @@ def perplexity(self, X, sub_sampling=False): """ return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out( - self, self.components_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 46f4aaa5ff82c..e388fdd45c201 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -15,14 +15,13 @@ from ._cdnmf_fast import _update_cdnmf_fast from .._config import config_context -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm from ..utils.validation import ( check_is_fitted, check_non_negative, - _generate_get_feature_names_out, ) EPSILON = np.finfo(np.float32).eps @@ -1113,7 +1112,7 @@ def non_negative_factorization( return W, H, n_iter -class NMF(TransformerMixin, BaseEstimator): +class NMF(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Non-Negative Matrix Factorization (NMF). Find two non-negative matrices (W, H) whose product approximates the non- @@ -1713,19 +1712,7 @@ def inverse_transform(self, W): check_is_fitted(self) return np.dot(W, self.components_) - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out( - self, self.components_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index fc44bb7fdaf66..da3c098b23980 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -6,13 +6,12 @@ from ..utils import check_random_state from ..utils.validation import check_is_fitted -from ..utils.validation import _generate_get_feature_names_out from ..linear_model import ridge_regression -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ._dict_learning import dict_learning, dict_learning_online -class SparsePCA(TransformerMixin, BaseEstimator): +class SparsePCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Sparse Principal Components Analysis (SparsePCA). Finds the set of sparse components that can optimally reconstruct @@ -236,22 +235,10 @@ def transform(self, X): return U - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out( - self, self.components_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] class MiniBatchSparsePCA(SparsePCA): diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 0abef61ace0ca..01d79f742302f 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -10,18 +10,18 @@ import scipy.sparse as sp from scipy.sparse.linalg import svds -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import check_array, check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis -from ..utils.validation import check_is_fitted, _generate_get_feature_names_out +from ..utils.validation import check_is_fitted __all__ = ["TruncatedSVD"] -class TruncatedSVD(TransformerMixin, BaseEstimator): +class TruncatedSVD(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Dimensionality reduction using truncated SVD (aka LSA). This transformer performs linear dimensionality reduction by means of @@ -274,19 +274,7 @@ def inverse_transform(self, X): def _more_tags(self): return {"preserves_dtype": [np.float64, np.float32]} - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Only used to validate feature names with the names seen in :meth:`fit`. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - return _generate_get_feature_names_out( - self, self.components_.shape[0], input_features - ) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] From 30f3d6eb14193da28a363a1419bed7987af4c289 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 25 Oct 2021 22:20:12 -0400 Subject: [PATCH 5/5] REV Better merge --- doc/whats_new/v1.1.rst | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 6cda2077799ba..261b6f8eac6bf 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -53,7 +53,10 @@ Changelog :user:`Robin Thibaut `. :mod:`sklearn.decomposition` -.................................. +............................ + +- |Fix| :class:`decomposition.FastICA` now validates input parameters in `fit` instead of `__init__`. + :pr:`21432` by :user:`Hannah Bohle ` and :user:`Maren Westermann `. - |API| Adds :term:`get_feature_names_out` to all transformers in the :mod:`~sklearn.decomposition` module: @@ -71,21 +74,6 @@ Changelog and :class:`~sklearn.decomposition.TruncatedSVD`. :pr:`21334` by `Thomas Fan`_. -:mod:`sklearn.ensemble` -....................... - -- |Fix| Fixed a bug that could produce a segfault in rare cases for - :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor`. - :pr:`21130` :user:`Christian Lorentzen `. - -:mod:`sklearn.feature_extraction` -................................. -............................ - -- |Fix| :class:`decomposition.FastICA` now validates input parameters in `fit` instead of `__init__`. - :pr:`21432` by :user:`Hannah Bohle ` and :user:`Maren Westermann `. - :mod:`sklearn.impute` .....................