From ab2acbd29bcb88ef00b039dd75e5a45d1e59c17b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 10:52:05 -0500 Subject: [PATCH 001/100] work on get_feature_names for pipeline --- sklearn/base.py | 15 +++++++++++++++ sklearn/compose/_column_transformer.py | 8 ++++++-- sklearn/impute.py | 4 ++-- sklearn/pipeline.py | 15 +++++++++++++++ sklearn/preprocessing/data.py | 18 +++++++++--------- 5 files changed, 47 insertions(+), 13 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 34998270cea88..b474f774bcf1c 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -509,6 +509,21 @@ def fit_predict(self, X, y=None): return self.fit(X).predict(X) +class OneToOneMixin(object): + """Provides get_feature_names for simple transformers + + Assumes there's a 1-to-1 correspondence between input features + and output features. + """ + + def get_feature_names(self, input_features=None): + if input_features is not None: + return input_features + else: + raise ValueError("Don't know how to get" + " input feature names for {}".format(self)) + + ############################################################################### class MetaEstimatorMixin(object): """Mixin class for all meta estimators in scikit-learn.""" diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 990374c27affe..540b49fc0852b 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -325,7 +325,7 @@ def get_feature_names(self): """ check_is_fitted(self, 'transformers_') feature_names = [] - for name, trans, _, _ in self._iter(fitted=True): + for name, trans, columns, _ in self._iter(fitted=True): if trans == 'drop': continue elif trans == 'passthrough': @@ -336,8 +336,12 @@ def get_feature_names(self): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names." 
% (str(name), type(trans).__name__)) + try: + more_names = trans.get_feature_names(input_features=columns) + except TypeError: + more_names = trans.get_feature_names() feature_names.extend([name + "__" + f for f in - trans.get_feature_names()]) + more_names]) return feature_names def _update_fitted_transformers(self, transformers): diff --git a/sklearn/impute.py b/sklearn/impute.py index a10f6c9eb947f..e55a7a7e19e57 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -11,7 +11,7 @@ from scipy import sparse from scipy import stats -from .base import BaseEstimator, TransformerMixin +from .base import BaseEstimator, TransformerMixin, OneToOneMixin from .utils import check_array from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted @@ -90,7 +90,7 @@ def _most_frequent(array, extra_value, n_repeat): return extra_value -class SimpleImputer(BaseEstimator, TransformerMixin): +class SimpleImputer(BaseEstimator, TransformerMixin, OneToOneMixin): """Imputation transformer for completing missing values. Read more in the :ref:`User Guide `. 
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 3f69f5c18558f..0d370b2283a64 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -531,6 +531,21 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) + def get_feature_names(self, input_features=None): + feature_names = input_features + with_final = hasattr(self._final_estimator, "get_feature_names") + + for name, transform in self._iter(with_final=with_final): + if not hasattr(transform, "get_feature_names"): + raise TypeError("Transformer {} does provide" + " get_feature_names".format(name)) + try: + feature_names = transform.get_feature_names( + input_features=feature_names) + except TypeError: + feature_names = transform.get_feature_names() + return feature_names + def _name_estimators(estimators): """Generate names for estimators.""" diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 77c2d2cc970fc..d2167b683cdd9 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -19,7 +19,7 @@ from scipy import stats from scipy import optimize -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, OneToOneMixin from ..externals import six from ..utils import check_array from ..utils.extmath import row_norms @@ -199,7 +199,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True): return X -class MinMaxScaler(BaseEstimator, TransformerMixin): +class MinMaxScaler(BaseEstimator, TransformerMixin, OneToOneMixin): """Transforms features by scaling each feature to a given range. 
This estimator scales and translates each feature individually such @@ -477,7 +477,7 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): return X -class StandardScaler(BaseEstimator, TransformerMixin): +class StandardScaler(BaseEstimator, TransformerMixin, OneToOneMixin): """Standardize features by removing the mean and scaling to unit variance The standard score of a sample `x` is calculated as: @@ -798,7 +798,7 @@ def inverse_transform(self, X, copy=None): return X -class MaxAbsScaler(BaseEstimator, TransformerMixin): +class MaxAbsScaler(BaseEstimator, TransformerMixin, OneToOneMixin): """Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such @@ -1024,7 +1024,7 @@ def maxabs_scale(X, axis=0, copy=True): return X -class RobustScaler(BaseEstimator, TransformerMixin): +class RobustScaler(BaseEstimator, TransformerMixin, OneToOneMixin): """Scale features using statistics that are robust to outliers. This Scaler removes the median and scales the data according to @@ -1619,7 +1619,7 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): return X -class Normalizer(BaseEstimator, TransformerMixin): +class Normalizer(BaseEstimator, TransformerMixin, OneToOneMixin): """Normalize samples individually to unit norm. Each sample (i.e. 
each row of the data matrix) with at least one @@ -1754,7 +1754,7 @@ def binarize(X, threshold=0.0, copy=True): return X -class Binarizer(BaseEstimator, TransformerMixin): +class Binarizer(BaseEstimator, TransformerMixin, OneToOneMixin): """Binarize data (set feature values to 0 or 1) according to a threshold Values greater than the threshold map to 1, while values less than @@ -1988,7 +1988,7 @@ def add_dummy_feature(X, value=1.0): return np.hstack((np.full((n_samples, 1), value), X)) -class QuantileTransformer(BaseEstimator, TransformerMixin): +class QuantileTransformer(BaseEstimator, TransformerMixin, OneToOneMixin): """Transform features using quantiles information. This method transforms the features to follow a uniform or a normal @@ -2488,7 +2488,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, " axis={}".format(axis)) -class PowerTransformer(BaseEstimator, TransformerMixin): +class PowerTransformer(BaseEstimator, TransformerMixin, OneToOneMixin): """Apply a power transform featurewise to make data more Gaussian-like. 
Power transforms are a family of parametric, monotonic transformations From 3bc674b5cd24b203d04a35ca3ae552fcd8e094b9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 13:27:20 -0500 Subject: [PATCH 002/100] fix SimpleImputer get_feature_names --- sklearn/impute.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index e55a7a7e19e57..d16e7479dd3a4 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -11,7 +11,7 @@ from scipy import sparse from scipy import stats -from .base import BaseEstimator, TransformerMixin, OneToOneMixin +from .base import BaseEstimator, TransformerMixin from .utils import check_array from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted @@ -90,7 +90,7 @@ def _most_frequent(array, extra_value, n_repeat): return extra_value -class SimpleImputer(BaseEstimator, TransformerMixin, OneToOneMixin): +class SimpleImputer(BaseEstimator, TransformerMixin): """Imputation transformer for completing missing values. Read more in the :ref:`User Guide `. 
@@ -257,7 +257,8 @@ def fit(self, X, y=None): self.strategy, self.missing_values, fill_value) - + invalid_mask = _get_mask(self.statistics_, np.nan) + self._valid_mask = np.logical_not(invalid_mask) return self def _sparse_fit(self, X, strategy, missing_values, fill_value): @@ -373,8 +374,8 @@ def transform(self, X): valid_statistics = statistics else: # same as np.isnan but also works for object dtypes - invalid_mask = _get_mask(statistics, np.nan) - valid_mask = np.logical_not(invalid_mask) + valid_mask = self._valid_mask + invalid_mask = np.logical_not(valid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.flatnonzero(valid_mask) @@ -408,6 +409,11 @@ def transform(self, X): return X + def get_feature_names(self, input_features=None): + if input_features is None: + raise TypeError("Don't have input_features") + return np.array(input_features)[self._valid_mask] + class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. 
From 1c4a78f976f4498920c0c5de3530d9212f728796 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 15:01:49 -0500 Subject: [PATCH 003/100] use hasattr(transform) to check whether to use final estimator in get_feature_names --- sklearn/pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 0d370b2283a64..6388d4eea1844 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -533,8 +533,7 @@ def _pairwise(self): def get_feature_names(self, input_features=None): feature_names = input_features - with_final = hasattr(self._final_estimator, "get_feature_names") - + with_final = hasattr(self._final_estimator, "transform") for name, transform in self._iter(with_final=with_final): if not hasattr(transform, "get_feature_names"): raise TypeError("Transformer {} does provide" From 788193061f5c233cdf707c5c281cca40be5f47a7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 15:39:56 -0500 Subject: [PATCH 004/100] add some docstrings --- sklearn/base.py | 15 +++++++++++++++ sklearn/impute.py | 12 ++++++++++++ sklearn/pipeline.py | 16 ++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index b474f774bcf1c..1629fac63503e 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -517,6 +517,21 @@ class OneToOneMixin(object): """ def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Returns input_features as this transformation + doesn't add or drop features. + + Parameters + ---------- + input_feature : array-like of string + Input feature names. 
+ + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ if input_features is not None: return input_features else: diff --git a/sklearn/impute.py b/sklearn/impute.py index d16e7479dd3a4..5b23ab8f866a8 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -410,6 +410,18 @@ def transform(self, X): return X def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Parameters + ---------- + input_feature : array-like of string + Input feature names. + + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ if input_features is None: raise TypeError("Don't have input_features") return np.array(input_features)[self._valid_mask] diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6388d4eea1844..64e37e29b0365 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -532,6 +532,22 @@ def _pairwise(self): return getattr(self.steps[0][1], '_pairwise', False) def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Transform input features using the pipeline. + If the last step is a transformer, it's included + in the transformation, otherwise it's not. + + Parameters + ---------- + input_feature : array-like of string + Input feature names. 
+ + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ feature_names = input_features with_final = hasattr(self._final_estimator, "transform") for name, transform in self._iter(with_final=with_final): From de63353cd8e48d2dd14194c28ad33bf23053147f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 27 Nov 2018 15:33:51 -0500 Subject: [PATCH 005/100] fix docstring --- sklearn/base.py | 2 +- sklearn/impute.py | 2 +- sklearn/pipeline.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 1629fac63503e..392eb6ed60573 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -524,7 +524,7 @@ def get_feature_names(self, input_features=None): Parameters ---------- - input_feature : array-like of string + input_features : array-like of string Input feature names. Returns diff --git a/sklearn/impute.py b/sklearn/impute.py index 5b23ab8f866a8..c2368731e148e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -414,7 +414,7 @@ def get_feature_names(self, input_features=None): Parameters ---------- - input_feature : array-like of string + input_features : array-like of string Input feature names. Returns diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 64e37e29b0365..3fb3119941491 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -540,7 +540,7 @@ def get_feature_names(self, input_features=None): Parameters ---------- - input_feature : array-like of string + input_features : array-like of string Input feature names. 
Returns From 449ed2310c8a0804c175adf81b216c137358c371 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 31 May 2019 10:49:15 -0400 Subject: [PATCH 006/100] fix merge issue --- sklearn/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 072e452dea9d6..97ddde7333641 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -628,7 +628,7 @@ def get_feature_names(self, input_features=None): """ feature_names = input_features with_final = hasattr(self._final_estimator, "transform") - for name, transform in self._iter(with_final=with_final): + for i, name, transform in self._iter(with_final=with_final): if not hasattr(transform, "get_feature_names"): raise TypeError("Transformer {} does provide" " get_feature_names".format(name)) From b929341a38400f03c9988d36deca81973ff0af7f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 21 May 2020 18:27:57 -0400 Subject: [PATCH 007/100] don't do magic slicing in pipeline.get_feature_names --- sklearn/pipeline.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8c6ce36ac5d07..7734181426275 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -636,11 +636,12 @@ def get_feature_names(self, input_features=None): Transformed feature names """ feature_names = input_features - with_final = hasattr(self._final_estimator, "transform") - for i, name, transform in self._iter(with_final=with_final): + for i, name, transform in self._iter(): if not hasattr(transform, "get_feature_names"): - raise TypeError("Transformer {} does provide" - " get_feature_names".format(name)) + raise TypeError( + "Transformer {} does provide get_feature_names." 
+ "Did you mean to call Pipeline[:-1].get_feature_names" + "()?".format(name)) try: feature_names = transform.get_feature_names( input_features=feature_names) From 2b613e5dea66516515fc5f4cdc15923ff6b4cd0e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 21 May 2020 18:34:31 -0400 Subject: [PATCH 008/100] fix merge issue --- sklearn/compose/_column_transformer.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 5186593a3a229..f4e3d2dfd3820 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -352,26 +352,27 @@ def get_feature_names(self): """ check_is_fitted(self) feature_names = [] - for name, trans, columns, _ in self._iter(fitted=True): - if trans == 'drop': + for name, trans, column, _ in self._iter(fitted=True): + if trans == 'drop' or ( + hasattr(column, '__len__') and not len(column)): continue if trans == 'passthrough': if hasattr(self, '_df_columns'): - if ((not isinstance(columns, slice)) - and all(isinstance(col, str) for col in columns)): - feature_names.extend(columns) + if ((not isinstance(column, slice)) + and all(isinstance(col, str) for col in column)): + feature_names.extend(column) else: - feature_names.extend(self._df_columns[columns]) + feature_names.extend(self._df_columns[column]) else: indices = np.arange(self._n_features) - feature_names.extend(['x%d' % i for i in indices[columns]]) + feature_names.extend(['x%d' % i for i in indices[column]]) continue if not hasattr(trans, 'get_feature_names'): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names." 
% (str(name), type(trans).__name__)) try: - more_names = trans.get_feature_names(input_features=columns) + more_names = trans.get_feature_names(input_features=column) except TypeError: more_names = trans.get_feature_names() feature_names.extend([name + "__" + f for f in From 5eb76039bf48afd89a5322b884c3c2ab0c80b2cb Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 2 Jun 2020 15:51:16 -0400 Subject: [PATCH 009/100] trying to merge with input feature pr --- doc/modules/compose.rst | 41 ++++-- .../plot_column_transformer_mixed_types.py | 44 ++++++ .../plot_feature_selection_pipeline.py | 7 +- sklearn/base.py | 43 ++++++ .../compose/tests/test_column_transformer.py | 12 ++ sklearn/ensemble/_forest.py | 9 ++ sklearn/feature_selection/_base.py | 15 +++ sklearn/impute/_base.py | 31 +++++ sklearn/tests/test_pipeline.py | 127 ++++++++++++++++++ sklearn/utils/estimator_checks.py | 9 ++ 10 files changed, 327 insertions(+), 11 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 6388c9b7d4323..31486a9e4ae41 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -139,6 +139,29 @@ or by name:: >>> pipe['reduce_dim'] PCA() +To enable model inspection, `Pipeline` has a ``get_feature_names()`` method, +just like all transformers. You can use pipeline slicing to get the feature names +going into each step:: + + >>> from sklearn.datasets import load_iris + >>> from sklearn.feature_selection import SelectKBest + >>> iris = load_iris() + >>> pipe = Pipeline(steps=[ + ... ('select', SelectKBest(k=2)), + ... ('clf', LogisticRegression())]) + >>> pipe.fit(iris.data, iris.target) + ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + Pipeline(memory=None, + steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) + >>> pipe[:-1].get_feature_names() + array(['x2', 'x3'], dtype='<U2') + +You can also provide custom feature names for a more human readable format using +``get_feature_names``:: + + >>> pipe[:-1].get_feature_names(iris.feature_names) + ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] + .. 
topic:: Examples: * :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py` @@ -428,7 +451,7 @@ By default, the remaining rating columns are ignored (``remainder='drop'``):: >>> from sklearn.feature_extraction.text import CountVectorizer >>> from sklearn.preprocessing import OneHotEncoder >>> column_trans = ColumnTransformer( - ... [('city_category', OneHotEncoder(dtype='int'),['city']), + ... [('categories', OneHotEncoder(dtype='int'),['city']), ... ('title_bow', CountVectorizer(), 'title')], ... remainder='drop') @@ -438,11 +461,11 @@ By default, the remaining rating columns are ignored (``remainder='drop'``):: ('title_bow', CountVectorizer(), 'title')]) >>> column_trans.get_feature_names() - ['city_category__x0_London', 'city_category__x0_Paris', 'city_category__x0_Sallisaw', - 'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his', - 'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable', - 'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson', - 'title_bow__wrath'] + ['categories__city_London', 'categories__city_Paris', + 'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast', + 'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last', + 'title_bow__learned', 'title_bow__moveable', 'title_bow__of', 'title_bow__the', + 'title_bow__trick', 'title_bow__watson', 'title_bow__wrath'] >>> column_trans.transform(X).toarray() array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0], @@ -459,7 +482,7 @@ to specify the column as a list of strings (``['city']``). Apart from a scalar or a single item list, the column selection can be specified as a list of multiple items, an integer array, a slice, a boolean mask, or -with a :func:`~sklearn.compose.make_column_selector`. The +with a :func:`~sklearn.compose.make_column_selector`. 
The :func:`~sklearn.compose.make_column_selector` is used to select columns based on data type or column name:: @@ -544,8 +567,8 @@ many estimators. This visualization is activated by setting the >>> # diplays HTML representation in a jupyter context >>> column_trans # doctest: +SKIP -An example of the HTML output can be seen in the -**HTML representation of Pipeline** section of +An example of the HTML output can be seen in the +**HTML representation of Pipeline** section of :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`. As an alternative, the HTML can be written to a file using :func:`~sklearn.utils.estimator_html_repr`:: diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index bd4ed48cabfd9..f5e2b08f12b15 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -145,6 +145,50 @@ clf.fit(X_train, y_train) print("model score: %.3f" % clf.score(X_test, y_test)) + +############################################################################### +# Inspecting the coefficient values of the classifier +############################################################################### +# The coefficients of the final classification step of the pipeline give an +# idea how each feature impacts the likelihood of survival assuming that the +# usual linear model assumptions hold (uncorrelated features, linear +# separability, homoscedastic errors...) which we do not verify in this +# example. +# +# To get error bars we perform cross-validation and compute the mean and +# standard deviation for each coefficient across CV splits. Because we use a +# standard scaler on the numerical features, the coefficient weights give us +# an idea on how much the log odds of surviving are impacted by a change in +# this dimension contrasted to the mean. 
Note that the categorical features +# here are overspecified which makes it slightly harder to interpret because of +# the information redundancy. +# +# We can see that the linear model coefficients are in agreement with the +# historical reports: people in higher classes and therefore in the upper decks +# were the first to reach the lifeboats, and often, priority was given to women +# and children. +# +# Note that conditioned on the "pclass_x" one-hot features, the "fare" +# numerical feature does not seem to be significantly predictive. If we drop +# the "pclass" feature, then higher "fare" values would appear significantly +# correlated with a higher likelihood of survival as the "fare" and "pclass" +# features have a strong statistical dependency. + +import matplotlib.pyplot as plt +from sklearn.model_selection import cross_validate +from sklearn.model_selection import StratifiedShuffleSplit + +cv = StratifiedShuffleSplit(n_splits=20, test_size=0.25, random_state=42) +cv_results = cross_validate(clf, X_train, y_train, cv=cv, + return_estimator=True) +cv_coefs = np.concatenate([cv_pipeline[-1].coef_ + for cv_pipeline in cv_results["estimator"]]) +fig, ax = plt.subplots() +ax.barh(clf[:-1].get_feature_names(), + cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0)) +plt.tight_layout() +plt.show() + ############################################################################### # The resulting score is not exactly the same as the one from the previous # pipeline becase the dtype-based selector treats the ``pclass`` columns as diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index b908def5c6709..8298e9b24528f 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -9,6 +9,7 @@ Using a sub-pipeline, the fitted coefficients can be mapped back into the original feature space. 
""" +import matplotlib.pyplot as plt from sklearn import svm from sklearn.datasets import make_classification from sklearn.feature_selection import SelectKBest, f_regression @@ -36,5 +37,7 @@ y_pred = anova_svm.predict(X_test) print(classification_report(y_test, y_pred)) -coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_) -print(coef) +# access and plot the coefficients of the fitted model +plt.barh((0, 1, 2), anova_svm[-1].coef_.ravel()) +plt.yticks((0, 1, 2), anova_svm[:-1].get_feature_names()) +plt.show() diff --git a/sklearn/base.py b/sklearn/base.py index 4ee88a0bdd1c5..72aebdacb4082 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -688,6 +688,49 @@ def fit_transform(self, X, y=None, **fit_params): # fit method of arity 2 (supervised transformation) return self.fit(X, y, **fit_params).transform(X) + def get_feature_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. + """ + # OneToOneMixin is higher in the class hierarchy + # because we put mixins on the wrong side + if hasattr(super(), 'get_feature_names'): + return super().get_feature_names(input_features) + # generate feature names from class name by default + # would be much less guessing if we stored the number + # of output features. + # Ideally this would be done in each class. 
+ if hasattr(self, 'n_clusters'): + # this is before n_components_ + # because n_components_ means something else + # in agglomerative clustering + n_features = self.n_clusters + elif hasattr(self, '_max_components'): + # special case for LinearDiscriminantAnalysis + n_components = self.n_components or np.inf + n_features = min(self._max_components, n_components) + elif hasattr(self, 'n_components_'): + # n_components could be auto or None + # this is more likely to be an int + n_features = self.n_components_ + elif hasattr(self, 'n_components') and self.n_components is not None: + n_features = self.n_components + elif hasattr(self, 'components_'): + n_features = self.components_.shape[0] + else: + return None + return ["{}{}".format(type(self).__name__.lower(), i) + for i in range(n_features)] + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index a9f1764eb97e4..8b67fa77652b1 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -23,6 +23,7 @@ from sklearn.preprocessing import FunctionTransformer from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder from sklearn.feature_extraction import DictVectorizer +from sklearn.pipeline import make_pipeline class Trans(BaseEstimator): @@ -660,6 +661,17 @@ def test_column_transformer_get_feature_names(): "Transformer trans (type Trans) does not provide " "get_feature_names", ct.get_feature_names) + # if some transformers support and some don't + ct = ColumnTransformer([('trans', Trans(), [0, 1]), + ('scale', StandardScaler(), [0])]) + ct.fit(X_array) + assert_raise_message(AttributeError, + "Transformer trans (type Trans) does not provide " + "get_feature_names", ct.get_feature_names) + + # inside a pipeline + make_pipeline(ct).fit(X_array) + # working example X = np.array([[{'a': 1, 'b': 2}, 
{'a': 3, 'b': 4}], [{'c': 5}, {'c': 6}]], dtype=object).T diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 98d606961c1e1..30b5370179144 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2357,3 +2357,12 @@ def transform(self, X): """ check_is_fitted(self) return self.one_hot_encoder_.transform(self.apply(X)) + + def get_feature_names(self, input_features=None): + """Feature names - not implemented yet. + + Parameters + ---------- + input_features : list of strings or None + """ + return None diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 8644cf0ca2aef..babb0aa84ddbf 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -125,6 +125,21 @@ def inverse_transform(self, X): Xt[:, support] = X return Xt + def get_feature_names(self, input_features=None): + """Mask feature names according to selected features. + + Parameters + ---------- + input_features : list of string or None + Input features to select from. If none, they are generated as + x0, x1, ..., xn. 
+ """ + mask = self.get_support() + if input_features is None: + input_features = ['x%d' % i + for i in range(mask.shape[0])] + return np.array(input_features)[mask] + def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1): diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 517de982d8478..b8ad193622e4c 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -312,6 +312,15 @@ def fit(self, X, y=None): self.strategy, self.missing_values, fill_value) + + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values) + self.indicator_.fit(X) + else: + self.indicator_ = None + invalid_mask = _get_mask(self.statistics_, np.nan) + self._valid_mask = np.logical_not(invalid_mask) return self def _sparse_fit(self, X, strategy, missing_values, fill_value): @@ -462,6 +471,28 @@ def transform(self, X): return super()._concatenate_indicator(X, X_indicator) + def _more_tags(self): + return {'allow_nan': True} + + def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Parameters + ---------- + input_features : array-like of string + Input feature names. + + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ + check_is_fitted(self, 'statistics_') + if input_features is None: + input_features = ['x%d' % i + for i in range(self.statistics_.shape[0])] + return np.array(input_features)[self._valid_mask] + class MissingIndicator(TransformerMixin, BaseEstimator): """Binary indicators for missing values. 
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 9f61b12e24ca1..af64f1d531205 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -21,18 +21,21 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_no_warnings +from sklearn.exceptions import NotFittedError from sklearn.base import clone, BaseEstimator, TransformerMixin from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union from sklearn.svm import SVC from sklearn.neighbors import LocalOutlierFactor from sklearn.linear_model import LogisticRegression, Lasso from sklearn.linear_model import LinearRegression +from sklearn.multiclass import OneVsRestClassifier from sklearn.cluster import KMeans from sklearn.feature_selection import SelectKBest, f_classif from sklearn.dummy import DummyRegressor from sklearn.decomposition import PCA, TruncatedSVD from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler +from sklearn.impute import SimpleImputer from sklearn.feature_extraction.text import CountVectorizer from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier @@ -1101,6 +1104,130 @@ def test_make_pipeline_memory(): shutil.rmtree(cachedir) +def test_set_input_features(): + pipe = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler()), + ('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]) + assert_raises(NotFittedError, pipe.get_feature_names) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe[:1].get_feature_names(), xs) + mask = pipe.named_steps.select.get_support() + assert_array_equal(pipe[:-1].get_feature_names(), xs[mask]) + res = pipe.get_feature_names(iris.feature_names) + # LogisticRegression doesn't have get_feature_names + assert res is None + 
assert_array_equal(pipe[:1].get_feature_names(iris.feature_names), + iris.feature_names) + assert_array_equal(pipe[:-1].get_feature_names(iris.feature_names), + np.array(iris.feature_names)[mask]) + pipe = Pipeline(steps=[ + ('scaler', StandardScaler()), + ('pca', PCA(n_components=3)), + ('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]) + pipe.fit(iris.data, iris.target) + assert_array_equal(pipe[:-1].get_feature_names(), ['pca0', 'pca1']) + # setting names doesn't change names after PCA + assert_array_equal(pipe[:-2].get_feature_names(iris.feature_names), + ['pca0', 'pca1', 'pca2']) + + +def test_input_feature_names_pandas(): + pd = pytest.importorskip("pandas") + pipe = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler()), + ('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]) + iris = load_iris() + df = pd.DataFrame(iris.data, columns=iris.feature_names) + pipe.fit(df, iris.target) + mask = pipe.named_steps.select.get_support() + assert_array_equal(pipe[:-1].get_feature_names(), + np.array(iris.feature_names)[mask]) + + +def test_features_names_passthrough(): + pipe = Pipeline(steps=[ + ('imputer', 'passthrough'), + ('scaler', StandardScaler()), + ('select', 'passthrough'), + ('clf', LogisticRegression())]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = ['x0', 'x1', 'x2', 'x3'] + assert_array_equal(pipe[:-1].get_feature_names(), xs) + assert_array_equal(pipe[:-1].get_feature_names(iris.feature_names), + iris.feature_names) + + +def test_feature_names_count_vectorizer(): + pipe = Pipeline(steps=[ + ('vect', CountVectorizer()), + ('clf', LogisticRegression())]) + y = ["pizza" in x for x in JUNK_FOOD_DOCS] + pipe.fit(JUNK_FOOD_DOCS, y) + assert_array_equal(pipe[:-1].get_feature_names(), + ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) + assert_array_equal(pipe[:-1].get_feature_names("nonsense_is_ignored"), + ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) + + 
+def test_feature_names_nested(): + pipe = Pipeline(steps=[ + ('inner_pipe', Pipeline(steps=[('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]))]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + mask = pipe.named_steps.inner_pipe.named_steps.select.get_support() + assert_array_equal( + pipe.named_steps.inner_pipe[:1].get_feature_names(), xs[mask]) + assert_array_equal( + pipe.named_steps.inner_pipe[:1].get_feature_names(iris.feature_names), + np.array(iris.feature_names)[mask]) + + +def test_feature_names_meta_pipe(): + ovr = OneVsRestClassifier(Pipeline(steps=[('select', SelectKBest(k=2)), + ('clf', LogisticRegression())])) + pipe = Pipeline(steps=[('ovr', ovr)]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + # check 0ths estimator in OVR only + inner_pipe = pipe.named_steps.ovr.estimators_[0] + mask = inner_pipe.named_steps.select.get_support() + assert_array_equal(inner_pipe.named_steps.clf.input_features_, xs[mask]) + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(inner_pipe.input_features_, iris.feature_names) + assert_array_equal(inner_pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + + +def test_input_features_meta(): + ovr = OneVsRestClassifier(LogisticRegression()) + pipe = Pipeline(steps=[('select', SelectKBest(k=2)), ('ovr', ovr)]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + # check 0ths estimator in OVR only + one_logreg = pipe.named_steps.ovr.estimators_[0] + mask = pipe.named_steps.select.get_support() + assert_array_equal(one_logreg.input_features_, xs[mask]) + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.input_features_, iris.feature_names) + 
assert_array_equal(one_logreg.input_features_, + np.array(iris.feature_names)[mask]) + + def test_pipeline_param_error(): clf = make_pipeline(LogisticRegression()) with pytest.raises(ValueError, match="Pipeline.fit does not accept " diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index bbde6264a1c77..c93e05e43e0be 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1257,6 +1257,15 @@ def _check_transformer(name, transformer_orig, X, y): transformer_clone = clone(transformer) X_pred = transformer_clone.fit_transform(X, y=y_) + input_features = ['feature%d' % i for i in range(n_features)] + if hasattr(transformer_clone, 'get_feature_names'): + feature_names = transformer_clone.get_feature_names(input_features) + if feature_names is not None: + if isinstance(X_pred, tuple): + assert len(feature_names) == X_pred[0].shape[1] + else: + assert len(feature_names) == X_pred.shape[1] + if isinstance(X_pred, tuple): for x_pred in X_pred: assert x_pred.shape[0] == n_samples From 3a9054c5a9f6a4ba6513a849e70d8d54e657d65a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 2 Jun 2020 16:01:03 -0400 Subject: [PATCH 010/100] remove tests that don't apply --- sklearn/tests/test_pipeline.py | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index af64f1d531205..de553ef2623c9 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1199,32 +1199,12 @@ def test_feature_names_meta_pipe(): iris = load_iris() pipe.fit(iris.data, iris.target) xs = np.array(['x0', 'x1', 'x2', 'x3']) - assert_array_equal(pipe.input_features_, xs) - # check 0ths estimator in OVR only - inner_pipe = pipe.named_steps.ovr.estimators_[0] - mask = inner_pipe.named_steps.select.get_support() - assert_array_equal(inner_pipe.named_steps.clf.input_features_, xs[mask]) -
pipe.get_feature_names(iris.feature_names) - assert_array_equal(pipe.input_features_, iris.feature_names) - assert_array_equal(inner_pipe.input_features_, iris.feature_names) - assert_array_equal(inner_pipe.named_steps.clf.input_features_, - np.array(iris.feature_names)[mask]) - - -def test_input_features_meta(): - ovr = OneVsRestClassifier(LogisticRegression()) - pipe = Pipeline(steps=[('select', SelectKBest(k=2)), ('ovr', ovr)]) - iris = load_iris() - pipe.fit(iris.data, iris.target) - xs = np.array(['x0', 'x1', 'x2', 'x3']) - assert_array_equal(pipe.input_features_, xs) + assert_array_equal(pipe[:-1].get_feature_names(), xs) # check 0ths estimator in OVR only - one_logreg = pipe.named_steps.ovr.estimators_[0] - mask = pipe.named_steps.select.get_support() - assert_array_equal(one_logreg.input_features_, xs[mask]) - pipe.get_feature_names(iris.feature_names) - assert_array_equal(pipe.input_features_, iris.feature_names) - assert_array_equal(one_logreg.input_features_, + inner_pipe = pipe['ovr'].estimators_[0] + mask = inner_pipe['select'].get_support() + assert_array_equal(inner_pipe[:-1].get_feature_names(), xs[mask]) + assert_array_equal(inner_pipe[:-1].get_feature_names(iris.feature_names), np.array(iris.feature_names)[mask]) From 76f5b544f89e8745873c2b518d8fa01084064b35 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 16:09:42 -0400 Subject: [PATCH 011/100] fix onetoone mixing feature names --- sklearn/base.py | 4 ++-- sklearn/tests/test_pipeline.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 8765de87b6e9d..a97f7a1580369 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -806,8 +806,8 @@ def get_feature_names(self, input_features=None): if input_features is not None: return input_features else: - raise ValueError("Don't know how to get" - " input feature names for {}".format(self)) + return ["x{}".format(i) + for i in range(self.n_features_in_)] class 
MetaEstimatorMixin: diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index de553ef2623c9..0770467dbfd50 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1104,22 +1104,23 @@ def test_make_pipeline_memory(): shutil.rmtree(cachedir) -def test_set_input_features(): +def test_feature_names_basic(): pipe = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler()), ('select', SelectKBest(k=2)), ('clf', LogisticRegression())]) - assert_raises(NotFittedError, pipe.get_feature_names) + with pytest.raises(NotFittedError): + pipe.get_feature_names() iris = load_iris() pipe.fit(iris.data, iris.target) xs = np.array(['x0', 'x1', 'x2', 'x3']) assert_array_equal(pipe[:1].get_feature_names(), xs) mask = pipe.named_steps.select.get_support() assert_array_equal(pipe[:-1].get_feature_names(), xs[mask]) - res = pipe.get_feature_names(iris.feature_names) - # LogisticRegression doesn't have get_feature_names - assert res is None + with pytest.raises(TypeError, + match="Transformer clf does provide get_feature_names."): + pipe.get_feature_names(iris.feature_names) assert_array_equal(pipe[:1].get_feature_names(iris.feature_names), iris.feature_names) assert_array_equal(pipe[:-1].get_feature_names(iris.feature_names), From 52f38e1844c9a529c7e0237d7afffc289c100854 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 16:12:01 -0400 Subject: [PATCH 012/100] remove more tests --- sklearn/tests/test_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 0770467dbfd50..4f2971269a1ec 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1148,7 +1148,8 @@ def test_input_feature_names_pandas(): df = pd.DataFrame(iris.data, columns=iris.feature_names) pipe.fit(df, iris.target) mask = pipe.named_steps.select.get_support() - 
assert_array_equal(pipe[:-1].get_feature_names(), + # for now assuming we have to pass these explicitly + assert_array_equal(pipe[:-1].get_feature_names(iris.feature_names), np.array(iris.feature_names)[mask]) @@ -1200,7 +1201,6 @@ def test_feature_names_meta_pipe(): iris = load_iris() pipe.fit(iris.data, iris.target) xs = np.array(['x0', 'x1', 'x2', 'x3']) - assert_array_equal(pipe[:-1].get_feature_names(), xs) # check 0ths estimator in OVR only inner_pipe = pipe['ovr'].estimators_[0] mask = inner_pipe['select'].get_support() From cdda1fb38cf5d49ae5576e504d2b58ee1d428ca1 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 17:05:56 -0400 Subject: [PATCH 013/100] fix test for better expected outputs --- sklearn/compose/tests/test_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 8b67fa77652b1..1fae9b8271082 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1379,4 +1379,4 @@ def test_feature_names_empty_columns(empty_col): ) ct.fit(df) - assert ct.get_feature_names() == ['ohe__x0_a', 'ohe__x0_b', 'ohe__x1_z'] + assert ct.get_feature_names() == ['ohe__col1_a', 'ohe__col1_b', 'ohe__col2_z'] From 5f4abbcf96201c415a3bf97d65dab6cad65f599c Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 17:19:17 -0400 Subject: [PATCH 014/100] fix priorities in catch-all get_feature_names --- sklearn/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index a97f7a1580369..7ef2db57eae93 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -723,10 +723,10 @@ def get_feature_names(self, input_features=None): # n_components could be auto or None # this is more likely to be an int n_features = self.n_components_ - elif hasattr(self, 'n_components') and self.n_components is not None: 
- n_features = self.n_components elif hasattr(self, 'components_'): n_features = self.components_.shape[0] + elif hasattr(self, 'n_components') and self.n_components is not None: + n_features = self.n_components else: return None return ["{}{}".format(type(self).__name__.lower(), i) From 4305a28083ad2fe7f54f2db1e215670860a88482 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 17:24:47 -0400 Subject: [PATCH 015/100] flake8 --- sklearn/compose/tests/test_column_transformer.py | 3 ++- sklearn/tests/test_pipeline.py | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 1fae9b8271082..22a42dc309d5e 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1379,4 +1379,5 @@ def test_feature_names_empty_columns(empty_col): ) ct.fit(df) - assert ct.get_feature_names() == ['ohe__col1_a', 'ohe__col1_b', 'ohe__col2_z'] + assert ct.get_feature_names() == ['ohe__col1_a', 'ohe__col1_b', + 'ohe__col2_z'] diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 4f2971269a1ec..6599b7cca2f08 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -175,8 +175,8 @@ def test_pipeline_init(): clf = NoTrans() pipe = Pipeline([('svc', clf)]) assert (pipe.get_params(deep=True) == - dict(svc__a=None, svc__b=None, svc=clf, - **pipe.get_params(deep=False))) + dict(svc__a=None, svc__b=None, svc=clf, + **pipe.get_params(deep=False))) # Check that params are set pipe.set_params(svc__a=0.1) @@ -1118,8 +1118,9 @@ def test_feature_names_basic(): assert_array_equal(pipe[:1].get_feature_names(), xs) mask = pipe.named_steps.select.get_support() assert_array_equal(pipe[:-1].get_feature_names(), xs[mask]) - with pytest.raises(TypeError, - match="Transformer clf does provide get_feature_names."): + with pytest.raises( + TypeError, + 
match="Transformer clf does provide get_feature_names."): pipe.get_feature_names(iris.feature_names) assert_array_equal(pipe[:1].get_feature_names(iris.feature_names), iris.feature_names) From c387b5bafbdb6cf34b40aba4d894d4a77ff02a39 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 17:27:08 -0400 Subject: [PATCH 016/100] remove redundant code --- sklearn/impute/_base.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index b8ad193622e4c..93c499679ed22 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -313,12 +313,6 @@ def fit(self, X, y=None): self.missing_values, fill_value) - if self.add_indicator: - self.indicator_ = MissingIndicator( - missing_values=self.missing_values) - self.indicator_.fit(X) - else: - self.indicator_ = None invalid_mask = _get_mask(self.statistics_, np.nan) self._valid_mask = np.logical_not(invalid_mask) return self From 2fefb67da057c3eafd0564747aaf321b1ae64656 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 18:15:25 -0400 Subject: [PATCH 017/100] fix error message --- sklearn/pipeline.py | 2 +- sklearn/tests/test_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 7734181426275..5972cdb98f346 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -639,7 +639,7 @@ def get_feature_names(self, input_features=None): for i, name, transform in self._iter(): if not hasattr(transform, "get_feature_names"): raise TypeError( - "Transformer {} does provide get_feature_names." + "Estimator {} does provide get_feature_names. 
" "Did you mean to call Pipeline[:-1].get_feature_names" "()?".format(name)) try: diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 6599b7cca2f08..32869d14a41cc 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1120,7 +1120,7 @@ def test_feature_names_basic(): assert_array_equal(pipe[:-1].get_feature_names(), xs[mask]) with pytest.raises( TypeError, - match="Transformer clf does provide get_feature_names."): + match="Estimator clf does provide get_feature_names."): pipe.get_feature_names(iris.feature_names) assert_array_equal(pipe[:1].get_feature_names(iris.feature_names), iris.feature_names) From a6832c3429e7b16e6d6834d6d95e1fc77fc95b13 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 19:36:20 -0400 Subject: [PATCH 018/100] fix mixin order --- sklearn/base.py | 4 ---- sklearn/preprocessing/_data.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 7ef2db57eae93..67b10281919a5 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -702,10 +702,6 @@ def get_feature_names(self, input_features=None): output_feature_names : list of string Feature names for transformer output. """ - # OneToOneMixin is higher in the class hierarchy - # because we put mixins on the wrong side - if hasattr(super(), 'get_feature_names'): - return super().get_feature_names(input_features) # generate feature names from class name by default # would be much less guessing if we stored the number # of output features. 
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index e58387c973fa3..8552c4bc45fce 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -19,7 +19,7 @@ from scipy import optimize from scipy.special import boxcox -from ..base import BaseEstimator, TransformerMixin, OneToOneMixin +from ..base import BaseEstimator, OneToOneMixin, TransformerMixin from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var @@ -202,7 +202,7 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): return X -class MinMaxScaler(TransformerMixin, OneToOneMixin, BaseEstimator): +class MinMaxScaler(OneToOneMixin, TransformerMixin, BaseEstimator): """Transform features by scaling each feature to a given range. This estimator scales and translates each feature individually such @@ -524,7 +524,7 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): return X -class StandardScaler(TransformerMixin, OneToOneMixin, BaseEstimator): +class StandardScaler(OneToOneMixin, TransformerMixin, BaseEstimator): """Standardize features by removing the mean and scaling to unit variance The standard score of a sample `x` is calculated as: @@ -871,7 +871,7 @@ def _more_tags(self): return {'allow_nan': True} -class MaxAbsScaler(TransformerMixin, OneToOneMixin, BaseEstimator): +class MaxAbsScaler(OneToOneMixin, TransformerMixin, BaseEstimator): """Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such @@ -1131,7 +1131,7 @@ def maxabs_scale(X, *, axis=0, copy=True): return X -class RobustScaler(TransformerMixin, OneToOneMixin, BaseEstimator): +class RobustScaler(OneToOneMixin, TransformerMixin, BaseEstimator): """Scale features using statistics that are robust to outliers. 
This Scaler removes the median and scales the data according to @@ -1840,7 +1840,7 @@ def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): return X -class Normalizer(TransformerMixin, OneToOneMixin, BaseEstimator): +class Normalizer(OneToOneMixin, TransformerMixin, BaseEstimator): """Normalize samples individually to unit norm. Each sample (i.e. each row of the data matrix) with at least one @@ -2001,7 +2001,7 @@ def binarize(X, *, threshold=0.0, copy=True): return X -class Binarizer(TransformerMixin, OneToOneMixin, BaseEstimator): +class Binarizer(OneToOneMixin, TransformerMixin, BaseEstimator): """Binarize data (set feature values to 0 or 1) according to a threshold Values greater than the threshold map to 1, while values less than @@ -2270,7 +2270,7 @@ def add_dummy_feature(X, value=1.0): return np.hstack((np.full((n_samples, 1), value), X)) -class QuantileTransformer(TransformerMixin, OneToOneMixin, BaseEstimator): +class QuantileTransformer(OneToOneMixin, TransformerMixin, BaseEstimator): """Transform features using quantiles information. This method transforms the features to follow a uniform or a normal @@ -2829,7 +2829,7 @@ def quantile_transform(X, *, axis=0, n_quantiles=1000, " axis={}".format(axis)) -class PowerTransformer(TransformerMixin, OneToOneMixin, BaseEstimator): +class PowerTransformer(OneToOneMixin, TransformerMixin, BaseEstimator): """Apply a power transform featurewise to make data more Gaussian-like. 
Power transforms are a family of parametric, monotonic transformations From 0f45b221ec28beaf5558d5c0698c59e30b2adbf7 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 19:51:58 -0400 Subject: [PATCH 019/100] small refactor with helper function --- sklearn/base.py | 13 ++++++------- sklearn/feature_selection/_base.py | 6 +++--- sklearn/impute/_base.py | 6 +++--- sklearn/utils/_feature_names.py | 25 +++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 13 deletions(-) create mode 100644 sklearn/utils/_feature_names.py diff --git a/sklearn/base.py b/sklearn/base.py index 67b10281919a5..ef9df8b1a0011 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -17,6 +17,7 @@ from .utils import _IS_32BIT from .utils.validation import check_X_y from .utils.validation import check_array +from .utils._feature_names import _make_feature_names from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args @@ -725,8 +726,8 @@ def get_feature_names(self, input_features=None): n_features = self.n_components else: return None - return ["{}{}".format(type(self).__name__.lower(), i) - for i in range(n_features)] + return _make_feature_names(n_features=n_features, + prefix=type(self).__name__.lower()) class DensityMixin: @@ -799,11 +800,9 @@ def get_feature_names(self, input_features=None): feature_names : array-like of string Transformed feature names """ - if input_features is not None: - return input_features - else: - return ["x{}".format(i) - for i in range(self.n_features_in_)] + + return _make_feature_names(self.n_features_in_, + input_features=input_features) class MetaEstimatorMixin: diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index babb0aa84ddbf..4266d455052c9 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -15,6 +15,7 @@ from ..utils import check_array from ..utils import safe_mask from ..utils import safe_sqr 
+from ..utils._feature_names import _make_feature_names class SelectorMixin(TransformerMixin, metaclass=ABCMeta): @@ -135,9 +136,8 @@ def get_feature_names(self, input_features=None): x0, x1, ..., xn. """ mask = self.get_support() - if input_features is None: - input_features = ['x%d' % i - for i in range(mask.shape[0])] + input_features = _make_feature_names(mask.shape[0], + input_features=input_features) return np.array(input_features)[mask] diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 93c499679ed22..37cdd93bacb13 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -16,6 +16,7 @@ from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask +from ..utils._feature_names import _make_feature_names from ..utils import is_scalar_nan @@ -482,9 +483,8 @@ def get_feature_names(self, input_features=None): Transformed feature names """ check_is_fitted(self, 'statistics_') - if input_features is None: - input_features = ['x%d' % i - for i in range(self.statistics_.shape[0])] + input_features = _make_feature_names(self.statistics_.shape[0], + input_features=input_features) return np.array(input_features)[self._valid_mask] diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py new file mode 100644 index 0000000000000..402fda2fe17cb --- /dev/null +++ b/sklearn/utils/_feature_names.py @@ -0,0 +1,25 @@ +def _make_feature_names(n_features, prefix='x', input_features=None): + """Make feature name strings from n_features. + + Either returns input_feature names if it is not None, or creates + placeholder names based on n_features, by default, + ['x0', 'x1', ..., 'xn_features'] is generated. + + Parameters + ---------- + n_features : int + Number of feature names to generate + prefix : str, default='x' + Prefix for each feature name. + input_features : array-like of string + Optional existing input features, returned unchanged if not None. 
+ + Returns + ------- + feature_names : list of str + Generated feature names of length n_features. + """ + if input_features is not None: + return input_features + return ["{}{}".format(prefix, i) + for i in range(n_features)] From 4717a737ba2fd0702bb9a20ffdb4e39101b9eb21 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 20:03:40 -0400 Subject: [PATCH 020/100] linting for new options --- doc/modules/compose.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 31486a9e4ae41..f2204baf913bb 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -150,7 +150,6 @@ going into each step:: ... ('select', SelectKBest(k=2)), ... ('clf', LogisticRegression())]) >>> pipe.fit(iris.data, iris.target) - ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS Pipeline(memory=None, steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) >>> pipe[:-1].get_feature_names() From a658ba77a3c8f5ff9dcef470bf6424544e4c5067 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 20:10:35 -0400 Subject: [PATCH 021/100] add feature names to lineardiscriminantanalysis and birch --- sklearn/cluster/_birch.py | 18 ++++++++++++++++++ sklearn/discriminant_analysis.py | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index cdb21ccbff044..207cf9ff4ea18 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -15,6 +15,7 @@ from ..utils import check_array from ..utils.extmath import row_norms from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils._feature_names import _make_feature_names from ..exceptions import ConvergenceWarning from . import AgglomerativeClustering @@ -656,3 +657,20 @@ def _global_clustering(self, X=None): if compute_labels: self.labels_ = self.predict(X) + + def get_feature_names(self, input_features=None): + """Get output feature names. 
+ + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. + """ + return _make_feature_names( + n_features=self.subcluster_centers_.shape[0], + prefix=type(self).__name__.lower()) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 96a17fc5a34a5..efd546e54fd09 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -24,6 +24,7 @@ from .utils.extmath import softmax from .preprocessing import StandardScaler from .utils.validation import _deprecate_positional_args +from .utils._feature_names import _make_feature_names __all__ = ['LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis'] @@ -562,6 +563,23 @@ def decision_function(self, X): # Only override for the doc return super().decision_function(X) + def get_feature_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. 
+ """ + n_components = self.n_components or np.inf + n_features = min(self._max_components, n_components) + return _make_feature_names(n_features=n_features, + prefix=type(self).__name__.lower()) class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): """Quadratic Discriminant Analysis From e9e45af006eb60dbb60e3dac31e25bd9d5aca982 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 2 Jun 2020 20:26:15 -0400 Subject: [PATCH 022/100] add get_feature_names in a couple more places --- sklearn/cluster/_kmeans.py | 35 ++++++++++++++++++++++++++++++++ sklearn/discriminant_analysis.py | 1 + sklearn/impute/_base.py | 17 ++++++++++++++++ sklearn/neighbors/_graph.py | 35 ++++++++++++++++++++++++++++++++ 4 files changed, 88 insertions(+) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 38d8120bf452c..11afc1bd0311c 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -28,6 +28,7 @@ from ..utils import check_random_state from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._feature_names import _make_feature_names from ..exceptions import ConvergenceWarning from ._k_means_fast import _inertia_dense from ._k_means_fast import _inertia_sparse @@ -1215,6 +1216,23 @@ def score(self, X, y=None, sample_weight=None): return -_labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[1] + def get_feature_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. 
+ """ + return _make_feature_names( + n_features=self.n_clusters, + prefix=type(self).__name__.lower()) + def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, old_center_buffer, compute_squared_diff, @@ -1871,3 +1889,20 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) return self._labels_inertia_minibatch(X, sample_weight)[0] + + def get_feature_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. + """ + return _make_feature_names( + n_features=self.n_clusters, + prefix=type(self).__name__.lower()) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index efd546e54fd09..d36f778dea454 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -581,6 +581,7 @@ def get_feature_names(self, input_features=None): return _make_feature_names(n_features=n_features, prefix=type(self).__name__.lower()) + class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): """Quadratic Discriminant Analysis diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 37cdd93bacb13..eef573d57feae 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -758,3 +758,20 @@ def fit_transform(self, X, y=None): def _more_tags(self): return {'allow_nan': True, 'X_types': ['2darray', 'string']} + + def get_feature_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. 
+ """ + return _make_feature_names( + n_features=len(self.features_), + prefix=type(self).__name__.lower()) diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 6bf8da3f4ef5e..393577d0a70dd 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -10,6 +10,7 @@ from ._unsupervised import NearestNeighbors from ..base import TransformerMixin from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils._feature_names import _make_feature_names def _check_params(X, metric, p, metric_params): @@ -338,6 +339,23 @@ def fit_transform(self, X, y=None): """ return self.fit(X).transform(X) + def get_feature_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. + """ + return _make_feature_names( + n_features=self.n_samples_fit_, + prefix=type(self).__name__.lower()) + class RadiusNeighborsTransformer(RadiusNeighborsMixin, UnsupervisedMixin, TransformerMixin, NeighborsBase): @@ -478,3 +496,20 @@ def fit_transform(self, X, y=None): The matrix is of CSR format. """ return self.fit(X).transform(X) + + def get_feature_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. 
+ """ + return _make_feature_names( + n_features=self.n_samples_fit_, + prefix=type(self).__name__.lower()) From 5acacedbe6e11230cd4283a65310dc3dffe83a6d Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Wed, 3 Jun 2020 14:13:52 -0400 Subject: [PATCH 023/100] fix up docs --- doc/modules/compose.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index f2204baf913bb..b84453e750628 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -150,8 +150,7 @@ going into each step:: ... ('select', SelectKBest(k=2)), ... ('clf', LogisticRegression())]) >>> pipe.fit(iris.data, iris.target) - Pipeline(memory=None, - steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) + Pipeline(steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) >>> pipe[:-1].get_feature_names() array(['x2', 'x3'], dtype='>> pipe[:-1].get_feature_names(iris.feature_names) - ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] + array(['petal length (cm)', 'petal width (cm)'], dtype='>> column_trans.fit(X) - ColumnTransformer(transformers=[('city_category', OneHotEncoder(dtype='int'), + ColumnTransformer(transformers=[('categories', OneHotEncoder(dtype='int'), ['city']), ('title_bow', CountVectorizer(), 'title')]) From 0353f69aee560f7b849b5f48691021a4ada053e0 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Wed, 3 Jun 2020 14:41:31 -0400 Subject: [PATCH 024/100] make example actually work --- examples/feature_selection/plot_feature_selection_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index 8298e9b24528f..c40d73d89c5ef 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -21,7 +21,7 @@ # import 
some data to play with X, y = make_classification( - n_features=20, n_informative=3, n_redundant=0, n_classes=4, + n_features=20, n_informative=3, n_redundant=0, n_classes=2, n_clusters_per_class=2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) From bb07886418ba93cb91cce68986f856fd7a1b69fc Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 22 Sep 2020 21:29:43 -0400 Subject: [PATCH 025/100] ENH Converts to get_output_names --- doc/glossary.rst | 9 ++ doc/modules/compose.rst | 10 +- doc/modules/feature_extraction.rst | 18 ++-- .../plot_topics_extraction_with_nmf_lda.py | 6 +- .../bicluster/plot_bicluster_newsgroups.py | 2 +- .../plot_column_transformer_mixed_types.py | 2 +- .../plot_feature_selection_pipeline.py | 2 +- ...linear_model_coefficient_interpretation.py | 2 +- .../inspection/plot_permutation_importance.py | 2 +- ...ot_document_classification_20newsgroups.py | 2 +- examples/text/plot_document_clustering.py | 2 +- .../text/plot_hashing_vs_dict_vectorizer.py | 2 +- sklearn/base.py | 20 ++-- sklearn/cluster/_birch.py | 10 +- sklearn/cluster/_kmeans.py | 20 ++-- sklearn/compose/_column_transformer.py | 39 ++++++++ sklearn/discriminant_analysis.py | 10 +- sklearn/ensemble/_forest.py | 9 -- .../feature_extraction/_dict_vectorizer.py | 15 +++ sklearn/feature_extraction/text.py | 15 ++- sklearn/feature_selection/_base.py | 11 ++- sklearn/impute/_base.py | 20 ++-- sklearn/neighbors/_graph.py | 18 ++-- sklearn/pipeline.py | 51 +++++++--- sklearn/preprocessing/_data.py | 20 +++- sklearn/preprocessing/_encoders.py | 23 ++++- sklearn/tests/test_pipeline.py | 96 ++++++++++++------- sklearn/utils/_feature_names.py | 7 +- .../utils/tests/test_make_feature_names.py | 18 ++++ 29 files changed, 316 insertions(+), 145 deletions(-) create mode 100644 sklearn/utils/tests/test_make_feature_names.py diff --git a/doc/glossary.rst b/doc/glossary.rst index 0bf01063d75a7..7cf344bee80e4 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -868,6 
+868,7 @@ Class APIs and Estimator Types * :term:`fit` * :term:`transform` * :term:`get_feature_names` + * :term:`get_output_names` meta-estimator meta-estimators @@ -1236,6 +1237,14 @@ Methods to the names of input columns from which output column names can be generated. By default input features are named x0, x1, .... + ``get_output_names`` + Primarily for :term:`feature extractors`, but also used for other + transformers to provide string names for each column in the output of + the estimator's :term:`transform` method. It outputs a list of + strings and may take a list of strings as input, corresponding + to the names of input columns from which output column names can + be generated. By default input features are named x0, x1, .... + ``get_n_splits`` On a :term:`CV splitter` (not an estimator), returns the number of elements one would get if iterating through the return value of diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 3f64844836e17..fc9415311ef37 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -139,7 +139,7 @@ or by name:: >>> pipe['reduce_dim'] PCA() -To enable model inspection, `Pipeline` has an ``get_feature_names()`` method, +To enable model inspection, `Pipeline` has an ``get_output_names()`` method, just like all transformers. You can use pipeline slicing to get the feature names going into each step:: @@ -151,13 +151,13 @@ going into each step:: ... 
('clf', LogisticRegression())]) >>> pipe.fit(iris.data, iris.target) Pipeline(steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) - >>> pipe[:-1].get_feature_names() + >>> pipe[:-1].get_output_names() array(['x2', 'x3'], dtype='>> pipe[:-1].get_feature_names(iris.feature_names) + >>> pipe[:-1].get_output_names(iris.feature_names) array(['petal length (cm)', 'petal width (cm)'], dtype='>> column_trans.get_feature_names() + >>> column_trans.get_output_names() ['categories__city_London', 'categories__city_Paris', 'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last', diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index e074d93fad7b8..f4900a102bbd2 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -53,7 +53,7 @@ is a traditional numerical feature:: [ 0., 1., 0., 12.], [ 0., 0., 1., 18.]]) - >>> vec.get_feature_names() + >>> vec.get_output_names() ['city=Dubai', 'city=London', 'city=San Francisco', 'temperature'] :class:`DictVectorizer` accepts multiple string values for one @@ -69,7 +69,7 @@ and its year of release. array([[0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 2.003e+03], [1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.011e+03], [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.974e+03]]) - >>> vec.get_feature_names() == ['category=animation', 'category=drama', + >>> vec.get_output_names() == ['category=animation', 'category=drama', ... 'category=family', 'category=thriller', ... 'year'] True @@ -111,7 +111,7 @@ suitable for feeding into a classifier (maybe after being piped into a with 6 stored elements in Compressed Sparse ... 
format> >>> pos_vectorized.toarray() array([[1., 1., 1., 1., 1., 1.]]) - >>> vec.get_feature_names() + >>> vec.get_output_names() ['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the'] As you can imagine, if one extracts such a context around each individual @@ -340,7 +340,7 @@ Each term found by the analyzer during the fit is assigned a unique integer index corresponding to a column in the resulting matrix. This interpretation of the columns can be retrieved as follows:: - >>> vectorizer.get_feature_names() == ( + >>> vectorizer.get_output_names() == ( ... ['and', 'document', 'first', 'is', 'one', ... 'second', 'the', 'third', 'this']) True @@ -406,8 +406,8 @@ however, similar words are useful for prediction, such as in classifying writing style or personality. There are several known issues in our provided 'english' stop word list. It -does not aim to be a general, 'one-size-fits-all' solution as some tasks -may require a more custom solution. See [NQY18]_ for more details. +does not aim to be a general, 'one-size-fits-all' solution as some tasks +may require a more custom solution. See [NQY18]_ for more details. Please take care in choosing a stop word list. Popular stop word lists may include words that are highly informative to @@ -742,7 +742,7 @@ decide better:: >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2)) >>> counts = ngram_vectorizer.fit_transform(['words', 'wprds']) - >>> ngram_vectorizer.get_feature_names() == ( + >>> ngram_vectorizer.get_output_names() == ( ... [' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp']) True >>> counts.toarray().astype(int) @@ -758,7 +758,7 @@ span across words:: >>> ngram_vectorizer.fit_transform(['jumpy fox']) <1x4 sparse matrix of type '<... 'numpy.int64'>' with 4 stored elements in Compressed Sparse ... format> - >>> ngram_vectorizer.get_feature_names() == ( + >>> ngram_vectorizer.get_output_names() == ( ... 
[' fox ', ' jump', 'jumpy', 'umpy ']) True @@ -766,7 +766,7 @@ span across words:: >>> ngram_vectorizer.fit_transform(['jumpy fox']) <1x5 sparse matrix of type '<... 'numpy.int64'>' with 5 stored elements in Compressed Sparse ... format> - >>> ngram_vectorizer.get_feature_names() == ( + >>> ngram_vectorizer.get_output_names() == ( ... ['jumpy', 'mpy f', 'py fo', 'umpy ', 'y fox']) True diff --git a/examples/applications/plot_topics_extraction_with_nmf_lda.py b/examples/applications/plot_topics_extraction_with_nmf_lda.py index 95e4ebadc512b..5d0b6ef7c629d 100644 --- a/examples/applications/plot_topics_extraction_with_nmf_lda.py +++ b/examples/applications/plot_topics_extraction_with_nmf_lda.py @@ -103,7 +103,7 @@ def plot_top_words(model, feature_names, n_top_words, title): print("done in %0.3fs." % (time() - t0)) -tfidf_feature_names = tfidf_vectorizer.get_feature_names() +tfidf_feature_names = tfidf_vectorizer.get_output_names() plot_top_words(nmf, tfidf_feature_names, n_top_words, 'Topics in NMF model (Frobenius norm)') @@ -117,7 +117,7 @@ def plot_top_words(model, feature_names, n_top_words, title): l1_ratio=.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) -tfidf_feature_names = tfidf_vectorizer.get_feature_names() +tfidf_feature_names = tfidf_vectorizer.get_output_names() plot_top_words(nmf, tfidf_feature_names, n_top_words, 'Topics in NMF model (generalized Kullback-Leibler divergence)') @@ -132,5 +132,5 @@ def plot_top_words(model, feature_names, n_top_words, title): lda.fit(tf) print("done in %0.3fs." 
% (time() - t0)) -tf_feature_names = tf_vectorizer.get_feature_names() +tf_feature_names = tf_vectorizer.get_output_names() plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model') diff --git a/examples/bicluster/plot_bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py index 250c22e78f796..e4c019ff2c84e 100644 --- a/examples/bicluster/plot_bicluster_newsgroups.py +++ b/examples/bicluster/plot_bicluster_newsgroups.py @@ -89,7 +89,7 @@ def build_tokenizer(self): time() - start_time, v_measure_score(y_kmeans, y_true))) -feature_names = vectorizer.get_feature_names() +feature_names = vectorizer.get_output_names() document_names = list(newsgroups.target_names[i] for i in newsgroups.target) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 1291a83dce05f..ba2de8cd551db 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -186,7 +186,7 @@ cv_coefs = np.concatenate([cv_pipeline[-1].coef_ for cv_pipeline in cv_results["estimator"]]) fig, ax = plt.subplots() -ax.barh(clf[:-1].get_feature_names(), +ax.barh(clf[:-1].get_output_names(), cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0)) plt.tight_layout() plt.show() diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index 158f777ae573c..69e5196ff6ba3 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -39,5 +39,5 @@ # access and plot the coefficients of the fitted model plt.barh((0, 1, 2), anova_svm[-1].coef_.ravel()) -plt.yticks((0, 1, 2), anova_svm[:-1].get_feature_names()) +plt.yticks((0, 1, 2), anova_svm[:-1].get_output_names()) plt.show() diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py 
b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 93a5b430a3542..72cca2bff0b43 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -208,7 +208,7 @@ feature_names = (model.named_steps['columntransformer'] .named_transformers_['onehotencoder'] - .get_feature_names(input_features=categorical_columns)) + .get_output_names(input_features=categorical_columns)) feature_names = np.concatenate( [feature_names, numerical_columns]) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index d708aa0fd6756..6a0f8a6ba995d 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -124,7 +124,7 @@ ohe = (rf.named_steps['preprocess'] .named_transformers_['cat'] .named_steps['onehot']) -feature_names = ohe.get_feature_names(input_features=categorical_columns) +feature_names = ohe.get_output_names(input_features=categorical_columns) feature_names = np.r_[feature_names, numerical_columns] tree_feature_importances = ( diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py index 1fe4d1624e8b8..7ce06d47c730c 100644 --- a/examples/text/plot_document_classification_20newsgroups.py +++ b/examples/text/plot_document_classification_20newsgroups.py @@ -174,7 +174,7 @@ def size_mb(docs): if opts.use_hashing: feature_names = None else: - feature_names = vectorizer.get_feature_names() + feature_names = vectorizer.get_output_names() if opts.select_chi2: print("Extracting %d best features by a chi-squared test" % diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py index bfcb7e6a5acf4..12e28b9b09de7 100644 --- a/examples/text/plot_document_clustering.py +++ b/examples/text/plot_document_clustering.py @@ -217,7 +217,7 @@ 
def is_interactive(): else: order_centroids = km.cluster_centers_.argsort()[:, ::-1] - terms = vectorizer.get_feature_names() + terms = vectorizer.get_output_names() for i in range(true_k): print("Cluster %d:" % i, end='') for ind in order_centroids[i, :10]: diff --git a/examples/text/plot_hashing_vs_dict_vectorizer.py b/examples/text/plot_hashing_vs_dict_vectorizer.py index 34673990c15be..964f500e8bef8 100644 --- a/examples/text/plot_hashing_vs_dict_vectorizer.py +++ b/examples/text/plot_hashing_vs_dict_vectorizer.py @@ -89,7 +89,7 @@ def token_freqs(doc): vectorizer.fit_transform(token_freqs(d) for d in raw_data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration)) -print("Found %d unique terms" % len(vectorizer.get_feature_names())) +print("Found %d unique terms" % len(vectorizer.get_output_names())) print() print("FeatureHasher on frequency dicts") diff --git a/sklearn/base.py b/sklearn/base.py index 728a6d0fbaed0..4430ae0d3eb0c 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -699,12 +699,12 @@ def fit_transform(self, X, y=None, **fit_params): # fit method of arity 2 (supervised transformation) return self.fit(X, y, **fit_params).transform(X) - def get_feature_names(self, input_features=None): + def get_output_names(self, input_features=None): """Get output feature names. Parameters ---------- - input_features : list of string or None + input_features : list of string or None, default=None String names of the input features. Returns @@ -787,30 +787,30 @@ def fit_predict(self, X, y=None): return self.fit(X).predict(X) -class OneToOneMixin(object): +class OneToOneMixin: """Provides get_feature_names for simple transformers Assumes there's a 1-to-1 correspondence between input features and output features. """ - def get_feature_names(self, input_features=None): - """Get feature names for transformation. + def get_output_names(self, input_features=None): + """Get output feature names for transformation. 
Returns input_features as this transformation doesn't add or drop features. Parameters ---------- - input_features : array-like of string - Input feature names. + input_features : array-like of str or None, default=None + Input features. If None, they are generated as + x0, x1, ..., xn_features. Returns ------- - feature_names : array-like of string - Transformed feature names + feature_names : array-like of str + Transformed feature names. """ - return _make_feature_names(self.n_features_in_, input_features=input_features) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 15a87a4b2d00d..fda06a87b59cd 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -657,17 +657,17 @@ def _global_clustering(self, X=None): if compute_labels: self.labels_ = self.predict(X) - def get_feature_names(self, input_features=None): - """Get output feature names. + def get_output_names(self, input_features=None): + """Get output feature names for transformation. Parameters ---------- - input_features : list of string or None - String names of the input features. + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. Returns ------- - output_feature_names : list of string + output_feature_names : list of str Feature names for transformer output. """ return _make_feature_names( diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 2338834060caa..f95f769488805 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1159,17 +1159,17 @@ def score(self, X, y=None, sample_weight=None): return -_labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[1] - def get_feature_names(self, input_features=None): - """Get output feature names. + def get_output_names(self, input_features=None): + """Get output feature names for transformation. 
Parameters ---------- - input_features : list of string or None - String names of the input features. + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. Returns ------- - output_feature_names : list of string + output_feature_names : list of str Feature names for transformer output. """ return _make_feature_names( @@ -1897,17 +1897,17 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) return self._labels_inertia_minibatch(X, sample_weight)[0] - def get_feature_names(self, input_features=None): - """Get output feature names. + def get_output_names(self, input_features=None): + """Get output feature names for transformation. Parameters ---------- - input_features : list of string or None - String names of the input features. + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. Returns ------- - output_feature_names : list of string + output_feature_names : list of str Feature names for transformer output. """ return _make_feature_names( diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 803a0efb05c43..b8c0c5622b8a2 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -386,6 +386,45 @@ def get_feature_names(self): more_names]) return feature_names + def get_output_names(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Transformed feature names. 
+ """ + check_is_fitted(self) + feature_names = [] + for name, trans, column, _ in self._iter(fitted=True): + if trans == 'drop' or ( + hasattr(column, '__len__') and not len(column)): + continue + if trans == 'passthrough': + if hasattr(self, '_df_columns'): + if ((not isinstance(column, slice)) + and all(isinstance(col, str) for col in column)): + feature_names.extend(column) + else: + feature_names.extend(self._df_columns[column]) + else: + indices = np.arange(self._n_features) + feature_names.extend(['x%d' % i for i in indices[column]]) + continue + if not hasattr(trans, 'get_output_names'): + raise AttributeError("Transformer %s (type %s) does not " + "provide get_output_names." + % (str(name), type(trans).__name__)) + more_names = trans.get_output_names(input_features=column) + feature_names.extend([name + "__" + f for f in + more_names]) + return feature_names + def _update_fitted_transformers(self, transformers): # transformers are fitted; excludes 'drop' cases fitted_transformers = iter(transformers) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index b6d779199d392..85e93d3148b1d 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -657,17 +657,17 @@ def decision_function(self, X): # Only override for the doc return super().decision_function(X) - def get_feature_names(self, input_features=None): - """Get output feature names. + def get_output_names(self, input_features=None): + """Get output feature names for transformation. Parameters ---------- - input_features : list of string or None - String names of the input features. + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. Returns ------- - output_feature_names : list of string + output_feature_names : list of str Feature names for transformer output. 
""" n_components = self.n_components or np.inf diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d04c2ee5b06c6..81fc319fdfadb 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2374,12 +2374,3 @@ def transform(self, X): """ check_is_fitted(self) return self.one_hot_encoder_.transform(self.apply(X)) - - def get_feature_names(self, input_features=None): - """Feature names - not implemented yet. - - Parameters - ---------- - input_features : list of strings or None - """ - return None diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index e0516407c205a..376e2f6637b8a 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -376,6 +376,21 @@ def get_feature_names(self): """ return self.feature_names_ + def get_output_names(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return self.feature_names_ + def restrict(self, support, indices=False): """Restrict the features to those in support using feature selection. diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 7d5ea4fc4b126..8c25598bca1e0 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1283,9 +1283,22 @@ def get_feature_names(self): feature_names : list A list of feature names. """ + return self.get_output_names() - self._check_vocabulary() + def get_output_names(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. 
+ Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + self._check_vocabulary() return [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))] diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 08415e2ca4731..c0bb1dba61f8a 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -126,14 +126,19 @@ def inverse_transform(self, X): Xt[:, support] = X return Xt - def get_feature_names(self, input_features=None): + def get_output_names(self, input_features=None): """Mask feature names according to selected features. Parameters ---------- - input_features : list of string or None - Input features to select from. If none, they are generated as + input_features : list of str or None, default=None + Input features to select from. If None, they are generated as x0, x1, ..., xn. + + Returns + ------- + output_feature_names : ndarray of str + Feature names for transformer output. """ mask = self.get_support() input_features = _make_feature_names(mask.shape[0], diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index a5763f1c64ae4..cba599e522557 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -494,18 +494,18 @@ def transform(self, X): def _more_tags(self): return {'allow_nan': True} - def get_feature_names(self, input_features=None): - """Get feature names for transformation. + def get_output_names(self, input_features=None): + """Get output feature names for transformation. Parameters ---------- - input_features : array-like of string + input_features : array-like of str Input feature names. Returns ------- - feature_names : array-like of string - Transformed feature names + feature_names : ndarray of str + Transformed feature names. 
""" check_is_fitted(self, 'statistics_') input_features = _make_feature_names(self.statistics_.shape[0], @@ -871,17 +871,17 @@ def _more_tags(self): "preserves_dtype": [], } - def get_feature_names(self, input_features=None): - """Get output feature names. + def get_output_names(self, input_features=None): + """Get output feature names for transformation. Parameters ---------- - input_features : list of string or None - String names of the input features. + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. Returns ------- - output_feature_names : list of string + output_feature_names : list of str Feature names for transformer output. """ return _make_feature_names( diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 01835ab81b08b..287a8c7304d48 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -371,17 +371,17 @@ def fit_transform(self, X, y=None): """ return self.fit(X).transform(X) - def get_feature_names(self, input_features=None): - """Get output feature names. + def get_output_names(self, input_features=None): + """Get output feature names for transformation. Parameters ---------- - input_features : list of string or None - String names of the input features. + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. Returns ------- - output_feature_names : list of string + output_feature_names : list of str Feature names for transformer output. """ return _make_feature_names( @@ -562,17 +562,17 @@ def fit_transform(self, X, y=None): """ return self.fit(X).transform(X) - def get_feature_names(self, input_features=None): + def get_output_names(self, input_features=None): """Get output feature names. Parameters ---------- - input_features : list of string or None - String names of the input features. 
+ input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. Returns ------- - output_feature_names : list of string + output_feature_names : list of str Feature names for transformer output. """ return _make_feature_names( diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6ac48ee5e7146..0ba67d0fce881 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -19,7 +19,7 @@ from .base import clone, TransformerMixin from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import if_delegate_has_method -from .utils import Bunch, _print_elapsed_time +from .utils import Bunch, _print_elapsed_time, deprecated from .utils.validation import check_memory from .utils.validation import _deprecate_positional_args from .utils.fixes import delayed @@ -625,8 +625,8 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) - def get_feature_names(self, input_features=None): - """Get feature names for transformation. + def get_output_names(self, input_features=None): + """Get output feature names for transformation. Transform input features using the pipeline. If the last step is a transformer, it's included @@ -634,8 +634,8 @@ def get_feature_names(self, input_features=None): Parameters ---------- - input_features : array-like of string - Input feature names. + input_features : array-like of str or None, default=None + Input features. Returns ------- @@ -643,17 +643,14 @@ def get_feature_names(self, input_features=None): Transformed feature names """ feature_names = input_features - for i, name, transform in self._iter(): - if not hasattr(transform, "get_feature_names"): + for _, name, transform in self._iter(): + if not hasattr(transform, "get_output_names"): raise TypeError( - "Estimator {} does provide get_feature_names. 
" - "Did you mean to call Pipeline[:-1].get_feature_names" + "Estimator {} does provide get_output_names. " + "Did you mean to call Pipeline[:-1].get_output_names" "()?".format(name)) - try: - feature_names = transform.get_feature_names( - input_features=feature_names) - except TypeError: - feature_names = transform.get_feature_names() + feature_names = transform.get_output_names( + input_features=feature_names) return feature_names @property @@ -935,6 +932,8 @@ def _iter(self): for name, trans in self.transformer_list if trans != 'drop') + @deprecated("get_feature_names is deprecated in 0.24 and will be removed " + "in 0.26. You can use get_output_names instead") def get_feature_names(self): """Get feature names from all transformers. @@ -953,6 +952,30 @@ def get_feature_names(self): trans.get_feature_names()]) return feature_names + def get_output_names(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + Returns + ------- + output_feature_names : list of str + Transformed feature names. + """ + feature_names = [] + for name, trans, _ in self._iter(): + if not hasattr(trans, 'get_output_names'): + raise AttributeError("Transformer %s (type %s) does not " + "provide get_output_names." + % (str(name), type(trans).__name__)) + feature_names.extend( + [name + "__" + f for f in + trans.get_output_names(input_features=input_features)]) + return feature_names + def fit(self, X, y=None, **fit_params): """Fit all transformers using X. 
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index ae4c53c2c4660..a1177eca52436 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -23,6 +23,7 @@ from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var +from ..utils._feature_names import _make_feature_names from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2) from ..utils.sparsefuncs import (inplace_column_scale, @@ -1630,9 +1631,24 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ + return self.get_output_names(input_features=input_features) + + def get_output_names(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, they are generated as + x0, x1, ..., xn_features. + + Returns + ------- + feature_names : array-like of str + Transformed feature names. + """ powers = self.powers_ - if input_features is None: - input_features = ['x%d' % i for i in range(powers.shape[1])] + input_features = _make_feature_names(n_features=powers.shape[1]) feature_names = [] for row in powers: inds = np.where(row)[0] diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a1f762110032f..356a94068c1b8 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -573,6 +573,26 @@ def get_feature_names(self, input_features=None): output_feature_names : ndarray of shape (n_output_features,) Array of feature names. """ + feature_names = self.get_output_names(input_features=input_features) + return np.array(feature_names, dtype=object) + + def get_output_names(self, input_features=None): + """Get output feature names for transformation. 
+ + Returns input_features as this transformation + doesn't add or drop features. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, they are generated as + x0, x1, ..., xn_features. + + Returns + ------- + feature_names : array-like of str + Transformed feature names. + """ check_is_fitted(self) cats = self.categories_ if input_features is None: @@ -590,8 +610,7 @@ def get_feature_names(self, input_features=None): if self.drop_idx_ is not None and self.drop_idx_[i] is not None: names.pop(self.drop_idx_[i]) feature_names.extend(names) - - return np.array(feature_names, dtype=object) + return feature_names class OrdinalEncoder(_BaseEncoder): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 9933f0592858b..1595bbd844ddb 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -858,12 +858,16 @@ def test_feature_union_parallel(): ) -def test_feature_union_feature_names(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_feature_union_feature_names(get_names): word_vect = CountVectorizer(analyzer="word") char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3)) ft = FeatureUnion([("chars", char_vect), ("words", word_vect)]) ft.fit(JUNK_FOOD_DOCS) - feature_names = ft.get_feature_names() + feature_names = getattr(ft, get_names)() for feat in feature_names: assert "chars__" in feat or "words__" in feat assert len(feature_names) == 35 @@ -871,7 +875,7 @@ def test_feature_union_feature_names(): ft = FeatureUnion([("tr1", Transf())]).fit([[1]]) assert_raise_message(AttributeError, 'Transformer tr1 (type Transf) does not provide ' - 'get_feature_names', ft.get_feature_names) + f'{get_names}', getattr(ft, get_names)) def test_classes_property(): @@ -888,58 +892,78 @@ def test_classes_property(): assert_array_equal(clf.classes_, np.unique(y)) -def test_set_feature_union_steps(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_set_feature_union_steps(get_names): mult2 = Mult(2) - mult2.get_feature_names = lambda: ['x2'] mult3 = Mult(3) - mult3.get_feature_names = lambda: ['x3'] mult5 = Mult(5) - mult5.get_feature_names = lambda: ['x5'] + + if get_names == "get_feature_names": + mult3.get_feature_names = lambda: ['x3'] + mult2.get_feature_names = lambda: ['x2'] + mult5.get_feature_names = lambda: ['x5'] + else: # get_output_names + mult3.get_output_names = lambda input_features: ['x3'] + mult2.get_output_names = lambda input_features: ['x2'] + mult5.get_output_names = lambda input_features: ['x5'] ft = FeatureUnion([('m2', mult2), ('m3', mult3)]) assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]]))) - assert ['m2__x2', 'm3__x3'] == ft.get_feature_names() + assert ['m2__x2', 'm3__x3'] == getattr(ft, get_names)() # Directly setting attr ft.transformer_list = [('m5', mult5)] assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) - assert ['m5__x5'] == ft.get_feature_names() + assert ['m5__x5'] == getattr(ft, get_names)() # Using set_params ft.set_params(transformer_list=[('mock', mult3)]) assert_array_equal([[3]], ft.transform(np.asarray([[1]]))) - assert ['mock__x3'] == ft.get_feature_names() + assert ['mock__x3'] == getattr(ft, get_names)() # Using set_params to replace single step ft.set_params(mock=mult5) assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) - assert ['mock__x5'] == ft.get_feature_names() + assert ['mock__x5'] == getattr(ft, get_names)() -def test_set_feature_union_step_drop(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_set_feature_union_step_drop(get_names): mult2 = Mult(2) - mult2.get_feature_names = lambda: ['x2'] mult3 = Mult(3) - mult3.get_feature_names = lambda: ['x3'] + + if get_names == "get_feature_names": + mult2.get_feature_names = lambda: ['x2'] + mult3.get_feature_names = lambda: ['x3'] + else: # get_output_names + mult2.get_output_names = lambda input_features: ['x2'] + mult3.get_output_names = lambda input_features: ['x3'] + X = np.asarray([[1]]) ft = FeatureUnion([('m2', mult2), ('m3', mult3)]) assert_array_equal([[2, 3]], ft.fit(X).transform(X)) assert_array_equal([[2, 3]], ft.fit_transform(X)) - assert ['m2__x2', 'm3__x3'] == ft.get_feature_names() + assert ['m2__x2', 'm3__x3'] == getattr(ft, get_names)() with pytest.warns(None) as record: ft.set_params(m2='drop') assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) - assert ['m3__x3'] == ft.get_feature_names() + assert ['m3__x3'] == getattr(ft, get_names)() assert not record with pytest.warns(None) as record: ft.set_params(m3='drop') assert_array_equal([[]], ft.fit(X).transform(X)) assert_array_equal([[]], ft.fit_transform(X)) - assert [] == ft.get_feature_names() + assert [] == getattr(ft, get_names)() assert not record with pytest.warns(None) as record: @@ -953,7 +977,7 @@ def test_set_feature_union_step_drop(): ft = FeatureUnion([('m2', 'drop'), ('m3', mult3)]) assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) - assert ['m3__x3'] == ft.get_feature_names() + assert ['m3__x3'] == getattr(ft, get_names)() assert not record @@ -1124,20 +1148,20 @@ def test_feature_names_basic(): ('select', SelectKBest(k=2)), ('clf', LogisticRegression())]) with pytest.raises(NotFittedError): - pipe.get_feature_names() + pipe.get_output_names() iris = load_iris() pipe.fit(iris.data, iris.target) 
xs = np.array(['x0', 'x1', 'x2', 'x3']) - assert_array_equal(pipe[:1].get_feature_names(), xs) + assert_array_equal(pipe[:1].get_output_names(), xs) mask = pipe.named_steps.select.get_support() - assert_array_equal(pipe[:-1].get_feature_names(), xs[mask]) + assert_array_equal(pipe[:-1].get_output_names(), xs[mask]) with pytest.raises( TypeError, - match="Estimator clf does provide get_feature_names."): - pipe.get_feature_names(iris.feature_names) - assert_array_equal(pipe[:1].get_feature_names(iris.feature_names), + match="Estimator clf does provide get_output_names."): + pipe.get_output_names(iris.feature_names) + assert_array_equal(pipe[:1].get_output_names(iris.feature_names), iris.feature_names) - assert_array_equal(pipe[:-1].get_feature_names(iris.feature_names), + assert_array_equal(pipe[:-1].get_output_names(iris.feature_names), np.array(iris.feature_names)[mask]) pipe = Pipeline(steps=[ ('scaler', StandardScaler()), @@ -1145,9 +1169,9 @@ def test_feature_names_basic(): ('select', SelectKBest(k=2)), ('clf', LogisticRegression())]) pipe.fit(iris.data, iris.target) - assert_array_equal(pipe[:-1].get_feature_names(), ['pca0', 'pca1']) + assert_array_equal(pipe[:-1].get_output_names(), ['pca0', 'pca1']) # setting names doesn't change names after PCA - assert_array_equal(pipe[:-2].get_feature_names(iris.feature_names), + assert_array_equal(pipe[:-2].get_output_names(iris.feature_names), ['pca0', 'pca1', 'pca2']) @@ -1163,7 +1187,7 @@ def test_input_feature_names_pandas(): pipe.fit(df, iris.target) mask = pipe.named_steps.select.get_support() # for now assuming we have to pass these explicitly - assert_array_equal(pipe[:-1].get_feature_names(iris.feature_names), + assert_array_equal(pipe[:-1].get_output_names(iris.feature_names), np.array(iris.feature_names)[mask]) @@ -1176,8 +1200,8 @@ def test_features_names_passthrough(): iris = load_iris() pipe.fit(iris.data, iris.target) xs = ['x0', 'x1', 'x2', 'x3'] - assert_array_equal(pipe[:-1].get_feature_names(), xs) - 
assert_array_equal(pipe[:-1].get_feature_names(iris.feature_names), + assert_array_equal(pipe[:-1].get_output_names(), xs) + assert_array_equal(pipe[:-1].get_output_names(iris.feature_names), iris.feature_names) @@ -1187,9 +1211,9 @@ def test_feature_names_count_vectorizer(): ('clf', LogisticRegression())]) y = ["pizza" in x for x in JUNK_FOOD_DOCS] pipe.fit(JUNK_FOOD_DOCS, y) - assert_array_equal(pipe[:-1].get_feature_names(), + assert_array_equal(pipe[:-1].get_output_names(), ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) - assert_array_equal(pipe[:-1].get_feature_names("nonsense_is_ignored"), + assert_array_equal(pipe[:-1].get_output_names("nonsense_is_ignored"), ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) @@ -1202,9 +1226,9 @@ def test_feature_names_nested(): xs = np.array(['x0', 'x1', 'x2', 'x3']) mask = pipe.named_steps.inner_pipe.named_steps.select.get_support() assert_array_equal( - pipe.named_steps.inner_pipe[:1].get_feature_names(), xs[mask]) + pipe.named_steps.inner_pipe[:1].get_output_names(), xs[mask]) assert_array_equal( - pipe.named_steps.inner_pipe[:1].get_feature_names(iris.feature_names), + pipe.named_steps.inner_pipe[:1].get_output_names(iris.feature_names), np.array(iris.feature_names)[mask]) @@ -1218,8 +1242,8 @@ def test_feature_names_meta_pipe(): # check 0ths estimator in OVR only inner_pipe = pipe['ovr'].estimators_[0] mask = inner_pipe['select'].get_support() - assert_array_equal(inner_pipe[:-1].get_feature_names(), xs[mask]) - assert_array_equal(inner_pipe[:-1].get_feature_names(iris.feature_names), + assert_array_equal(inner_pipe[:-1].get_output_names(), xs[mask]) + assert_array_equal(inner_pipe[:-1].get_output_names(iris.feature_names), np.array(iris.feature_names)[mask]) diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py index 402fda2fe17cb..e8738d458f58f 100644 --- a/sklearn/utils/_feature_names.py +++ b/sklearn/utils/_feature_names.py @@ -11,15 +11,14 @@ def 
_make_feature_names(n_features, prefix='x', input_features=None): Number of feature names to generate prefix : str, default='x' Prefix for each feature name. - input_features : array-like of string + input_features : array-like of str Optional existing input features, returned unchanged if not None. Returns ------- - feature_names : list of str + feature_names : array-like of str Generated feature names of length n_features. """ if input_features is not None: return input_features - return ["{}{}".format(prefix, i) - for i in range(n_features)] + return [f"{prefix}{i}" for i in range(n_features)] diff --git a/sklearn/utils/tests/test_make_feature_names.py b/sklearn/utils/tests/test_make_feature_names.py new file mode 100644 index 0000000000000..852299349a57d --- /dev/null +++ b/sklearn/utils/tests/test_make_feature_names.py @@ -0,0 +1,18 @@ +import pytest +from numpy.testing import assert_array_equal +from sklearn.utils._feature_names import _make_feature_names + + +@pytest.mark.parametrize( + "n_features, prefix, input_features, expected_names", + [ + (3, 'x', None, ['x0', 'x1', 'x2']), + (4, 'x', ['cat', 'dog', 'snake'], ['cat', 'dog', 'snake']), + (4, 'pca', None, ['pca0', 'pca1', 'pca2', 'pca3']) + ]) +def test_make_feature_names(n_features, prefix, input_features, + expected_names): + feature_names = _make_feature_names(n_features=n_features, + prefix=prefix, + input_features=input_features) + assert_array_equal(expected_names, feature_names) From 4e0968ceab0210f0a5be41a3a14bdefb4adfe6dd Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 23 Sep 2020 10:50:00 -0400 Subject: [PATCH 026/100] CLN Move deprecations --- sklearn/base.py | 17 ++--- sklearn/cluster/_agglomerative.py | 17 +++++ sklearn/compose/_column_transformer.py | 3 + .../compose/tests/test_column_transformer.py | 71 ++++++++++++------- sklearn/datasets/descr/twenty_newsgroups.rst | 4 +- .../feature_extraction/_dict_vectorizer.py | 4 +- .../tests/test_dict_vectorizer.py | 24 +++++-- sklearn/feature_extraction/tests/test_text.py | 48 +++++++++---- sklearn/feature_extraction/text.py | 6 +- sklearn/preprocessing/_encoders.py | 2 +- sklearn/preprocessing/tests/test_data.py | 12 ++-- sklearn/preprocessing/tests/test_encoders.py | 42 +++++++---- sklearn/random_projection.py | 17 +++++ sklearn/tests/test_pipeline.py | 11 +++ sklearn/utils/estimator_checks.py | 4 +- 15 files changed, 198 insertions(+), 84 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 4430ae0d3eb0c..578e3716c628b 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -704,28 +704,19 @@ def get_output_names(self, input_features=None): Parameters ---------- - input_features : list of string or None, default=None - String names of the input features. + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. Returns ------- - output_feature_names : list of string + output_feature_names : list of str Feature names for transformer output. """ # generate feature names from class name by default # would be much less guessing if we stored the number # of output features. # Ideally this would be done in each class. 
- if hasattr(self, 'n_clusters'): - # this is before n_components_ - # because n_components_ means something else - # in agglomerative clustering - n_features = self.n_clusters - elif hasattr(self, '_max_components'): - # special case for LinearDiscriminantAnalysis - n_components = self.n_components or np.inf - n_features = min(self._max_components, n_components) - elif hasattr(self, 'n_components_'): + if hasattr(self, 'n_components_'): # n_components could be auto or None # this is more likely to be an int n_features = self.n_components_ diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 66342797e33b5..a0bd1724b0768 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -20,6 +20,7 @@ from ..neighbors._dist_metrics import METRIC_MAPPING from ..utils import check_array from ..utils._fast_dict import IntFloatDict +from ..utils._feature_names import _make_feature_names from ..utils.fixes import _astype_copy_false from ..utils.validation import _deprecate_positional_args, check_memory # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' @@ -945,6 +946,22 @@ def fit_predict(self, X, y=None): """ return super().fit_predict(X, y) + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.n_clusters, + prefix=type(self).__name__.lower()) + class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): """Agglomerate features. 
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index b8c0c5622b8a2..a46560412f85b 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -25,6 +25,7 @@ from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted from ..utils.validation import _deprecate_positional_args +from ..utils.deprecation import deprecated from ..utils.fixes import delayed @@ -349,6 +350,8 @@ def named_transformers_(self): return Bunch(**{name: trans for name, trans, _ in self.transformers_}) + @deprecated("get_feature_names is deprecated in 0.24 and will be removed " + "in 0.26. You can use get_output_names instead") def get_feature_names(self): """Get feature names from all transformers. diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 584842d3990d6..c5ff037cb5f64 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -662,17 +662,21 @@ def test_column_transformer_cloning(): assert hasattr(ct.transformers_[0][1], 'mean_') -def test_column_transformer_get_feature_names(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_column_transformer_get_feature_names(get_names): X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T ct = ColumnTransformer([('trans', Trans(), [0, 1])]) # raise correct error when not fitted with pytest.raises(NotFittedError): - ct.get_feature_names() + getattr(ct, get_names)() # raise correct error when no feature names are available ct.fit(X_array) assert_raise_message(AttributeError, "Transformer trans (type Trans) does not provide " - "get_feature_names", ct.get_feature_names) + f"{get_names}", getattr(ct, get_names)) # if some transformers support and some don't ct = ColumnTransformer([('trans', Trans(), [0, 1]), @@ -680,7 +684,7 @@ def test_column_transformer_get_feature_names(): ct.fit(X_array) assert_raise_message(AttributeError, "Transformer trans (type Trans) does not provide " - "get_feature_names", ct.get_feature_names) + f"{get_names}", getattr(ct, get_names)) # inside a pipeline make_pipeline(ct).fit(X_array) @@ -691,46 +695,50 @@ def test_column_transformer_get_feature_names(): ct = ColumnTransformer( [('col' + str(i), DictVectorizer(), i) for i in range(2)]) ct.fit(X) - assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1__c'] + assert getattr(ct, get_names)() == ['col0__a', 'col0__b', 'col1__c'] # drop transformer ct = ColumnTransformer( [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) ct.fit(X) - assert ct.get_feature_names() == ['col0__a', 'col0__b'] + assert getattr(ct, get_names)() == ['col0__a', 'col0__b'] # passthrough transformer ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) ct.fit(X) - assert ct.get_feature_names() == ['x0', 'x1'] + assert getattr(ct, get_names)() == ['x0', 'x1'] ct = ColumnTransformer([('trans', DictVectorizer(), 0)], remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['trans__a', 'trans__b', 'x1'] + assert getattr(ct, get_names)() 
== ['trans__a', 'trans__b', 'x1'] ct = ColumnTransformer([('trans', 'passthrough', [1])], remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['x1', 'x0'] + assert getattr(ct, get_names)() == ['x1', 'x0'] ct = ColumnTransformer([('trans', 'passthrough', lambda x: [1])], remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['x1', 'x0'] + assert getattr(ct, get_names)() == ['x1', 'x0'] ct = ColumnTransformer([('trans', 'passthrough', np.array([False, True]))], remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['x1', 'x0'] + assert getattr(ct, get_names)() == ['x1', 'x0'] ct = ColumnTransformer([('trans', 'passthrough', slice(1, 2))], remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['x1', 'x0'] + assert getattr(ct, get_names)() == ['x1', 'x0'] -def test_column_transformer_get_feature_names_dataframe(): +# TODO: Remove in 0.26 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_column_transformer_get_feature_names_dataframe(get_names): # passthough transformer with a dataframe pd = pytest.importorskip('pandas') X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], @@ -739,42 +747,42 @@ def test_column_transformer_get_feature_names_dataframe(): ct = ColumnTransformer([('trans', 'passthrough', ['col0', 'col1'])]) ct.fit(X_df) - assert ct.get_feature_names() == ['col0', 'col1'] + assert getattr(ct, get_names)() == ['col0', 'col1'] ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) ct.fit(X_df) - assert ct.get_feature_names() == ['col0', 'col1'] + assert getattr(ct, get_names)() == ['col0', 'col1'] ct = ColumnTransformer([('col0', DictVectorizer(), 0)], remainder='passthrough') ct.fit(X_df) - assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1'] + assert getattr(ct, get_names)() == ['col0__a', 'col0__b', 'col1'] ct = ColumnTransformer([('trans', 
'passthrough', ['col1'])], remainder='passthrough') ct.fit(X_df) - assert ct.get_feature_names() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ['col1', 'col0'] ct = ColumnTransformer([('trans', 'passthrough', lambda x: x[['col1']].columns)], remainder='passthrough') ct.fit(X_df) - assert ct.get_feature_names() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ['col1', 'col0'] ct = ColumnTransformer([('trans', 'passthrough', np.array([False, True]))], remainder='passthrough') ct.fit(X_df) - assert ct.get_feature_names() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ['col1', 'col0'] ct = ColumnTransformer([('trans', 'passthrough', slice(1, 2))], remainder='passthrough') ct.fit(X_df) - assert ct.get_feature_names() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ['col1', 'col0'] ct = ColumnTransformer([('trans', 'passthrough', [1])], remainder='passthrough') ct.fit(X_df) - assert ct.get_feature_names() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ['col1', 'col0'] def test_column_transformer_special_strings(): @@ -1375,11 +1383,15 @@ def test_make_column_selector_pickle(): assert_array_equal(selector(X_df), selector_picked(X_df)) +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) @pytest.mark.parametrize( 'empty_col', [[], np.array([], dtype=int), lambda x: []], ids=['list', 'array', 'callable'] ) -def test_feature_names_empty_columns(empty_col): +def test_feature_names_empty_columns(empty_col, get_names): pd = pytest.importorskip('pandas') df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) @@ -1392,8 +1404,8 @@ def test_feature_names_empty_columns(empty_col): ) ct.fit(df) - assert ct.get_feature_names() == ['ohe__col1_a', 'ohe__col1_b', - 'ohe__col2_z'] + assert getattr(ct, get_names)() == ['ohe__col1_a', 'ohe__col1_b', + 'ohe__col2_z'] @pytest.mark.parametrize('remainder', ["passthrough", StandardScaler()]) @@ -1446,3 +1458,14 @@ def test_sk_visual_block_remainder_fitted_numpy(remainder): assert visual_block.names == ('scale', 'remainder') assert visual_block.name_details == ([0, 2], [1]) assert visual_block.estimators == (scaler, remainder) + + +# TODO: Remove in 0.26 when get_feature_names is removed +def test_column_transformers_get_feature_names_deprecated(): + X = np.array([[0, 1], [2, 4]]) + ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) + ct.fit(X) + + msg = "get_feature_names is deprecated in 0.24" + with pytest.warns(FutureWarning, match=msg): + ct.get_feature_names() diff --git a/sklearn/datasets/descr/twenty_newsgroups.rst b/sklearn/datasets/descr/twenty_newsgroups.rst index 6318028b8afa3..40f5d319acc63 100644 --- a/sklearn/datasets/descr/twenty_newsgroups.rst +++ b/sklearn/datasets/descr/twenty_newsgroups.rst @@ -116,7 +116,7 @@ components by sample in a more than 30000-dimensional space >>> vectors.nnz / float(vectors.shape[0]) 159.01327... -:func:`sklearn.datasets.fetch_20newsgroups_vectorized` is a function which +:func:`sklearn.datasets.fetch_20newsgroups_vectorized` is a function which returns ready-to-use token counts features instead of file names. .. 
_`20 newsgroups website`: http://people.csail.mit.edu/jrennie/20Newsgroups/ @@ -156,7 +156,7 @@ Let's take a look at what the most informative features are: >>> import numpy as np >>> def show_top10(classifier, vectorizer, categories): - ... feature_names = np.asarray(vectorizer.get_feature_names()) + ... feature_names = np.asarray(vectorizer.get_output_names()) ... for i, category in enumerate(categories): ... top10 = np.argsort(classifier.coef_[i])[-10:] ... print("%s: %s" % (category, " ".join(feature_names[top10]))) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 376e2f6637b8a..23ebaabb1a6b3 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -416,11 +416,11 @@ def restrict(self, support, indices=False): >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] >>> X = v.fit_transform(D) >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1]) - >>> v.get_feature_names() + >>> v.get_output_names() ['bar', 'baz', 'foo'] >>> v.restrict(support.get_support()) DictVectorizer() - >>> v.get_feature_names() + >>> v.get_output_names() ['bar', 'foo'] """ if not indices: diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 519201b580598..32df3a7555203 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -43,7 +43,11 @@ def test_dictvectorizer(sparse, dtype, sort, iterable): sorted(v.feature_names_)) -def test_feature_selection(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_feature_selection(get_names): # make two feature dicts with two useful features and a bunch of useless # ones, in terms of chi2 d1 = dict([("useless%d" % i, 10) for i in range(20)], @@ -57,10 +61,14 @@ def test_feature_selection(): sel = SelectKBest(chi2, k=2).fit(X, [0, 1]) v.restrict(sel.get_support(indices=indices), indices=indices) - assert v.get_feature_names() == ["useful1", "useful2"] + assert getattr(v, get_names)() == ["useful1", "useful2"] -def test_one_of_k(): +# TODO: Remove in 0.26 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_one_of_k(get_names): D_in = [{"version": "1", "ham": 2}, {"version": "2", "spam": .3}, {"version=3": True, "spam": -1}] @@ -71,12 +79,16 @@ def test_one_of_k(): D_out = v.inverse_transform(X) assert D_out[0] == {"version=1": 1, "ham": 2} - names = v.get_feature_names() + names = getattr(v, get_names)() assert "version=2" in names assert "version" not in names -def test_iterable_value(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_iterable_value(get_names): D_names = ['ham', 'spam', 'version=1', 'version=2', 'version=3'] X_expected = [[2.0, 0.0, 2.0, 1.0, 0.0], [0.0, 0.3, 0.0, 1.0, 0.0], @@ -92,7 +104,7 @@ def test_iterable_value(): D_out = v.inverse_transform(X) assert D_out[0] == {"version=1": 2, "version=2": 1, "ham": 2} - names = v.get_feature_names() + names = getattr(v, get_names)() assert names == D_names diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 733785476a66c..e7e8d52cbb6d2 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -560,12 +560,16 @@ def test_hashing_vectorizer(): assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0) -def test_feature_names(): +# TODO: Remove in 0.26 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_feature_names(get_names): cv = CountVectorizer(max_df=0.5) # test for Value error on unfitted/empty vocabulary with pytest.raises(ValueError): - cv.get_feature_names() + getattr(cv, get_names)() assert not cv.fixed_vocabulary_ # test for vocabulary learned from data @@ -573,7 +577,7 @@ def test_feature_names(): n_samples, n_features = X.shape assert len(cv.vocabulary_) == n_features - feature_names = cv.get_feature_names() + feature_names = getattr(cv, get_names)() assert len(feature_names) == n_features assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza', 'salad', 'sparkling', 'tomato', 'water'], @@ -587,7 +591,7 @@ def test_feature_names(): 'salad', 'sparkling', 'tomato', 'water'] cv = CountVectorizer(vocabulary=vocab) - feature_names = cv.get_feature_names() + feature_names = getattr(cv, get_names)() assert_array_equal(['beer', 'burger', 
'celeri', 'coke', 'pizza', 'salad', 'sparkling', 'tomato', 'water'], feature_names) assert cv.fixed_vocabulary_ @@ -609,7 +613,11 @@ def test_vectorizer_max_features(Vectorizer): assert vectorizer.stop_words_ == expected_stop_words -def test_count_vectorizer_max_features(): +# TODO: Remove in 0.26 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_count_vectorizer_max_features(get_names): # Regression test: max_features didn't work correctly in 0.14. cv_1 = CountVectorizer(max_features=1) @@ -620,9 +628,9 @@ def test_count_vectorizer_max_features(): counts_3 = cv_3.fit_transform(JUNK_FOOD_DOCS).sum(axis=0) counts_None = cv_None.fit_transform(JUNK_FOOD_DOCS).sum(axis=0) - features_1 = cv_1.get_feature_names() - features_3 = cv_3.get_feature_names() - features_None = cv_None.get_feature_names() + features_1 = getattr(cv_1, get_names)() + features_3 = getattr(cv_3, get_names)() + features_None = getattr(cv_None, get_names)() # The most common feature is "the", with frequency 7. assert 7 == counts_1.max() @@ -681,12 +689,16 @@ def test_vectorizer_min_df(): assert len(vect.stop_words_) == 5 -def test_count_binary_occurrences(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_count_binary_occurrences(get_names): # by default multiple occurrences are counted as longs test_data = ['aaabc', 'abbde'] vect = CountVectorizer(analyzer='char', max_df=1.0) X = vect.fit_transform(test_data).toarray() - assert_array_equal(['a', 'b', 'c', 'd', 'e'], vect.get_feature_names()) + assert_array_equal(['a', 'b', 'c', 'd', 'e'], getattr(vect, get_names)()) assert_array_equal([[3, 1, 1, 0, 0], [1, 2, 0, 1, 1]], X) @@ -921,7 +933,11 @@ def test_pickling_built_processors(factory): assert result == expected -def test_countvectorizer_vocab_sets_when_pickling(): +# TODO: Remove in 0.26 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_countvectorizer_vocab_sets_when_pickling(get_names): # ensure that vocabulary of type set is coerced to a list to # preserve iteration ordering after deserialization rng = np.random.RandomState(0) @@ -933,10 +949,14 @@ def test_countvectorizer_vocab_sets_when_pickling(): unpickled_cv = pickle.loads(pickle.dumps(cv)) cv.fit(ALL_FOOD_DOCS) unpickled_cv.fit(ALL_FOOD_DOCS) - assert cv.get_feature_names() == unpickled_cv.get_feature_names() + assert getattr(cv, get_names)() == getattr(unpickled_cv, get_names)() -def test_countvectorizer_vocab_dicts_when_pickling(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_countvectorizer_vocab_dicts_when_pickling(get_names): rng = np.random.RandomState(0) vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza', 'salad', 'sparkling', 'tomato', 'water']) @@ -949,7 +969,7 @@ def test_countvectorizer_vocab_dicts_when_pickling(): unpickled_cv = pickle.loads(pickle.dumps(cv)) cv.fit(ALL_FOOD_DOCS) unpickled_cv.fit(ALL_FOOD_DOCS) - assert cv.get_feature_names() == unpickled_cv.get_feature_names() + assert getattr(cv, get_names)() == getattr(unpickled_cv, get_names)() def test_stop_words_removal(): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 8c25598bca1e0..59fcfac832f36 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -963,7 +963,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): ... ] >>> vectorizer = CountVectorizer() >>> X = vectorizer.fit_transform(corpus) - >>> print(vectorizer.get_feature_names()) + >>> print(vectorizer.get_output_names()) ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] >>> print(X.toarray()) [[0 1 1 1 0 0 1 0 1] @@ -972,7 +972,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): [0 1 1 1 0 0 1 0 1]] >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2)) >>> X2 = vectorizer2.fit_transform(corpus) - >>> print(vectorizer2.get_feature_names()) + >>> print(vectorizer2.get_output_names()) ['and this', 'document is', 'first document', 'is the', 'is this', 'second document', 'the first', 'the second', 'the third', 'third one', 'this document', 'this is', 'this the'] @@ -1719,7 +1719,7 @@ class TfidfVectorizer(CountVectorizer): ... 
] >>> vectorizer = TfidfVectorizer() >>> X = vectorizer.fit_transform(corpus) - >>> print(vectorizer.get_feature_names()) + >>> print(vectorizer.get_output_names()) ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] >>> print(X.shape) (4, 9) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 356a94068c1b8..1537f2b5e3837 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -274,7 +274,7 @@ class OneHotEncoder(_BaseEncoder): >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) array([['Male', 1], [None, 2]], dtype=object) - >>> enc.get_feature_names(['gender', 'group']) + >>> enc.get_output_names(['gender', 'group']) array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], dtype=object) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index c09acf1591532..ebb2ac467079e 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -125,23 +125,27 @@ def test_polynomial_features(): interact.n_input_features_) -def test_polynomial_feature_names(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_polynomial_feature_names(get_names): X = np.arange(30).reshape(10, 3) poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) - feature_names = poly.get_feature_names() + feature_names = getattr(poly, get_names)() assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], feature_names) poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) - feature_names = poly.get_feature_names(["a", "b", "c"]) + feature_names = getattr(poly, get_names)(["a", "b", "c"]) assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', 'b c^2', 'c^3'], feature_names) # test some unicode poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) - feature_names = poly.get_feature_names( + feature_names = getattr(poly, get_names)( ["\u0001F40D", "\u262E", "\u05D0"]) assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], feature_names) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index f030406e070fa..a6e34e1987f2f 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -132,7 +132,11 @@ def test_one_hot_encoder_dtype_pandas(output_dtype): assert_array_equal(oh.fit(X_df).transform(X_df), X_expected) -def test_one_hot_encoder_feature_names(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_one_hot_encoder_feature_names(get_names): enc = OneHotEncoder() X = [['Male', 1, 'girl', 2, 3], ['Female', 41, 'girl', 1, 10], @@ -140,7 +144,7 @@ def test_one_hot_encoder_feature_names(): ['Male', 91, 'girl', 21, 30]] enc.fit(X) - feature_names = enc.get_feature_names() + feature_names = getattr(enc, get_names)() assert isinstance(feature_names, np.ndarray) assert_array_equal(['x0_Female', 'x0_Male', @@ -150,8 +154,8 @@ def test_one_hot_encoder_feature_names(): 'x4_3', 'x4_10', 'x4_30'], feature_names) - feature_names2 = enc.get_feature_names(['one', 'two', - 'three', 'four', 'five']) + feature_names2 = getattr(enc, get_names)(['one', 'two', + 'three', 'four', 'five']) assert_array_equal(['one_Female', 'one_Male', 'two_1', 'two_41', 'two_51', 'two_91', @@ -160,16 +164,20 @@ def test_one_hot_encoder_feature_names(): 'five_3', 'five_10', 'five_30'], feature_names2) with pytest.raises(ValueError, match="input_features should have length"): - enc.get_feature_names(['one', 'two']) + getattr(enc, get_names)(['one', 'two']) -def test_one_hot_encoder_feature_names_unicode(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) +def test_one_hot_encoder_feature_names_unicode(get_names): enc = OneHotEncoder() X = np.array([['c❤t1', 'dat2']], dtype=object).T enc.fit(X) - feature_names = enc.get_feature_names() + feature_names = getattr(enc, get_names)() assert_array_equal(['x0_c❤t1', 'x0_dat2'], feature_names) - feature_names = enc.get_feature_names(input_features=['n👍me']) + feature_names = getattr(enc, get_names)(input_features=['n👍me']) assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names) @@ -269,20 +277,24 @@ def test_one_hot_encoder_inverse_if_binary(): # check that resetting drop option without refitting does not throw an error +# TODO: Remove in 0.26 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) @pytest.mark.parametrize('drop', ['if_binary', 'first', None]) @pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None]) -def test_one_hot_encoder_drop_reset(drop, reset_drop): +def test_one_hot_encoder_drop_reset(get_names, drop, reset_drop): X = np.array([['Male', 1], ['Female', 3], ['Female', 2]], dtype=object) ohe = OneHotEncoder(drop=drop, sparse=False) ohe.fit(X) X_tr = ohe.transform(X) - feature_names = ohe.get_feature_names() + feature_names = getattr(ohe, get_names)() ohe.set_params(drop=reset_drop) assert_array_equal(ohe.inverse_transform(X_tr), X) assert_allclose(ohe.transform(X), X_tr) - assert_array_equal(ohe.get_feature_names(), feature_names) + assert_array_equal(getattr(ohe, get_names)(), feature_names) @pytest.mark.parametrize("method", ['fit', 'fit_transform']) @@ -403,18 +415,22 @@ def test_one_hot_encoder_pandas(): assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]]) +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", + "get_output_names"]) @pytest.mark.parametrize("drop, expected_names", [('first', ['x0_c', 'x2_b']), ('if_binary', ['x0_c', 'x1_2', 'x2_b']), (['c', 2, 'b'], ['x0_b', 'x2_a'])], ids=['first', 'binary', 'manual']) -def test_one_hot_encoder_feature_names_drop(drop, expected_names): +def test_one_hot_encoder_feature_names_drop(get_names, drop, expected_names): X = [['c', 2, 'a'], ['b', 2, 'b']] ohe = OneHotEncoder(drop=drop) ohe.fit(X) - feature_names = ohe.get_feature_names() + feature_names = getattr(ohe, get_names)() assert isinstance(feature_names, np.ndarray) assert_array_equal(expected_names, feature_names) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 4623ac1ab64e4..36da47c93b85d 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -36,6 +36,7 @@ from .base import BaseEstimator, TransformerMixin from .utils import check_random_state +from .utils._feature_names import _make_feature_names from .utils.extmath import safe_sparse_dot from .utils.random import sample_without_replacement from .utils.validation import check_array, check_is_fitted @@ -416,6 +417,22 @@ def transform(self, X): dense_output=self.dense_output) return X_new + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.n_components_, + prefix=type(self).__name__.lower()) + class GaussianRandomProjection(BaseRandomProjection): """Reduce dimensionality through Gaussian random projection. 
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 1595bbd844ddb..83fbc25dbe8c0 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1397,3 +1397,14 @@ def test_feature_union_warns_unknown_transformer_weight(): union = FeatureUnion(transformer_list, transformer_weights=weights) with pytest.raises(ValueError, match=expected_msg): union.fit(X, y) + + +# TODO: Remove in 0.26 when get_feature_names is removed +def test_feature_union_get_feature_names_deprecated(): + msg = "get_feature_names is deprecated in 0.24" + mult2 = Mult(2) + mult2.get_feature_names = lambda: ['x2'] + + ft = FeatureUnion([('m2', mult2)]) + with pytest.warns(FutureWarning, match=msg): + ft.get_feature_names() diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 304862cd2abec..e24cf9aeb4e0f 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1334,8 +1334,8 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): X_pred = transformer_clone.fit_transform(X, y=y_) input_features = ['feature%d' % i for i in range(n_features)] - if hasattr(transformer_clone, 'get_feature_names'): - feature_names = transformer_clone.get_feature_names(input_features) + if hasattr(transformer_clone, 'get_output_names'): + feature_names = transformer_clone.get_output_names(input_features) if feature_names is not None: if isinstance(X_pred, tuple): assert len(feature_names) == X_pred[0].shape[1] From 95046a01737569af5c7cfb7434896ed23364682a Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 23 Sep 2020 10:56:44 -0400 Subject: [PATCH 027/100] WIP Deprecates dictvect get_feature_names --- sklearn/feature_extraction/_dict_vectorizer.py | 3 +++ .../feature_extraction/tests/test_dict_vectorizer.py | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 23ebaabb1a6b3..4ac0df4e1325d 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -13,6 +13,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, tosequence from ..utils.validation import _deprecate_positional_args +from ..utils.deprecation import deprecated def _tosequence(X): @@ -368,6 +369,8 @@ def transform(self, X): return Xa + @deprecated("get_feature_names is deprecated in 0.24 and will be removed " + "in 0.26. You can use get_output_names instead") def get_feature_names(self): """Returns a list of feature names, ordered by their indices. diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 32df3a7555203..2cf6994bade54 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -177,3 +177,14 @@ def test_n_features_in(): d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] dv.fit(d) assert not hasattr(dv, 'n_features_in_') + + +# TODO: Remove in 0.26 when get_feature_names is removed +def test_feature_union_get_feature_names_deprecated(): + D_in = [{"version": "1", "ham": 2}, + {"version": "2", "spam": .3}] + v = DictVectorizer().fit(D_in) + + msg = "get_feature_names is deprecated in 0.24" + with pytest.warns(FutureWarning, match=msg): + v.get_feature_names() From f7aa3fd773de482b4ebde36d181cb724ee4fa395 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 23 Sep 2020 11:02:12 -0400 Subject: [PATCH 028/100] WIP Deprecates text get_feature_names --- sklearn/feature_extraction/tests/test_text.py | 8 ++++++++ sklearn/feature_extraction/text.py | 3 +++ 2 files changed, 11 insertions(+) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index e7e8d52cbb6d2..6ecb6fffb3ffd 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1368,3 +1368,11 @@ def test_tie_breaking_sample_order_invariance(): vocab1 = vec.fit(['hello', 'world']).vocabulary_ vocab2 = vec.fit(['world', 'hello']).vocabulary_ assert vocab1 == vocab2 + + +# TODO: Remove in 0.26 when get_feature_names is removed +def test_get_feature_names_deprecated(): + cv = CountVectorizer(max_df=0.5).fit(ALL_FOOD_DOCS) + msg = "get_feature_names is deprecated in 0.24" + with pytest.warns(FutureWarning, match=msg): + cv.get_feature_names() diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 59fcfac832f36..8497d4f18d846 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -30,6 +30,7 @@ from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES +from ..utils.deprecation import deprecated from ..utils import _IS_32BIT from ..utils.fixes import _astype_copy_false from ..exceptions import NotFittedError @@ -1275,6 +1276,8 @@ def inverse_transform(self, X): return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel() for i in range(n_samples)] + @deprecated("get_feature_names is deprecated in 0.24 and will be removed " + "in 0.26. You can use get_output_names instead") def get_feature_names(self): """Array mapping from feature integer indices to feature name. From f4a9882628b0f2dd0a42636d1b189d63c4a0b040 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 23 Sep 2020 11:17:35 -0400 Subject: [PATCH 029/100] WIP Deprecates polyfeature.get_feature_names --- sklearn/pipeline.py | 2 +- sklearn/preprocessing/_data.py | 5 ++++- sklearn/preprocessing/_encoders.py | 2 +- sklearn/preprocessing/tests/test_data.py | 9 +++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 0ba67d0fce881..e4ec39a9326f8 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -973,7 +973,7 @@ def get_output_names(self, input_features=None): % (str(name), type(trans).__name__)) feature_names.extend( [name + "__" + f for f in - trans.get_output_names(input_features=input_features)]) + trans.get_output_names(input_features)]) return feature_names def fit(self, X, y=None, **fit_params): diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index a1177eca52436..bddc197dbf524 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -31,6 +31,7 @@ min_max_axis) from ..utils.validation import (check_is_fitted, check_random_state, FLOAT_DTYPES, _deprecate_positional_args) +from ..utils.deprecation import deprecated from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -1617,6 +1618,8 @@ def powers_(self): return np.vstack([np.bincount(c, minlength=self.n_input_features_) for c in combinations]) + @deprecated("get_feature_names is deprecated in 0.24 and will be removed " + "in 0.26. You can use get_output_names instead") def get_feature_names(self, input_features=None): """ Return feature names for output features @@ -1631,7 +1634,7 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ - return self.get_output_names(input_features=input_features) + return self.get_output_names(input_features) def get_output_names(self, input_features=None): """Get output feature names for transformation. 
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1537f2b5e3837..f359e3a0cd1f7 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -573,7 +573,7 @@ def get_feature_names(self, input_features=None): output_feature_names : ndarray of shape (n_output_features,) Array of feature names. """ - feature_names = self.get_output_names(input_features=input_features) + feature_names = self.get_output_names(input_features) return np.array(feature_names, dtype=object) def get_output_names(self, input_features=None): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index ebb2ac467079e..ebbdbbcb56569 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2546,3 +2546,12 @@ def test_minmax_scaler_clip(feature_range): X_transformed, [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]]) + + +# TODO: Remove in 0.26 when get_feature_names is removed +def test_get_feature_names_deprecated(): + X = np.arange(30).reshape(10, 3) + poly = PolynomialFeatures(degree=2, include_bias=False).fit(X) + msg = "get_feature_names is deprecated in 0.24" + with pytest.warns(FutureWarning, match=msg): + poly.get_feature_names() From fa4b318fb2898b4911734b72ee6acfbed46ac4ae Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 23 Sep 2020 11:22:43 -0400 Subject: [PATCH 030/100] WIP Deprecates one hot encoder get_feature_names --- sklearn/preprocessing/_encoders.py | 3 +++ sklearn/preprocessing/tests/test_encoders.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index f359e3a0cd1f7..9378eebc0ec6a 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -8,6 +8,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array +from ..utils.deprecation import deprecated from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args @@ -558,6 +559,8 @@ def inverse_transform(self, X): return X_tr + @deprecated("get_feature_names is deprecated in 0.24 and will be removed " + "in 0.26. You can use get_output_names instead") def get_feature_names(self, input_features=None): """ Return feature names for output features. diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index a6e34e1987f2f..4257ff2a3594c 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -773,3 +773,13 @@ def test_encoders_does_not_support_none_values(Encoder): with pytest.raises(TypeError, match="Encoders require their input to be " "uniformly strings or numbers."): Encoder().fit(values) + + +# TODO: Remove in 0.26 when get_feature_names is removed +def test_one_hot_encoder_get_feature_names_deprecated(): + X = np.array([['cat', 'dot']], dtype=object).T + enc = OneHotEncoder().fit(X) + + msg = "get_feature_names is deprecated in 0.24" + with pytest.warns(FutureWarning, match=msg): + enc.get_feature_names() From 640ad762b2e22078a67fb9be147680081fcad7d0 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 23 Sep 2020 12:53:50 -0400 Subject: [PATCH 031/100] ENH Adds get_output_names to all transformers --- sklearn/base.py | 21 +++++++++++- sklearn/decomposition/_kernel_pca.py | 17 ++++++++++ sklearn/impute/_base.py | 6 ++-- sklearn/impute/_iterative.py | 21 ++++++++++++ sklearn/impute/_knn.py | 22 ++++++++++++- sklearn/kernel_approximation.py | 18 ++++++++++ sklearn/preprocessing/_data.py | 17 ++++++++++ sklearn/preprocessing/_discretization.py | 16 +++++++++ sklearn/tests/test_common.py | 18 ++++++++-- sklearn/utils/estimator_checks.py | 42 +++++++++++++++++++----- 10 files changed, 181 insertions(+), 17 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 578e3716c628b..a631e1536c6b3 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -720,7 +720,7 @@ def get_output_names(self, input_features=None): # n_components could be auto or None # this is more likely to be an int n_features = self.n_components_ - elif hasattr(self, 'components_'): + if hasattr(self, 'components_'): n_features = self.components_.shape[0] elif hasattr(self, 'n_components') and self.n_components is not None: n_features = self.n_components @@ -730,6 +730,25 @@ def get_output_names(self, input_features=None): prefix=type(self).__name__.lower()) +class _ComponentsMixin: + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + n_features = self.components_.shape[0] + return _make_feature_names(n_features=n_features, + prefix=type(self).__name__.lower()) + + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" _estimator_type = "DensityEstimator" diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 63fcfda41ba7d..ae1bb43727ffe 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -15,6 +15,7 @@ from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels from ..utils.validation import _deprecate_positional_args +from ..utils._feature_names import _make_feature_names class KernelPCA(TransformerMixin, BaseEstimator): @@ -364,5 +365,21 @@ def inverse_transform(self, X): K.flat[::n_samples + 1] += self.alpha return np.dot(K, self.dual_coef_) + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.lambdas_.shape[0], + prefix=type(self).__name__.lower()) + def _more_tags(self): return {'preserves_dtype': [np.float64, np.float32]} diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index cba599e522557..5c06e05bb73cb 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -504,13 +504,13 @@ def get_output_names(self, input_features=None): Returns ------- - feature_names : ndarray of str + feature_names : list of str Transformed feature names. 
""" - check_is_fitted(self, 'statistics_') + check_is_fitted(self) input_features = _make_feature_names(self.statistics_.shape[0], input_features=input_features) - return np.array(input_features)[self._valid_mask] + return np.array(input_features)[self._valid_mask].tolist() def inverse_transform(self, X): """Convert the data back to the original representation. diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index f27e5f2b05f8d..3ec901f2892d7 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -13,6 +13,7 @@ is_scalar_nan) from ..utils.validation import FLOAT_DTYPES, check_is_fitted from ..utils._mask import _get_mask +from ..utils._feature_names import _make_feature_names from ._base import _BaseImputer from ._base import SimpleImputer @@ -746,3 +747,23 @@ def fit(self, X, y=None): """ self.fit_transform(X) return self + + def get_output_names(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str + Input feature names. + + Returns + ------- + feature_names : list of str + Transformed feature names. 
+ """ + check_is_fitted(self) + input_features = _make_feature_names( + self.initial_imputer_.statistics_.shape[0], + input_features=input_features) + return (np.array(input_features)[self.initial_imputer_._valid_mask] + .tolist()) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index df66e4a20aff6..cad236ab08cbe 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -13,6 +13,7 @@ from ..utils import check_array from ..utils import is_scalar_nan from ..utils._mask import _get_mask +from ..utils._feature_names import _make_feature_names from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args @@ -188,6 +189,7 @@ def fit(self, X, y=None): _check_weights(self.weights) self._fit_X = X self._mask_fit_X = _get_mask(self._fit_X, self.missing_values) + self._valid_mask = ~np.all(self._mask_fit_X, axis=0) super()._fit_indicator(self._mask_fit_X) @@ -222,7 +224,7 @@ def transform(self, X): mask = _get_mask(X, self.missing_values) mask_fit_X = self._mask_fit_X - valid_mask = ~np.all(mask_fit_X, axis=0) + valid_mask = self._valid_mask X_indicator = super()._transform_indicator(mask) @@ -303,3 +305,21 @@ def process_chunk(dist_chunk, start): pass return super()._concatenate_indicator(X[:, valid_mask], X_indicator) + + def get_output_names(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str + Input feature names. + + Returns + ------- + feature_names : ndarray of str + Transformed feature names. 
+ """ + check_is_fitted(self) + input_features = _make_feature_names(self._valid_mask.shape[0], + input_features=input_features) + return np.array(input_features)[self._valid_mask].tolist() diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 9c666272e2f5e..371aed316f7c1 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -21,6 +21,7 @@ from .base import BaseEstimator from .base import TransformerMixin from .utils import check_array, check_random_state, as_float_array +from .utils._feature_names import _make_feature_names from .utils.extmath import safe_sparse_dot from .utils.validation import check_is_fitted from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS @@ -620,6 +621,23 @@ def _transform_sparse(self, X): return sp.hstack(X_new) + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + n_features = self.n_features_in_ * (2 * self.sample_steps - 1) + return _make_feature_names(n_features=n_features, + prefix=type(self).__name__.lower()) + def _more_tags(self): return {'stateless': True, 'requires_positive_X': True} diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index bddc197dbf524..0c9835c968c46 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -32,6 +32,7 @@ from ..utils.validation import (check_is_fitted, check_random_state, FLOAT_DTYPES, _deprecate_positional_args) from ..utils.deprecation import deprecated +from ..utils._feature_names import _make_feature_names from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -2289,6 +2290,22 @@ def transform(self, K, copy=True): return K + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.K_fit_rows_.shape[0], + prefix=type(self).__name__.lower()) + @property def _pairwise(self): return True diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 0dddebbf2823c..f0e63e902ceb4 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -356,3 +356,19 @@ def inverse_transform(self, Xt): Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])] return Xinv + + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, they are generated as + x0, x1, ..., xn_features. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + return self._encoder.get_feature_names(input_features) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index c41bdb1116a6c..7aa42d664d168 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -37,7 +37,8 @@ _set_checking_parameters, _get_check_estimator_ids, check_class_weight_balanced_linear_classifier, - parametrize_with_checks) + parametrize_with_checks, + check_transformer_get_output_names) def test_all_estimator_no_base_class(): @@ -66,8 +67,8 @@ def test_get_check_estimator_ids(val, expected): assert _get_check_estimator_ids(val) == expected -def _tested_estimators(): - for name, Estimator in all_estimators(): +def _tested_estimators(type_filter=None): + for name, Estimator in all_estimators(type_filter): if issubclass(Estimator, BiclusterMixin): continue try: @@ -268,3 +269,14 @@ def test_strict_mode_check_estimator(): def test_strict_mode_parametrize_with_checks(estimator, check): # Ideally we should assert that the strict checks are Xfailed... 
check(estimator) + + +@pytest.mark.parametrize( + "transformer", + [est for est in _tested_estimators('transformer') + if "2darray" in est._get_tags()["X_types"] and + not est._get_tags()["no_validation"]], + ids=_get_check_estimator_ids) +def test_transformers_get_output_names(transformer): + check_transformer_get_output_names(type(transformer).__name__, + transformer) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e24cf9aeb4e0f..77d85ddc49c95 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1333,15 +1333,6 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): transformer_clone = clone(transformer) X_pred = transformer_clone.fit_transform(X, y=y_) - input_features = ['feature%d' % i for i in range(n_features)] - if hasattr(transformer_clone, 'get_output_names'): - feature_names = transformer_clone.get_output_names(input_features) - if feature_names is not None: - if isinstance(X_pred, tuple): - assert len(feature_names) == X_pred[0].shape[1] - else: - assert len(feature_names) == X_pred.shape[1] - if isinstance(X_pred, tuple): for x_pred in X_pred: assert x_pred.shape[0] == n_samples @@ -3147,3 +3138,36 @@ def check_requires_y_none(name, estimator_orig, strict_mode=True): _FULLY_STRICT_CHECKS = set([ 'check_n_features_in', ]) + + +def check_transformer_get_output_names(name, transformer_orig, + strict_mode=True): + X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], + random_state=0, n_features=2, cluster_std=0.1) + X = StandardScaler().fit_transform(X) + X -= X.min() + X = _pairwise_estimator_convert_X(X, transformer_orig) + + n_samples, n_features = np.asarray(X).shape + transformer = clone(transformer_orig) + _set_checking_parameters(transformer) + set_random_state(transformer) + + y_ = y + if name in CROSS_DECOMPOSITION: + y_ = np.c_[np.asarray(y), np.asarray(y)] + y_[::2, 1] *= 2 + + X_pred = transformer.fit_transform(X, y=y_) + + 
input_features = ['feature%d' % i for i in range(n_features)] + feature_names = transformer.get_output_names(input_features) + assert feature_names is not None + if isinstance(X_pred, tuple): + assert len(feature_names) == X_pred[0].shape[1], ( + f"Expected {X_pred[0].shape[1]} feature names, got " + f"{len(feature_names)}") + else: + assert len(feature_names) == X_pred.shape[1], ( + f"Expected {X_pred.shape[1]} feature names, got " + f"{len(feature_names)}") From d9d2d95aa448430d30aa060fb5bdb09fd4bbf09b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 23 Sep 2020 13:19:02 -0400 Subject: [PATCH 032/100] ENH Add get_output_names everywhere --- sklearn/base.py | 49 ------------------ sklearn/cross_decomposition/_pls.py | 33 ++++++++++++ sklearn/decomposition/_base.py | 17 ++++++ sklearn/decomposition/_dict_learning.py | 34 ++++++++++++ sklearn/decomposition/_factor_analysis.py | 17 ++++++ sklearn/decomposition/_fastica.py | 17 ++++++ sklearn/decomposition/_lda.py | 17 ++++++ sklearn/decomposition/_nmf.py | 17 ++++++ sklearn/decomposition/_sparse_pca.py | 17 ++++++ sklearn/decomposition/_truncated_svd.py | 17 ++++++ sklearn/kernel_approximation.py | 63 +++++++++++++++++++++++ sklearn/manifold/_isomap.py | 18 +++++++ sklearn/manifold/_locally_linear.py | 17 ++++++ sklearn/neighbors/_nca.py | 17 ++++++ sklearn/neural_network/_rbm.py | 17 ++++++ 15 files changed, 318 insertions(+), 49 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index a631e1536c6b3..a944a09b8813b 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -699,55 +699,6 @@ def fit_transform(self, X, y=None, **fit_params): # fit method of arity 2 (supervised transformation) return self.fit(X, y, **fit_params).transform(X) - def get_output_names(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. 
- - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - # generate feature names from class name by default - # would be much less guessing if we stored the number - # of output features. - # Ideally this would be done in each class. - if hasattr(self, 'n_components_'): - # n_components could be auto or None - # this is more likely to be an int - n_features = self.n_components_ - if hasattr(self, 'components_'): - n_features = self.components_.shape[0] - elif hasattr(self, 'n_components') and self.n_components is not None: - n_features = self.n_components - else: - return None - return _make_feature_names(n_features=n_features, - prefix=type(self).__name__.lower()) - - -class _ComponentsMixin: - def get_output_names(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - n_features = self.components_.shape[0] - return _make_feature_names(n_features=n_features, - prefix=type(self).__name__.lower()) - class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 9d8df42bf1a46..6379bbf4604f7 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -20,6 +20,7 @@ from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning from ..utils.deprecation import deprecated +from ..utils._feature_names import _make_feature_names __all__ = ['PLSCanonical', 'PLSRegression', 'PLSSVD'] @@ -441,6 +442,22 @@ def _more_tags(self): return {'poor_score': True, 'requires_y': False} + def get_output_names(self, input_features=None): + """Get output feature names. 
+ + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.n_components, + prefix=type(self).__name__.lower()) + class PLSRegression(_PLS): """PLS regression @@ -956,3 +973,19 @@ def fit_transform(self, X, y=None): `(X_transformed, Y_transformed)` otherwise. """ return self.fit(X, y).transform(X, y) + + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.n_components, + prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index e89a05051404b..11d55e90ba153 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -14,6 +14,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.validation import check_is_fitted +from ..utils._feature_names import _make_feature_names from abc import ABCMeta, abstractmethod @@ -157,3 +158,19 @@ def inverse_transform(self, X): self.components_) + self.mean_ else: return np.dot(X, self.components_) + self.mean_ + + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + return _make_feature_names(n_features=self.components_.shape[0], + prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 71cbfde40d1c6..dc16c50cf7997 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -20,6 +20,7 @@ from ..utils.extmath import randomized_svd, row_norms from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..utils.fixes import delayed +from ..utils._feature_names import _make_feature_names from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars @@ -1357,6 +1358,22 @@ def fit(self, X, y=None): self.error_ = E return self + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.components_.shape[0], + prefix=type(self).__name__.lower()) + class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): """Mini-batch dictionary learning @@ -1648,3 +1665,20 @@ def partial_fit(self, X, y=None, iter_offset=None): self.inner_stats_ = (A, B) self.iter_offset_ = iter_offset + 1 return self + + + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + return _make_feature_names(n_features=self.components_.shape[0], + prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 76fecbe31598e..fceeec83ea6c9 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -29,6 +29,7 @@ from ..utils import check_array, check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils._feature_names import _make_feature_names from ..exceptions import ConvergenceWarning @@ -388,6 +389,22 @@ def _rotate(self, components, n_components=None, tol=1e-6): raise ValueError("'method' must be in %s, not %s" % (implemented, method)) + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + return _make_feature_names(n_features=self.components_.shape[0], + prefix=type(self).__name__.lower()) + def _ortho_rotation(components, method='varimax', tol=1e-6, max_iter=100): """Return rotated components.""" diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index e2a4cd55058e7..7aa9a0c4a1d11 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -21,6 +21,7 @@ from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _deprecate_positional_args +from ..utils._feature_names import _make_feature_names __all__ = ['fastica', 'FastICA'] @@ -624,3 +625,19 @@ def inverse_transform(self, X, copy=True): X += self.mean_ return X + + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.components_.shape[0], + prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 14dd87b9db130..d4d7d984acf68 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -22,6 +22,7 @@ from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed +from ..utils._feature_names import _make_feature_names from ._online_lda_fast import (mean_change, _dirichlet_expectation_1d, _dirichlet_expectation_2d) @@ -838,3 +839,19 @@ def perplexity(self, X, sub_sampling=False): Perplexity score. """ return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) + + def get_output_names(self, input_features=None): + """Get output feature names. 
+ + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.n_components, + prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 4eaf9c29e5703..3ec2f495927ab 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -17,6 +17,7 @@ from ..base import BaseEstimator, TransformerMixin from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array +from ..utils._feature_names import _make_feature_names from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm from ..utils.validation import check_is_fitted, check_non_negative from ..utils.validation import _deprecate_positional_args @@ -1376,3 +1377,19 @@ def inverse_transform(self, W): """ check_is_fitted(self) return np.dot(W, self.components_) + + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + return _make_feature_names(n_features=self.n_components_, + prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index b850182c24200..1498fab271cf1 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -7,6 +7,7 @@ from ..utils import check_random_state, check_array from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args +from ..utils._feature_names import _make_feature_names from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin from ._dict_learning import dict_learning, dict_learning_online @@ -205,6 +206,22 @@ def transform(self, X): return U + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + return _make_feature_names(n_features=self.n_components_, + prefix=type(self).__name__.lower()) + class MiniBatchSparsePCA(SparsePCA): """Mini-batch Sparse Principal Components Analysis diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 558dcbc69b38b..f8e549c382c61 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -12,6 +12,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, check_random_state +from ..utils._feature_names import _make_feature_names from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import _deprecate_positional_args @@ -235,3 +236,19 @@ def inverse_transform(self, X): def _more_tags(self): return {'preserves_dtype': [np.float64, np.float32]} + + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.components_.shape[0], + prefix=type(self).__name__.lower()) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 371aed316f7c1..5db7617ff5338 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -194,6 +194,22 @@ def transform(self, X): return data_sketch + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + return _make_feature_names(n_features=self.n_components, + prefix=type(self).__name__.lower()) + class RBFSampler(TransformerMixin, BaseEstimator): """Approximates feature map of an RBF kernel by Monte Carlo approximation @@ -310,6 +326,22 @@ def transform(self, X): projection *= np.sqrt(2.) / np.sqrt(self.n_components) return projection + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.n_components, + prefix=type(self).__name__.lower()) + class SkewedChi2Sampler(TransformerMixin, BaseEstimator): """Approximates feature map of the "skewed chi-squared" kernel by Monte @@ -434,6 +466,21 @@ def transform(self, X): projection *= np.sqrt(2.) / np.sqrt(self.n_components) return projection + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.n_components, + prefix=type(self).__name__.lower()) class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): """Approximate feature map for additive chi2 kernel. @@ -832,6 +879,22 @@ def _get_kernel_params(self): return params + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + return _make_feature_names(n_features=self.components_.shape[0], + prefix=type(self).__name__.lower()) + def _more_tags(self): return { '_xfail_checks': { diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index d843c3ddd8462..34e634cb8d0b3 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -9,6 +9,7 @@ from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args from ..utils.graph import graph_shortest_path +from ..utils._feature_names import _make_feature_names from ..decomposition import KernelPCA from ..preprocessing import KernelCenterer @@ -270,3 +271,20 @@ def transform(self, X): G_X *= -0.5 return self.kernel_pca_.transform(G_X) + + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names( + n_features=self.kernel_pca_.lambdas_.shape[0], + prefix=type(self).__name__.lower()) diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index d9c02416bbb68..13cffa3b75289 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -15,6 +15,7 @@ from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _deprecate_positional_args +from ..utils._feature_names import _make_feature_names from ..neighbors import NearestNeighbors @@ -733,3 +734,19 @@ def transform(self, X): for i in range(X.shape[0]): X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i]) return X_new + + def get_output_names(self, input_features=None): + """Get output feature names. 
+ + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.n_components, + prefix=type(self).__name__.lower()) diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 8920b2d99ed02..e0b73594d59a4 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -20,6 +20,7 @@ from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder from ..decomposition import PCA +from ..utils._feature_names import _make_feature_names from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state from ..utils.validation import check_is_fitted, check_array, check_scalar @@ -523,5 +524,21 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): return sign * loss, sign * gradient.ravel() + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. 
+ """ + return _make_feature_names(n_features=self.components_.shape[0], + prefix=type(self).__name__.lower()) + def _more_tags(self): return {'requires_y': True} diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index d1028911f4185..e3a4cbc774776 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -21,6 +21,7 @@ from ..utils.extmath import safe_sparse_dot from ..utils.extmath import log_logistic from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils._feature_names import _make_feature_names class BernoulliRBM(TransformerMixin, BaseEstimator): @@ -385,3 +386,19 @@ def _more_tags(self): 'fails for the decision_function method' } } + + def get_output_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + output_feature_names : list of str + Feature names for transformer output. + """ + return _make_feature_names(n_features=self.components_.shape[0], + prefix=type(self).__name__.lower()) From f6075cafb45704d61fcc7de373c58e2822438c7d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 23 Sep 2020 13:26:57 -0400 Subject: [PATCH 033/100] STY Lint fixes --- sklearn/decomposition/_dict_learning.py | 1 - sklearn/preprocessing/_data.py | 1 - 2 files changed, 2 deletions(-) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index dc16c50cf7997..5440a7265b74f 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1666,7 +1666,6 @@ def partial_fit(self, X, y=None, iter_offset=None): self.iter_offset_ = iter_offset + 1 return self - def get_output_names(self, input_features=None): """Get output feature names. 
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 0c9835c968c46..ee4d6225f63a0 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -23,7 +23,6 @@ from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var -from ..utils._feature_names import _make_feature_names from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2) from ..utils.sparsefuncs import (inplace_column_scale, From 1af211c3aedfa13afbc86386339275489f6aa07f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 23 Sep 2020 13:51:42 -0400 Subject: [PATCH 034/100] TST Adds test for missing indicator --- sklearn/impute/_base.py | 11 ++++++++--- sklearn/impute/_iterative.py | 10 ++++++++-- sklearn/impute/_knn.py | 8 +++++++- sklearn/impute/tests/test_common.py | 28 ++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 5c06e05bb73cb..c8feb8a6b9599 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -510,7 +510,13 @@ def get_output_names(self, input_features=None): check_is_fitted(self) input_features = _make_feature_names(self.statistics_.shape[0], input_features=input_features) - return np.array(input_features)[self._valid_mask].tolist() + output = np.array(input_features)[self._valid_mask].tolist() + if not self.add_indicator: + return output + missing_names = self.indicator_.get_output_names(input_features) + missing_names = [f'missingindicator__{name}' for name in + missing_names] + return output + missing_names def inverse_transform(self, X): """Convert the data back to the original representation. @@ -885,5 +891,4 @@ def get_output_names(self, input_features=None): Feature names for transformer output. 
""" return _make_feature_names( - n_features=len(self.features_), - prefix=type(self).__name__.lower()) + n_features=len(self.features_), input_features=input_features) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 3ec901f2892d7..21e81b5932804 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -765,5 +765,11 @@ def get_output_names(self, input_features=None): input_features = _make_feature_names( self.initial_imputer_.statistics_.shape[0], input_features=input_features) - return (np.array(input_features)[self.initial_imputer_._valid_mask] - .tolist()) + output = (np.array(input_features)[self.initial_imputer_._valid_mask] + .tolist()) + if not self.add_indicator: + return output + missing_names = self.indicator_.get_output_names(input_features) + missing_names = [f'missingindicator__{name}' for name in + missing_names] + return output + missing_names diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index cad236ab08cbe..f5587701d972f 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -322,4 +322,10 @@ def get_output_names(self, input_features=None): check_is_fitted(self) input_features = _make_feature_names(self._valid_mask.shape[0], input_features=input_features) - return np.array(input_features)[self._valid_mask].tolist() + output = np.array(input_features)[self._valid_mask].tolist() + if not self.add_indicator: + return output + missing_names = self.indicator_.get_output_names(input_features) + missing_names = [f'missingindicator__{name}' for name in + missing_names] + return output + missing_names diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index 220a335c15285..cf6a255327bc6 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -113,3 +113,31 @@ def test_imputers_pandas_na_integer_array_support(imputer, add_indicator): X_trans = imputer.fit_transform(X_df) assert_allclose(X_trans_expected, 
X_trans) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("marker", [np.nan, -1, 0]) +@pytest.mark.parametrize("imputer", IMPUTERS) +def test_imputers_marker(marker, imputer): + X = np.array([ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4] + ]) + imputer.set_params(missing_values=marker, add_indicator=False) + feature_names_in = [f'feat{i}' for i in range(X.shape[1])] + expected_features_out = ['feat0', 'feat1', 'feat2', 'feat4'] + imputer.fit(X) + assert_array_equal(imputer.get_output_names(feature_names_in), + expected_features_out) + + imputer.set_params(missing_values=marker, add_indicator=True) + expected_features_out = ( + ['feat0', 'feat1', 'feat2', 'feat4'] + + [f'missingindicator__feat{i}' for i in range(X.shape[1])]) + + imputer.fit(X) + assert_array_equal(imputer.get_output_names(feature_names_in), + expected_features_out) From 9ab0cf910c533ed1bbca4cef26f60d0e5a9e89d6 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 23 Sep 2020 13:56:52 -0400 Subject: [PATCH 035/100] REV Revert changes --- sklearn/compose/_column_transformer.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index a46560412f85b..02faf1d093f3e 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -381,12 +381,9 @@ def get_feature_names(self): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names." 
% (str(name), type(trans).__name__)) - try: - more_names = trans.get_feature_names(input_features=column) - except TypeError: - more_names = trans.get_feature_names() feature_names.extend([name + "__" + f for f in - more_names]) + trans.get_feature_names()]) + return feature_names return feature_names def get_output_names(self, input_features=None): @@ -423,9 +420,9 @@ def get_output_names(self, input_features=None): raise AttributeError("Transformer %s (type %s) does not " "provide get_output_names." % (str(name), type(trans).__name__)) - more_names = trans.get_output_names(input_features=column) - feature_names.extend([name + "__" + f for f in - more_names]) + feature_names.extend( + [name + "__" + f + for f in trans.get_output_names(input_features=column)]) return feature_names def _update_fitted_transformers(self, transformers): From 292649291ecda1eb940325b991adf8dfa9073543 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 23 Sep 2020 14:02:33 -0400 Subject: [PATCH 036/100] TST Fixes --- sklearn/preprocessing/_data.py | 3 ++- sklearn/preprocessing/_encoders.py | 3 +-- sklearn/preprocessing/tests/test_encoders.py | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index ee4d6225f63a0..5f8ecd06071d9 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1651,7 +1651,8 @@ def get_output_names(self, input_features=None): Transformed feature names. 
""" powers = self.powers_ - input_features = _make_feature_names(n_features=powers.shape[1]) + input_features = _make_feature_names(n_features=powers.shape[1], + input_features=input_features) feature_names = [] for row in powers: inds = np.where(row)[0] diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 85cf808e20a81..b1c692dd1951a 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -276,8 +276,7 @@ class OneHotEncoder(_BaseEncoder): array([['Male', 1], [None, 2]], dtype=object) >>> enc.get_output_names(['gender', 'group']) - array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], - dtype=object) + ['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'] One can always drop the first column for each feature: diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 832c00379758f..f1bc6f10a9522 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -145,7 +145,9 @@ def test_one_hot_encoder_feature_names(get_names): enc.fit(X) feature_names = getattr(enc, get_names)() - assert isinstance(feature_names, np.ndarray) + + if get_names == 'get_feature_names': + assert isinstance(feature_names, np.ndarray) assert_array_equal(['x0_Female', 'x0_Male', 'x1_1', 'x1_41', 'x1_51', 'x1_91', @@ -431,7 +433,8 @@ def test_one_hot_encoder_feature_names_drop(get_names, drop, expected_names): ohe = OneHotEncoder(drop=drop) ohe.fit(X) feature_names = getattr(ohe, get_names)() - assert isinstance(feature_names, np.ndarray) + if get_names == 'get_feature_names': + assert isinstance(feature_names, np.ndarray) assert_array_equal(expected_names, feature_names) From c1a1778ad6d42c59d5ea65cfc5331e73b9af848d Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 23 Sep 2020 15:21:34 -0400 Subject: [PATCH 037/100] BUG Fixes missing indicator --- sklearn/compose/_column_transformer.py | 1 - .../compose/tests/test_column_transformer.py | 28 +++++++++++++++---- sklearn/impute/_base.py | 5 ++-- sklearn/impute/tests/test_impute.py | 13 +++++++++ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 02faf1d093f3e..073526a579862 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -384,7 +384,6 @@ def get_feature_names(self): feature_names.extend([name + "__" + f for f in trans.get_feature_names()]) return feature_names - return feature_names def get_output_names(self, input_features=None): """Get output feature names for transformation. diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index c5ff037cb5f64..becfa84749820 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1385,13 +1385,31 @@ def test_make_column_selector_pickle(): # TODO: Remove in 0.26 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) @pytest.mark.parametrize( 'empty_col', [[], np.array([], dtype=int), lambda x: []], ids=['list', 'array', 'callable'] ) -def test_feature_names_empty_columns(empty_col, get_names): +def test_feature_names_empty_columns(empty_col): + pd = pytest.importorskip('pandas') + + df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) + + ct = ColumnTransformer( + transformers=[ + ("ohe", OneHotEncoder(), ["col1", "col2"]), + ("empty_features", OneHotEncoder(), empty_col), + ], + ) + + ct.fit(df) + assert ct.get_feature_names() == ['ohe__x0_a', 'ohe__x0_b', 'ohe__x1_z'] + + +@pytest.mark.parametrize( + 'empty_col', [[], np.array([], dtype=int), lambda x: []], + ids=['list', 'array', 'callable'] +) +def test_output_names_empty_columns(empty_col): pd = pytest.importorskip('pandas') df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) @@ -1404,8 +1422,8 @@ def test_feature_names_empty_columns(empty_col, get_names): ) ct.fit(df) - assert getattr(ct, get_names)() == ['ohe__col1_a', 'ohe__col1_b', - 'ohe__col2_z'] + assert ct.get_output_names() == ['ohe__col1_a', 'ohe__col1_b', + 'ohe__col2_z'] @pytest.mark.parametrize('remainder', ["passthrough", StandardScaler()]) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index c8feb8a6b9599..aa527bb48f160 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -890,5 +890,6 @@ def get_output_names(self, input_features=None): output_feature_names : list of str Feature names for transformer output. 
""" - return _make_feature_names( - n_features=len(self.features_), input_features=input_features) + names = _make_feature_names( + n_features=self._n_features, input_features=input_features) + return [names[i] for i in self.features_] diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 4c0918b9a3230..d25abc481ad5b 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -1474,3 +1474,16 @@ def test_simple_imputation_inverse_transform_exceptions(missing_value): with pytest.raises(ValueError, match=f"Got 'add_indicator={imputer.add_indicator}'"): imputer.inverse_transform(X_1_trans) + + +@pytest.mark.parametrize( + "features, expected_names", + [("missing-only", ["feat0", "feat2"]), + ("all", ["feat0", "feat1", "feat2"])]) +def test_missing_indicator_get_output_names(features, expected_names): + # output names are correct for missing indicator + X = np.array([[1, 0, np.nan], + [np.nan, 1, 1]]) + indicator = MissingIndicator(features=features).fit(X) + assert_array_equal(indicator.get_output_names(["feat0", "feat1", "feat2"]), + expected_names) From 37101b093785c4bf998120db3ca12eccfbede58a Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 23 Sep 2020 16:08:35 -0400 Subject: [PATCH 038/100] TST Fixes test --- sklearn/impute/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index cf6a255327bc6..06bc4938e5a61 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -136,7 +136,7 @@ def test_imputers_marker(marker, imputer): imputer.set_params(missing_values=marker, add_indicator=True) expected_features_out = ( ['feat0', 'feat1', 'feat2', 'feat4'] + - [f'missingindicator__feat{i}' for i in range(X.shape[1])]) + [f'missingindicator__feat{i}' for i in [0, 1, 2, 3]]) imputer.fit(X) assert_array_equal(imputer.get_output_names(feature_names_in), From adcc1c1166f32292148570678b08bbd2b496b91f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 30 Sep 2020 10:28:57 -0400 Subject: [PATCH 039/100] TST Adds test filtering --- sklearn/base.py | 2 +- sklearn/impute/_base.py | 4 ++-- sklearn/tests/test_common.py | 37 ++++++++++++++++++++++++++++++------ 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index a944a09b8813b..55a11e3958571 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -749,7 +749,7 @@ def fit_predict(self, X, y=None): class OneToOneMixin: - """Provides get_feature_names for simple transformers + """Provides get_output_names for simple transformers Assumes there's a 1-to-1 correspondence between input features and output features. diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index aa527bb48f160..629237da6cb81 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -890,6 +890,6 @@ def get_output_names(self, input_features=None): output_feature_names : list of str Feature names for transformer output. 
""" - names = _make_feature_names( - n_features=self._n_features, input_features=input_features) + names = _make_feature_names(n_features=self._n_features, + input_features=input_features) return [names[i] for i in self.features_] diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 7aa42d664d168..e8a5589042033 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -271,12 +271,37 @@ def test_strict_mode_parametrize_with_checks(estimator, check): check(estimator) -@pytest.mark.parametrize( - "transformer", - [est for est in _tested_estimators('transformer') - if "2darray" in est._get_tags()["X_types"] and - not est._get_tags()["no_validation"]], - ids=_get_check_estimator_ids) +# TODO: As more modules support get_output_names they should be removed from +# this list to be tested +GET_OUTPUT_NAMES_MODULES_TO_IGNORE = [ + 'cluster', + 'compose', + 'cross_decomposition', + 'decomposition', + 'discriminant_analysis', + 'ensemble', + 'feature_extraction', + 'feature_selection', + 'impute', + 'isotonic', + 'kernel_approximation', + 'manifold', + 'neighbors', + 'neural_network', + 'pipeline', + 'preprocessing', + 'random_projection' +] + +GET_OUTPUT_NAMES_ESTIMATORS = [ + est for est in _tested_estimators('transformer') + if "2darray" in est._get_tags()["X_types"] and + not est._get_tags()["no_validation"] and + est.__module__.split('.')[1] not in GET_OUTPUT_NAMES_MODULES_TO_IGNORE +] + + +@pytest.mark.parametrize("transformer", GET_OUTPUT_NAMES_ESTIMATORS) def test_transformers_get_output_names(transformer): check_transformer_get_output_names(type(transformer).__name__, transformer) From 9a07816bf1e5877e74bd59066a680995ebdc1cd8 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 30 Sep 2020 10:30:11 -0400 Subject: [PATCH 040/100] CLN Change to get_feature_names_out --- doc/glossary.rst | 4 +- doc/modules/compose.rst | 10 ++-- doc/modules/feature_extraction.rst | 14 ++--- .../plot_topics_extraction_with_nmf_lda.py | 6 +- .../bicluster/plot_bicluster_newsgroups.py | 2 +- .../plot_column_transformer_mixed_types.py | 2 +- .../plot_feature_selection_pipeline.py | 2 +- ...linear_model_coefficient_interpretation.py | 2 +- .../inspection/plot_permutation_importance.py | 2 +- ...ot_document_classification_20newsgroups.py | 2 +- examples/text/plot_document_clustering.py | 2 +- .../text/plot_hashing_vs_dict_vectorizer.py | 2 +- sklearn/base.py | 4 +- sklearn/cluster/_agglomerative.py | 2 +- sklearn/cluster/_birch.py | 2 +- sklearn/cluster/_kmeans.py | 4 +- sklearn/compose/_column_transformer.py | 10 ++-- .../compose/tests/test_column_transformer.py | 6 +- sklearn/cross_decomposition/_pls.py | 4 +- sklearn/datasets/descr/twenty_newsgroups.rst | 2 +- sklearn/decomposition/_base.py | 2 +- sklearn/decomposition/_dict_learning.py | 4 +- sklearn/decomposition/_factor_analysis.py | 2 +- sklearn/decomposition/_fastica.py | 2 +- sklearn/decomposition/_kernel_pca.py | 2 +- sklearn/decomposition/_lda.py | 2 +- sklearn/decomposition/_nmf.py | 2 +- sklearn/decomposition/_sparse_pca.py | 2 +- sklearn/decomposition/_truncated_svd.py | 2 +- sklearn/discriminant_analysis.py | 2 +- .../feature_extraction/_dict_vectorizer.py | 8 +-- .../tests/test_dict_vectorizer.py | 6 +- sklearn/feature_extraction/tests/test_text.py | 10 ++-- sklearn/feature_extraction/text.py | 12 ++-- sklearn/feature_selection/_base.py | 2 +- sklearn/impute/_base.py | 6 +- sklearn/impute/_iterative.py | 4 +- sklearn/impute/_knn.py | 4 +- sklearn/impute/tests/test_common.py | 4 +- sklearn/impute/tests/test_impute.py | 4 +- sklearn/kernel_approximation.py | 10 ++-- sklearn/manifold/_isomap.py | 2 +- sklearn/manifold/_locally_linear.py | 2 +- sklearn/neighbors/_graph.py | 4 +- 
sklearn/neighbors/_nca.py | 2 +- sklearn/neural_network/_rbm.py | 2 +- sklearn/pipeline.py | 20 +++---- sklearn/preprocessing/_data.py | 8 +-- sklearn/preprocessing/_discretization.py | 2 +- sklearn/preprocessing/_encoders.py | 8 +-- sklearn/preprocessing/tests/test_data.py | 2 +- sklearn/preprocessing/tests/test_encoders.py | 8 +-- sklearn/random_projection.py | 2 +- sklearn/tests/test_common.py | 16 +++--- sklearn/tests/test_pipeline.py | 56 +++++++++---------- sklearn/utils/estimator_checks.py | 6 +- 56 files changed, 158 insertions(+), 158 deletions(-) diff --git a/doc/glossary.rst b/doc/glossary.rst index 7cf344bee80e4..80583de4e8a74 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -868,7 +868,7 @@ Class APIs and Estimator Types * :term:`fit` * :term:`transform` * :term:`get_feature_names` - * :term:`get_output_names` + * :term:`get_feature_names_out` meta-estimator meta-estimators @@ -1237,7 +1237,7 @@ Methods to the names of input columns from which output column names can be generated. By default input features are named x0, x1, .... - ``get_output_names`` + ``get_feature_names_out`` Primarily for :term:`feature extractors`, but also used for other transformers to provide string names for each column in the output of the estimator's :term:`transform` method. It outputs a list of diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index fc9415311ef37..efedba2376b06 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -139,7 +139,7 @@ or by name:: >>> pipe['reduce_dim'] PCA() -To enable model inspection, `Pipeline` has an ``get_output_names()`` method, +To enable model inspection, `Pipeline` has a ``get_feature_names_out()`` method, just like all transformers. You can use pipeline slicing to get the feature names going into each step:: @@ -151,13 +151,13 @@ going into each step:: ... 
('clf', LogisticRegression())]) >>> pipe.fit(iris.data, iris.target) Pipeline(steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) - >>> pipe[:-1].get_output_names() + >>> pipe[:-1].get_feature_names_out() array(['x2', 'x3'], dtype='<U2') - >>> pipe[:-1].get_output_names(iris.feature_names) + >>> pipe[:-1].get_feature_names_out(iris.feature_names) array(['petal length (cm)', 'petal width (cm)'], dtype='>> column_trans.get_output_names() + >>> column_trans.get_feature_names_out() ['categories__city_London', 'categories__city_Paris', 'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last', diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index f4900a102bbd2..0db3f9f3cdc18 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -53,7 +53,7 @@ is a traditional numerical feature:: [ 0., 1., 0., 12.], [ 0., 0., 1., 18.]]) - >>> vec.get_output_names() + >>> vec.get_feature_names_out() ['city=Dubai', 'city=London', 'city=San Francisco', 'temperature'] :class:`DictVectorizer` accepts multiple string values for one @@ -69,7 +69,7 @@ and its year of release. array([[0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 2.003e+03], [1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.011e+03], [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.974e+03]]) - >>> vec.get_output_names() == ['category=animation', 'category=drama', + >>> vec.get_feature_names_out() == ['category=animation', 'category=drama', ... 'category=family', 'category=thriller', ... 'year'] True @@ -111,7 +111,7 @@ suitable for feeding into a classifier (maybe after being piped into a with 6 stored elements in Compressed Sparse ... 
format> >>> pos_vectorized.toarray() array([[1., 1., 1., 1., 1., 1.]]) - >>> vec.get_output_names() + >>> vec.get_feature_names_out() ['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the'] As you can imagine, if one extracts such a context around each individual @@ -340,7 +340,7 @@ Each term found by the analyzer during the fit is assigned a unique integer index corresponding to a column in the resulting matrix. This interpretation of the columns can be retrieved as follows:: - >>> vectorizer.get_output_names() == ( + >>> vectorizer.get_feature_names_out() == ( ... ['and', 'document', 'first', 'is', 'one', ... 'second', 'the', 'third', 'this']) True @@ -742,7 +742,7 @@ decide better:: >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2)) >>> counts = ngram_vectorizer.fit_transform(['words', 'wprds']) - >>> ngram_vectorizer.get_output_names() == ( + >>> ngram_vectorizer.get_feature_names_out() == ( ... [' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp']) True >>> counts.toarray().astype(int) @@ -758,7 +758,7 @@ span across words:: >>> ngram_vectorizer.fit_transform(['jumpy fox']) <1x4 sparse matrix of type '<... 'numpy.int64'>' with 4 stored elements in Compressed Sparse ... format> - >>> ngram_vectorizer.get_output_names() == ( + >>> ngram_vectorizer.get_feature_names_out() == ( ... [' fox ', ' jump', 'jumpy', 'umpy ']) True @@ -766,7 +766,7 @@ span across words:: >>> ngram_vectorizer.fit_transform(['jumpy fox']) <1x5 sparse matrix of type '<... 'numpy.int64'>' with 5 stored elements in Compressed Sparse ... format> - >>> ngram_vectorizer.get_output_names() == ( + >>> ngram_vectorizer.get_feature_names_out() == ( ... 
['jumpy', 'mpy f', 'py fo', 'umpy ', 'y fox']) True diff --git a/examples/applications/plot_topics_extraction_with_nmf_lda.py b/examples/applications/plot_topics_extraction_with_nmf_lda.py index 5d0b6ef7c629d..c677fa3b6650a 100644 --- a/examples/applications/plot_topics_extraction_with_nmf_lda.py +++ b/examples/applications/plot_topics_extraction_with_nmf_lda.py @@ -103,7 +103,7 @@ def plot_top_words(model, feature_names, n_top_words, title): print("done in %0.3fs." % (time() - t0)) -tfidf_feature_names = tfidf_vectorizer.get_output_names() +tfidf_feature_names = tfidf_vectorizer.get_feature_names_out() plot_top_words(nmf, tfidf_feature_names, n_top_words, 'Topics in NMF model (Frobenius norm)') @@ -117,7 +117,7 @@ def plot_top_words(model, feature_names, n_top_words, title): l1_ratio=.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) -tfidf_feature_names = tfidf_vectorizer.get_output_names() +tfidf_feature_names = tfidf_vectorizer.get_feature_names_out() plot_top_words(nmf, tfidf_feature_names, n_top_words, 'Topics in NMF model (generalized Kullback-Leibler divergence)') @@ -132,5 +132,5 @@ def plot_top_words(model, feature_names, n_top_words, title): lda.fit(tf) print("done in %0.3fs." 
% (time() - t0)) -tf_feature_names = tf_vectorizer.get_output_names() +tf_feature_names = tf_vectorizer.get_feature_names_out() plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model') diff --git a/examples/bicluster/plot_bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py index e4c019ff2c84e..c01807e345928 100644 --- a/examples/bicluster/plot_bicluster_newsgroups.py +++ b/examples/bicluster/plot_bicluster_newsgroups.py @@ -89,7 +89,7 @@ def build_tokenizer(self): time() - start_time, v_measure_score(y_kmeans, y_true))) -feature_names = vectorizer.get_output_names() +feature_names = vectorizer.get_feature_names_out() document_names = list(newsgroups.target_names[i] for i in newsgroups.target) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index ba2de8cd551db..56f57bd830440 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -186,7 +186,7 @@ cv_coefs = np.concatenate([cv_pipeline[-1].coef_ for cv_pipeline in cv_results["estimator"]]) fig, ax = plt.subplots() -ax.barh(clf[:-1].get_output_names(), +ax.barh(clf[:-1].get_feature_names_out(), cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0)) plt.tight_layout() plt.show() diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index 69e5196ff6ba3..44cdb1365ed72 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -39,5 +39,5 @@ # access and plot the coefficients of the fitted model plt.barh((0, 1, 2), anova_svm[-1].coef_.ravel()) -plt.yticks((0, 1, 2), anova_svm[:-1].get_output_names()) +plt.yticks((0, 1, 2), anova_svm[:-1].get_feature_names_out()) plt.show() diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py 
b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 72cca2bff0b43..d859d48ddc8a9 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -208,7 +208,7 @@ feature_names = (model.named_steps['columntransformer'] .named_transformers_['onehotencoder'] - .get_output_names(input_features=categorical_columns)) + .get_feature_names_out(input_features=categorical_columns)) feature_names = np.concatenate( [feature_names, numerical_columns]) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 6a0f8a6ba995d..b04d82ef82bdf 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -124,7 +124,7 @@ ohe = (rf.named_steps['preprocess'] .named_transformers_['cat'] .named_steps['onehot']) -feature_names = ohe.get_output_names(input_features=categorical_columns) +feature_names = ohe.get_feature_names_out(input_features=categorical_columns) feature_names = np.r_[feature_names, numerical_columns] tree_feature_importances = ( diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py index 7ce06d47c730c..0b39ee49146c1 100644 --- a/examples/text/plot_document_classification_20newsgroups.py +++ b/examples/text/plot_document_classification_20newsgroups.py @@ -174,7 +174,7 @@ def size_mb(docs): if opts.use_hashing: feature_names = None else: - feature_names = vectorizer.get_output_names() + feature_names = vectorizer.get_feature_names_out() if opts.select_chi2: print("Extracting %d best features by a chi-squared test" % diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py index 12e28b9b09de7..128138681bc72 100644 --- a/examples/text/plot_document_clustering.py +++ b/examples/text/plot_document_clustering.py @@ 
-217,7 +217,7 @@ def is_interactive(): else: order_centroids = km.cluster_centers_.argsort()[:, ::-1] - terms = vectorizer.get_output_names() + terms = vectorizer.get_feature_names_out() for i in range(true_k): print("Cluster %d:" % i, end='') for ind in order_centroids[i, :10]: diff --git a/examples/text/plot_hashing_vs_dict_vectorizer.py b/examples/text/plot_hashing_vs_dict_vectorizer.py index 964f500e8bef8..1cf5c0aa6a0ce 100644 --- a/examples/text/plot_hashing_vs_dict_vectorizer.py +++ b/examples/text/plot_hashing_vs_dict_vectorizer.py @@ -89,7 +89,7 @@ def token_freqs(doc): vectorizer.fit_transform(token_freqs(d) for d in raw_data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration)) -print("Found %d unique terms" % len(vectorizer.get_output_names())) +print("Found %d unique terms" % len(vectorizer.get_feature_names_out())) print() print("FeatureHasher on frequency dicts") diff --git a/sklearn/base.py b/sklearn/base.py index 55a11e3958571..7bf1231371749 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -749,13 +749,13 @@ def fit_predict(self, X, y=None): class OneToOneMixin: - """Provides get_output_names for simple transformers + """Provides get_feature_names_out for simple transformers Assumes there's a 1-to-1 correspondence between input features and output features. """ - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Returns input_features as this transformation diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index a0bd1724b0768..d884c826537d4 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -946,7 +946,7 @@ def fit_predict(self, X, y=None): """ return super().fit_predict(X, y) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. 
Parameters diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index fda06a87b59cd..7863b1c71b64b 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -657,7 +657,7 @@ def _global_clustering(self, X=None): if compute_labels: self.labels_ = self.predict(X) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index f95f769488805..087c4e4ac77da 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1159,7 +1159,7 @@ def score(self, X, y=None, sample_weight=None): return -_labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[1] - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters @@ -1897,7 +1897,7 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) return self._labels_inertia_minibatch(X, sample_weight)[0] - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 073526a579862..4191fe3461a8c 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -351,7 +351,7 @@ def named_transformers_(self): in self.transformers_}) @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_output_names instead") + "in 0.26. You can use get_feature_names_out instead") def get_feature_names(self): """Get feature names from all transformers. 
@@ -385,7 +385,7 @@ def get_feature_names(self): trans.get_feature_names()]) return feature_names - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters @@ -415,13 +415,13 @@ def get_output_names(self, input_features=None): indices = np.arange(self._n_features) feature_names.extend(['x%d' % i for i in indices[column]]) continue - if not hasattr(trans, 'get_output_names'): + if not hasattr(trans, 'get_feature_names_out'): raise AttributeError("Transformer %s (type %s) does not " - "provide get_output_names." + "provide get_feature_names_out." % (str(name), type(trans).__name__)) feature_names.extend( [name + "__" + f - for f in trans.get_output_names(input_features=column)]) + for f in trans.get_feature_names_out(input_features=column)]) return feature_names def _update_fitted_transformers(self, transformers): diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index becfa84749820..236c9fe277ef9 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -665,7 +665,7 @@ def test_column_transformer_cloning(): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_column_transformer_get_feature_names(get_names): X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T ct = ColumnTransformer([('trans', Trans(), [0, 1])]) @@ -737,7 +737,7 @@ def test_column_transformer_get_feature_names(get_names): # TODO: Remove in 0.26 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_column_transformer_get_feature_names_dataframe(get_names): # passthough transformer with a dataframe pd = pytest.importorskip('pandas') @@ -1422,7 +1422,7 @@ def test_output_names_empty_columns(empty_col): ) ct.fit(df) - assert ct.get_output_names() == ['ohe__col1_a', 'ohe__col1_b', + assert ct.get_feature_names_out() == ['ohe__col1_a', 'ohe__col1_b', 'ohe__col2_z'] diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 6379bbf4604f7..77df1e13420ec 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -442,7 +442,7 @@ def _more_tags(self): return {'poor_score': True, 'requires_y': False} - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters @@ -974,7 +974,7 @@ def fit_transform(self, X, y=None): """ return self.fit(X, y).transform(X, y) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/datasets/descr/twenty_newsgroups.rst b/sklearn/datasets/descr/twenty_newsgroups.rst index 40f5d319acc63..eb3e11e1b6cbc 100644 --- a/sklearn/datasets/descr/twenty_newsgroups.rst +++ b/sklearn/datasets/descr/twenty_newsgroups.rst @@ -156,7 +156,7 @@ Let's take a look at what the most informative features are: >>> import numpy as np >>> def show_top10(classifier, vectorizer, categories): - ... feature_names = np.asarray(vectorizer.get_output_names()) + ... feature_names = np.asarray(vectorizer.get_feature_names_out()) ... for i, category in enumerate(categories): ... top10 = np.argsort(classifier.coef_[i])[-10:] ... 
print("%s: %s" % (category, " ".join(feature_names[top10]))) diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index 11d55e90ba153..f23d2c08a87d8 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -159,7 +159,7 @@ def inverse_transform(self, X): else: return np.dot(X, self.components_) + self.mean_ - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 5440a7265b74f..0e22e8afb2008 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1358,7 +1358,7 @@ def fit(self, X, y=None): self.error_ = E return self - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters @@ -1666,7 +1666,7 @@ def partial_fit(self, X, y=None, iter_offset=None): self.iter_offset_ = iter_offset + 1 return self - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index fceeec83ea6c9..6769f4b076e04 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -389,7 +389,7 @@ def _rotate(self, components, n_components=None, tol=1e-6): raise ValueError("'method' must be in %s, not %s" % (implemented, method)) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. 
Parameters diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 0105668ad942b..6acd7923bd7bb 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -623,7 +623,7 @@ def inverse_transform(self, X, copy=True): return X - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index ae1bb43727ffe..4866b063986bb 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -365,7 +365,7 @@ def inverse_transform(self, X): K.flat[::n_samples + 1] += self.alpha return np.dot(K, self.dual_coef_) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index d4d7d984acf68..feab5aaa7a1da 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -840,7 +840,7 @@ def perplexity(self, X, sub_sampling=False): """ return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 3ec2f495927ab..816e4682e43e9 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1378,7 +1378,7 @@ def inverse_transform(self, W): check_is_fitted(self) return np.dot(W, self.components_) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. 
Parameters diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 1498fab271cf1..3bce8e953175e 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -206,7 +206,7 @@ def transform(self, X): return U - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index f8e549c382c61..bd2b89261d7d6 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -237,7 +237,7 @@ def inverse_transform(self, X): def _more_tags(self): return {'preserves_dtype': [np.float64, np.float32]} - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 85e93d3148b1d..9c4a0ffe7ec09 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -657,7 +657,7 @@ def decision_function(self, X): # Only override for the doc return super().decision_function(X) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 4ac0df4e1325d..10d158db0d125 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -370,7 +370,7 @@ def transform(self, X): return Xa @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_output_names instead") + "in 0.26. You can use get_feature_names_out instead") def get_feature_names(self): """Returns a list of feature names, ordered by their indices. 
@@ -379,7 +379,7 @@ def get_feature_names(self): """ return self.feature_names_ - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters @@ -419,11 +419,11 @@ def restrict(self, support, indices=False): >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] >>> X = v.fit_transform(D) >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1]) - >>> v.get_output_names() + >>> v.get_feature_names_out() ['bar', 'baz', 'foo'] >>> v.restrict(support.get_support()) DictVectorizer() - >>> v.get_output_names() + >>> v.get_feature_names_out() ['bar', 'foo'] """ if not indices: diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 2cf6994bade54..29b3d32dd2762 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -46,7 +46,7 @@ def test_dictvectorizer(sparse, dtype, sort, iterable): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_feature_selection(get_names): # make two feature dicts with two useful features and a bunch of useless # ones, in terms of chi2 @@ -67,7 +67,7 @@ def test_feature_selection(get_names): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_one_of_k(get_names): D_in = [{"version": "1", "ham": 2}, {"version": "2", "spam": .3}, @@ -87,7 +87,7 @@ def test_one_of_k(get_names): # TODO: Remove in 0.26 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_iterable_value(get_names): D_names = ['ham', 'spam', 'version=1', 'version=2', 'version=3'] X_expected = [[2.0, 0.0, 2.0, 1.0, 0.0], diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 6ecb6fffb3ffd..1834f4d27acae 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -563,7 +563,7 @@ def test_hashing_vectorizer(): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_feature_names(get_names): cv = CountVectorizer(max_df=0.5) @@ -616,7 +616,7 @@ def test_vectorizer_max_features(Vectorizer): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_count_vectorizer_max_features(get_names): # Regression test: max_features didn't work correctly in 0.14. @@ -692,7 +692,7 @@ def test_vectorizer_min_df(): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_count_binary_occurrences(get_names): # by default multiple occurrences are counted as longs test_data = ['aaabc', 'abbde'] @@ -936,7 +936,7 @@ def test_pickling_built_processors(factory): # TODO: Remove in 0.26 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_countvectorizer_vocab_sets_when_pickling(get_names): # ensure that vocabulary of type set is coerced to a list to # preserve iteration ordering after deserialization @@ -955,7 +955,7 @@ def test_countvectorizer_vocab_sets_when_pickling(get_names): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_countvectorizer_vocab_dicts_when_pickling(get_names): rng = np.random.RandomState(0) vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza', diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 74a3c3c2749a2..5b370f772a2d8 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -962,7 +962,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): ... 
] >>> vectorizer = CountVectorizer() >>> X = vectorizer.fit_transform(corpus) - >>> print(vectorizer.get_output_names()) + >>> print(vectorizer.get_feature_names_out()) ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] >>> print(X.toarray()) [[0 1 1 1 0 0 1 0 1] @@ -971,7 +971,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): [0 1 1 1 0 0 1 0 1]] >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2)) >>> X2 = vectorizer2.fit_transform(corpus) - >>> print(vectorizer2.get_output_names()) + >>> print(vectorizer2.get_feature_names_out()) ['and this', 'document is', 'first document', 'is the', 'is this', 'second document', 'the first', 'the second', 'the third', 'third one', 'this document', 'this is', 'this the'] @@ -1275,7 +1275,7 @@ def inverse_transform(self, X): for i in range(n_samples)] @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_output_names instead") + "in 0.26. You can use get_feature_names_out instead") def get_feature_names(self): """Array mapping from feature integer indices to feature name. @@ -1284,9 +1284,9 @@ def get_feature_names(self): feature_names : list A list of feature names. """ - return self.get_output_names() + return self.get_feature_names_out() - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters @@ -1720,7 +1720,7 @@ class TfidfVectorizer(CountVectorizer): ... 
] >>> vectorizer = TfidfVectorizer() >>> X = vectorizer.fit_transform(corpus) - >>> print(vectorizer.get_output_names()) + >>> print(vectorizer.get_feature_names_out()) ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] >>> print(X.shape) (4, 9) diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index c0bb1dba61f8a..bc8d8c82a4f61 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -126,7 +126,7 @@ def inverse_transform(self, X): Xt[:, support] = X return Xt - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Mask feature names according to selected features. Parameters diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 629237da6cb81..3c2744f176b60 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -494,7 +494,7 @@ def transform(self, X): def _more_tags(self): return {'allow_nan': True} - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters @@ -513,7 +513,7 @@ def get_output_names(self, input_features=None): output = np.array(input_features)[self._valid_mask].tolist() if not self.add_indicator: return output - missing_names = self.indicator_.get_output_names(input_features) + missing_names = self.indicator_.get_feature_names_out(input_features) missing_names = [f'missingindicator__{name}' for name in missing_names] return output + missing_names @@ -877,7 +877,7 @@ def _more_tags(self): "preserves_dtype": [], } - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. 
Parameters diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 21e81b5932804..8e5ea6572c27a 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -748,7 +748,7 @@ def fit(self, X, y=None): self.fit_transform(X) return self - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters @@ -769,7 +769,7 @@ def get_output_names(self, input_features=None): .tolist()) if not self.add_indicator: return output - missing_names = self.indicator_.get_output_names(input_features) + missing_names = self.indicator_.get_feature_names_out(input_features) missing_names = [f'missingindicator__{name}' for name in missing_names] return output + missing_names diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index f5587701d972f..be8686a4d15c7 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -306,7 +306,7 @@ def process_chunk(dist_chunk, start): return super()._concatenate_indicator(X[:, valid_mask], X_indicator) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. 
Parameters @@ -325,7 +325,7 @@ def get_output_names(self, input_features=None): output = np.array(input_features)[self._valid_mask].tolist() if not self.add_indicator: return output - missing_names = self.indicator_.get_output_names(input_features) + missing_names = self.indicator_.get_feature_names_out(input_features) missing_names = [f'missingindicator__{name}' for name in missing_names] return output + missing_names diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index 06bc4938e5a61..00a8afd62d1ec 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -130,7 +130,7 @@ def test_imputers_marker(marker, imputer): feature_names_in = [f'feat{i}' for i in range(X.shape[1])] expected_features_out = ['feat0', 'feat1', 'feat2', 'feat4'] imputer.fit(X) - assert_array_equal(imputer.get_output_names(feature_names_in), + assert_array_equal(imputer.get_feature_names_out(feature_names_in), expected_features_out) imputer.set_params(missing_values=marker, add_indicator=True) @@ -139,5 +139,5 @@ def test_imputers_marker(marker, imputer): [f'missingindicator__feat{i}' for i in [0, 1, 2, 3]]) imputer.fit(X) - assert_array_equal(imputer.get_output_names(feature_names_in), + assert_array_equal(imputer.get_feature_names_out(feature_names_in), expected_features_out) diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index d25abc481ad5b..dc9c0e17e5fa6 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -1480,10 +1480,10 @@ def test_simple_imputation_inverse_transform_exceptions(missing_value): "features, expected_names", [("missing-only", ["feat0", "feat2"]), ("all", ["feat0", "feat1", "feat2"])]) -def test_missing_indicator_get_output_names(features, expected_names): +def test_missing_indicator_get_feature_names_out(features, expected_names): # output names are correct for missing indicator X = np.array([[1, 0, np.nan], [np.nan, 1, 
1]]) indicator = MissingIndicator(features=features).fit(X) - assert_array_equal(indicator.get_output_names(["feat0", "feat1", "feat2"]), + assert_array_equal(indicator.get_feature_names_out(["feat0", "feat1", "feat2"]), expected_names) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 5db7617ff5338..614bea9f9b54f 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -194,7 +194,7 @@ def transform(self, X): return data_sketch - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters @@ -326,7 +326,7 @@ def transform(self, X): projection *= np.sqrt(2.) / np.sqrt(self.n_components) return projection - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters @@ -466,7 +466,7 @@ def transform(self, X): projection *= np.sqrt(2.) / np.sqrt(self.n_components) return projection - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters @@ -668,7 +668,7 @@ def _transform_sparse(self, X): return sp.hstack(X_new) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters @@ -879,7 +879,7 @@ def _get_kernel_params(self): return params - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. 
Parameters diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 34e634cb8d0b3..df517fe2f3753 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -272,7 +272,7 @@ def transform(self, X): return self.kernel_pca_.transform(G_X) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 13cffa3b75289..e57f1db72b0a8 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -735,7 +735,7 @@ def transform(self, X): X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i]) return X_new - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 287a8c7304d48..e553dbf100d42 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -371,7 +371,7 @@ def fit_transform(self, X, y=None): """ return self.fit(X).transform(X) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters @@ -562,7 +562,7 @@ def fit_transform(self, X, y=None): """ return self.fit(X).transform(X) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. 
Parameters diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index e0b73594d59a4..3e6d7f6fe8a00 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -524,7 +524,7 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): return sign * loss, sign * gradient.ravel() - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index e3a4cbc774776..07bcd5427db74 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -387,7 +387,7 @@ def _more_tags(self): } } - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index e4ec39a9326f8..1cc5a5f3baba4 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -625,7 +625,7 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Transform input features using the pipeline. @@ -644,12 +644,12 @@ def get_output_names(self, input_features=None): """ feature_names = input_features for _, name, transform in self._iter(): - if not hasattr(transform, "get_output_names"): + if not hasattr(transform, "get_feature_names_out"): raise TypeError( - "Estimator {} does provide get_output_names. " - "Did you mean to call Pipeline[:-1].get_output_names" + "Estimator {} does provide get_feature_names_out. 
" + "Did you mean to call Pipeline[:-1].get_feature_names_out" "()?".format(name)) - feature_names = transform.get_output_names( + feature_names = transform.get_feature_names_out( input_features=feature_names) return feature_names @@ -933,7 +933,7 @@ def _iter(self): if trans != 'drop') @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_output_names instead") + "in 0.26. You can use get_feature_names_out instead") def get_feature_names(self): """Get feature names from all transformers. @@ -952,7 +952,7 @@ def get_feature_names(self): trans.get_feature_names()]) return feature_names - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters @@ -967,13 +967,13 @@ def get_output_names(self, input_features=None): """ feature_names = [] for name, trans, _ in self._iter(): - if not hasattr(trans, 'get_output_names'): + if not hasattr(trans, 'get_feature_names_out'): raise AttributeError("Transformer %s (type %s) does not " - "provide get_output_names." + "provide get_feature_names_out." % (str(name), type(trans).__name__)) feature_names.extend( [name + "__" + f for f in - trans.get_output_names(input_features)]) + trans.get_feature_names_out(input_features)]) return feature_names def fit(self, X, y=None, **fit_params): diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 5f8ecd06071d9..bc66f3bcb569c 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1619,7 +1619,7 @@ def powers_(self): for c in combinations]) @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_output_names instead") + "in 0.26. 
You can use get_feature_names_out instead") def get_feature_names(self, input_features=None): """ Return feature names for output features @@ -1634,9 +1634,9 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ - return self.get_output_names(input_features) + return self.get_feature_names_out(input_features) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Parameters @@ -2290,7 +2290,7 @@ def transform(self, K, copy=True): return K - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index f0e63e902ceb4..f6138161342cc 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -357,7 +357,7 @@ def inverse_transform(self, Xt): return Xinv - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. Parameters diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b1c692dd1951a..8c95d57d364b5 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -275,7 +275,7 @@ class OneHotEncoder(_BaseEncoder): >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) array([['Male', 1], [None, 2]], dtype=object) - >>> enc.get_output_names(['gender', 'group']) + >>> enc.get_feature_names_out(['gender', 'group']) ['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'] One can always drop the first column for each feature: @@ -559,7 +559,7 @@ def inverse_transform(self, X): return X_tr @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. 
You can use get_output_names instead") + "in 0.26. You can use get_feature_names_out instead") def get_feature_names(self, input_features=None): """ Return feature names for output features. @@ -575,10 +575,10 @@ def get_feature_names(self, input_features=None): output_feature_names : ndarray of shape (n_output_features,) Array of feature names. """ - feature_names = self.get_output_names(input_features) + feature_names = self.get_feature_names_out(input_features) return np.array(feature_names, dtype=object) - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Returns input_features as this transformation diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index ebbdbbcb56569..0ace3fc343d50 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -128,7 +128,7 @@ def test_polynomial_features(): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_polynomial_feature_names(get_names): X = np.arange(30).reshape(10, 3) poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index f1bc6f10a9522..61e12995afbac 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -135,7 +135,7 @@ def test_one_hot_encoder_dtype_pandas(output_dtype): # TODO: Remove in 0.26 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_one_hot_encoder_feature_names(get_names): enc = OneHotEncoder() X = [['Male', 1, 'girl', 2, 3], @@ -172,7 +172,7 @@ def test_one_hot_encoder_feature_names(get_names): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_one_hot_encoder_feature_names_unicode(get_names): enc = OneHotEncoder() X = np.array([['c❤t1', 'dat2']], dtype=object).T @@ -282,7 +282,7 @@ def test_one_hot_encoder_inverse_if_binary(): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) @pytest.mark.parametrize('drop', ['if_binary', 'first', None]) @pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None]) def test_one_hot_encoder_drop_reset(get_names, drop, reset_drop): @@ -420,7 +420,7 @@ def test_one_hot_encoder_pandas(): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) @pytest.mark.parametrize("drop, expected_names", [('first', ['x0_c', 'x2_b']), ('if_binary', ['x0_c', 'x1_2', 'x2_b']), diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 36da47c93b85d..15ab125954ddd 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -417,7 +417,7 @@ def transform(self, X): dense_output=self.dense_output) return X_new - def get_output_names(self, input_features=None): + def get_feature_names_out(self, input_features=None): """Get output feature names. 
Parameters diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index e8a5589042033..fb813938ddbbd 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -38,7 +38,7 @@ _get_check_estimator_ids, check_class_weight_balanced_linear_classifier, parametrize_with_checks, - check_transformer_get_output_names) + check_transformer_get_feature_names_out) def test_all_estimator_no_base_class(): @@ -271,9 +271,9 @@ def test_strict_mode_parametrize_with_checks(estimator, check): check(estimator) -# TODO: As more modules support get_output_names they should be removed from +# TODO: As more modules support get_feature_names_out they should be removed from # this list to be tested -GET_OUTPUT_NAMES_MODULES_TO_IGNORE = [ +get_feature_names_out_MODULES_TO_IGNORE = [ 'cluster', 'compose', 'cross_decomposition', @@ -293,15 +293,15 @@ def test_strict_mode_parametrize_with_checks(estimator, check): 'random_projection' ] -GET_OUTPUT_NAMES_ESTIMATORS = [ +get_feature_names_out_ESTIMATORS = [ est for est in _tested_estimators('transformer') if "2darray" in est._get_tags()["X_types"] and not est._get_tags()["no_validation"] and - est.__module__.split('.')[1] not in GET_OUTPUT_NAMES_MODULES_TO_IGNORE + est.__module__.split('.')[1] not in get_feature_names_out_MODULES_TO_IGNORE ] -@pytest.mark.parametrize("transformer", GET_OUTPUT_NAMES_ESTIMATORS) -def test_transformers_get_output_names(transformer): - check_transformer_get_output_names(type(transformer).__name__, +@pytest.mark.parametrize("transformer", get_feature_names_out_ESTIMATORS) +def test_transformers_get_feature_names_out(transformer): + check_transformer_get_feature_names_out(type(transformer).__name__, transformer) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 83fbc25dbe8c0..4715d90f6ff0e 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -861,7 +861,7 @@ def test_feature_union_parallel(): # TODO: Remove in 0.26 
when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_feature_union_feature_names(get_names): word_vect = CountVectorizer(analyzer="word") char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3)) @@ -895,7 +895,7 @@ def test_classes_property(): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_set_feature_union_steps(get_names): mult2 = Mult(2) mult3 = Mult(3) @@ -905,10 +905,10 @@ def test_set_feature_union_steps(get_names): mult3.get_feature_names = lambda: ['x3'] mult2.get_feature_names = lambda: ['x2'] mult5.get_feature_names = lambda: ['x5'] - else: # get_output_names - mult3.get_output_names = lambda input_features: ['x3'] - mult2.get_output_names = lambda input_features: ['x2'] - mult5.get_output_names = lambda input_features: ['x5'] + else: # get_feature_names_out + mult3.get_feature_names_out = lambda input_features: ['x3'] + mult2.get_feature_names_out = lambda input_features: ['x2'] + mult5.get_feature_names_out = lambda input_features: ['x5'] ft = FeatureUnion([('m2', mult2), ('m3', mult3)]) assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]]))) @@ -933,7 +933,7 @@ def test_set_feature_union_steps(get_names): # TODO: Remove in 0.26 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", - "get_output_names"]) + "get_feature_names_out"]) def test_set_feature_union_step_drop(get_names): mult2 = Mult(2) mult3 = Mult(3) @@ -941,9 +941,9 @@ def test_set_feature_union_step_drop(get_names): if get_names == "get_feature_names": mult2.get_feature_names = lambda: ['x2'] mult3.get_feature_names = lambda: ['x3'] - else: # get_output_names - mult2.get_output_names = lambda input_features: ['x2'] - mult3.get_output_names = lambda input_features: ['x3'] + else: # get_feature_names_out + mult2.get_feature_names_out = lambda input_features: ['x2'] + mult3.get_feature_names_out = lambda input_features: ['x3'] X = np.asarray([[1]]) @@ -1148,20 +1148,20 @@ def test_feature_names_basic(): ('select', SelectKBest(k=2)), ('clf', LogisticRegression())]) with pytest.raises(NotFittedError): - pipe.get_output_names() + pipe.get_feature_names_out() iris = load_iris() pipe.fit(iris.data, iris.target) xs = np.array(['x0', 'x1', 'x2', 'x3']) - assert_array_equal(pipe[:1].get_output_names(), xs) + assert_array_equal(pipe[:1].get_feature_names_out(), xs) mask = pipe.named_steps.select.get_support() - assert_array_equal(pipe[:-1].get_output_names(), xs[mask]) + assert_array_equal(pipe[:-1].get_feature_names_out(), xs[mask]) with pytest.raises( TypeError, - match="Estimator clf does provide get_output_names."): - pipe.get_output_names(iris.feature_names) - assert_array_equal(pipe[:1].get_output_names(iris.feature_names), + match="Estimator clf does provide get_feature_names_out."): + pipe.get_feature_names_out(iris.feature_names) + assert_array_equal(pipe[:1].get_feature_names_out(iris.feature_names), iris.feature_names) - assert_array_equal(pipe[:-1].get_output_names(iris.feature_names), + assert_array_equal(pipe[:-1].get_feature_names_out(iris.feature_names), np.array(iris.feature_names)[mask]) pipe = Pipeline(steps=[ ('scaler', StandardScaler()), @@ -1169,9 +1169,9 @@ 
def test_feature_names_basic(): ('select', SelectKBest(k=2)), ('clf', LogisticRegression())]) pipe.fit(iris.data, iris.target) - assert_array_equal(pipe[:-1].get_output_names(), ['pca0', 'pca1']) + assert_array_equal(pipe[:-1].get_feature_names_out(), ['pca0', 'pca1']) # setting names doesn't change names after PCA - assert_array_equal(pipe[:-2].get_output_names(iris.feature_names), + assert_array_equal(pipe[:-2].get_feature_names_out(iris.feature_names), ['pca0', 'pca1', 'pca2']) @@ -1187,7 +1187,7 @@ def test_input_feature_names_pandas(): pipe.fit(df, iris.target) mask = pipe.named_steps.select.get_support() # for now assuming we have to pass these explicitly - assert_array_equal(pipe[:-1].get_output_names(iris.feature_names), + assert_array_equal(pipe[:-1].get_feature_names_out(iris.feature_names), np.array(iris.feature_names)[mask]) @@ -1200,8 +1200,8 @@ def test_features_names_passthrough(): iris = load_iris() pipe.fit(iris.data, iris.target) xs = ['x0', 'x1', 'x2', 'x3'] - assert_array_equal(pipe[:-1].get_output_names(), xs) - assert_array_equal(pipe[:-1].get_output_names(iris.feature_names), + assert_array_equal(pipe[:-1].get_feature_names_out(), xs) + assert_array_equal(pipe[:-1].get_feature_names_out(iris.feature_names), iris.feature_names) @@ -1211,9 +1211,9 @@ def test_feature_names_count_vectorizer(): ('clf', LogisticRegression())]) y = ["pizza" in x for x in JUNK_FOOD_DOCS] pipe.fit(JUNK_FOOD_DOCS, y) - assert_array_equal(pipe[:-1].get_output_names(), + assert_array_equal(pipe[:-1].get_feature_names_out(), ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) - assert_array_equal(pipe[:-1].get_output_names("nonsense_is_ignored"), + assert_array_equal(pipe[:-1].get_feature_names_out("nonsense_is_ignored"), ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) @@ -1226,9 +1226,9 @@ def test_feature_names_nested(): xs = np.array(['x0', 'x1', 'x2', 'x3']) mask = pipe.named_steps.inner_pipe.named_steps.select.get_support() assert_array_equal( - 
pipe.named_steps.inner_pipe[:1].get_output_names(), xs[mask]) + pipe.named_steps.inner_pipe[:1].get_feature_names_out(), xs[mask]) assert_array_equal( - pipe.named_steps.inner_pipe[:1].get_output_names(iris.feature_names), + pipe.named_steps.inner_pipe[:1].get_feature_names_out(iris.feature_names), np.array(iris.feature_names)[mask]) @@ -1242,8 +1242,8 @@ def test_feature_names_meta_pipe(): # check 0ths estimator in OVR only inner_pipe = pipe['ovr'].estimators_[0] mask = inner_pipe['select'].get_support() - assert_array_equal(inner_pipe[:-1].get_output_names(), xs[mask]) - assert_array_equal(inner_pipe[:-1].get_output_names(iris.feature_names), + assert_array_equal(inner_pipe[:-1].get_feature_names_out(), xs[mask]) + assert_array_equal(inner_pipe[:-1].get_feature_names_out(iris.feature_names), np.array(iris.feature_names)[mask]) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 222554d4053dd..77f9fbc66a4ba 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3144,8 +3144,8 @@ def check_requires_y_none(name, estimator_orig, strict_mode=True): ]) -def check_transformer_get_output_names(name, transformer_orig, - strict_mode=True): +def check_transformer_get_feature_names_out(name, transformer_orig, + strict_mode=True): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X = StandardScaler().fit_transform(X) @@ -3165,7 +3165,7 @@ def check_transformer_get_output_names(name, transformer_orig, X_pred = transformer.fit_transform(X, y=y_) input_features = ['feature%d' % i for i in range(n_features)] - feature_names = transformer.get_output_names(input_features) + feature_names = transformer.get_feature_names_out(input_features) assert feature_names is not None if isinstance(X_pred, tuple): assert len(feature_names) == X_pred[0].shape[1], ( From 0d3bc4efdf6c0d016aa3380eb3e844de8ee1ab36 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 30 Sep 2020 11:15:42 -0400 Subject: [PATCH 041/100] CLN Reduces the number of diffs --- ...linear_model_coefficient_interpretation.py | 7 +- sklearn/cluster/_agglomerative.py | 17 ---- sklearn/cluster/_birch.py | 18 ----- sklearn/cluster/_kmeans.py | 35 -------- sklearn/cross_decomposition/_pls.py | 33 -------- sklearn/decomposition/_base.py | 17 ---- sklearn/decomposition/_factor_analysis.py | 17 ---- sklearn/decomposition/_fastica.py | 17 ---- sklearn/decomposition/_kernel_pca.py | 17 ---- sklearn/decomposition/_lda.py | 17 ---- sklearn/decomposition/_nmf.py | 17 ---- sklearn/decomposition/_sparse_pca.py | 17 ---- sklearn/decomposition/_truncated_svd.py | 17 ---- sklearn/discriminant_analysis.py | 19 ----- sklearn/feature_selection/_base.py | 20 ----- sklearn/impute/_base.py | 42 ---------- sklearn/impute/_iterative.py | 27 ------- sklearn/impute/_knn.py | 25 ------ sklearn/kernel_approximation.py | 81 ------------------- sklearn/manifold/_isomap.py | 18 ----- sklearn/manifold/_locally_linear.py | 17 ---- sklearn/neighbors/_graph.py | 35 -------- sklearn/neighbors/_nca.py | 17 ---- sklearn/neural_network/_rbm.py | 17 ---- sklearn/random_projection.py | 17 ---- sklearn/tests/test_common.py | 6 +- 26 files changed, 5 insertions(+), 582 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index d859d48ddc8a9..0c86ab7d9ec72 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -206,9 +206,10 @@ # First of all, we can take a look to the values of the coefficients of the # regressor we have fitted. 
-feature_names = (model.named_steps['columntransformer'] - .named_transformers_['onehotencoder'] - .get_feature_names_out(input_features=categorical_columns)) +feature_names = (model + .named_steps['columntransformer'] + .named_transformers_['onehotencoder'] + .get_feature_names_out(input_features=categorical_columns)) feature_names = np.concatenate( [feature_names, numerical_columns]) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index d884c826537d4..66342797e33b5 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -20,7 +20,6 @@ from ..neighbors._dist_metrics import METRIC_MAPPING from ..utils import check_array from ..utils._fast_dict import IntFloatDict -from ..utils._feature_names import _make_feature_names from ..utils.fixes import _astype_copy_false from ..utils.validation import _deprecate_positional_args, check_memory # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' @@ -946,22 +945,6 @@ def fit_predict(self, X, y=None): """ return super().fit_predict(X, y) - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.n_clusters, - prefix=type(self).__name__.lower()) - class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): """Agglomerate features. 
diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 7863b1c71b64b..f90c47953f9e9 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -15,7 +15,6 @@ from ..utils import check_array from ..utils.extmath import row_norms from ..utils.validation import check_is_fitted, _deprecate_positional_args -from ..utils._feature_names import _make_feature_names from ..exceptions import ConvergenceWarning from . import AgglomerativeClustering @@ -656,20 +655,3 @@ def _global_clustering(self, X=None): if compute_labels: self.labels_ = self.predict(X) - - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names( - n_features=self.subcluster_centers_.shape[0], - prefix=type(self).__name__.lower()) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 087c4e4ac77da..69901236d73b8 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -29,7 +29,6 @@ from ..utils import deprecated from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils._feature_names import _make_feature_names from ..exceptions import ConvergenceWarning from ._k_means_fast import _inertia_dense from ._k_means_fast import _inertia_sparse @@ -1159,23 +1158,6 @@ def score(self, X, y=None, sample_weight=None): return -_labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[1] - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. 
- - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names( - n_features=self.n_clusters, - prefix=type(self).__name__.lower()) - def _more_tags(self): return { '_xfail_checks': { @@ -1897,23 +1879,6 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) return self._labels_inertia_minibatch(X, sample_weight)[0] - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names( - n_features=self.n_clusters, - prefix=type(self).__name__.lower()) - def _more_tags(self): return { '_xfail_checks': { diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 77df1e13420ec..9d8df42bf1a46 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -20,7 +20,6 @@ from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning from ..utils.deprecation import deprecated -from ..utils._feature_names import _make_feature_names __all__ = ['PLSCanonical', 'PLSRegression', 'PLSSVD'] @@ -442,22 +441,6 @@ def _more_tags(self): return {'poor_score': True, 'requires_y': False} - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. 
- """ - return _make_feature_names(n_features=self.n_components, - prefix=type(self).__name__.lower()) - class PLSRegression(_PLS): """PLS regression @@ -973,19 +956,3 @@ def fit_transform(self, X, y=None): `(X_transformed, Y_transformed)` otherwise. """ return self.fit(X, y).transform(X, y) - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.n_components, - prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index f23d2c08a87d8..e89a05051404b 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -14,7 +14,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.validation import check_is_fitted -from ..utils._feature_names import _make_feature_names from abc import ABCMeta, abstractmethod @@ -158,19 +157,3 @@ def inverse_transform(self, X): self.components_) + self.mean_ else: return np.dot(X, self.components_) + self.mean_ - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. 
- """ - return _make_feature_names(n_features=self.components_.shape[0], - prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 6769f4b076e04..76fecbe31598e 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -29,7 +29,6 @@ from ..utils import check_array, check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm from ..utils.validation import check_is_fitted, _deprecate_positional_args -from ..utils._feature_names import _make_feature_names from ..exceptions import ConvergenceWarning @@ -389,22 +388,6 @@ def _rotate(self, components, n_components=None, tol=1e-6): raise ValueError("'method' must be in %s, not %s" % (implemented, method)) - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. 
- """ - return _make_feature_names(n_features=self.components_.shape[0], - prefix=type(self).__name__.lower()) - def _ortho_rotation(components, method='varimax', tol=1e-6, max_iter=100): """Return rotated components.""" diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 6acd7923bd7bb..566e345ba3fbd 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -21,7 +21,6 @@ from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _deprecate_positional_args -from ..utils._feature_names import _make_feature_names __all__ = ['fastica', 'FastICA'] @@ -622,19 +621,3 @@ def inverse_transform(self, X, copy=True): X += self.mean_ return X - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.components_.shape[0], - prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 4866b063986bb..63fcfda41ba7d 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -15,7 +15,6 @@ from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels from ..utils.validation import _deprecate_positional_args -from ..utils._feature_names import _make_feature_names class KernelPCA(TransformerMixin, BaseEstimator): @@ -365,21 +364,5 @@ def inverse_transform(self, X): K.flat[::n_samples + 1] += self.alpha return np.dot(K, self.dual_coef_) - def get_feature_names_out(self, input_features=None): - """Get output feature names. 
- - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.lambdas_.shape[0], - prefix=type(self).__name__.lower()) - def _more_tags(self): return {'preserves_dtype': [np.float64, np.float32]} diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index feab5aaa7a1da..14dd87b9db130 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -22,7 +22,6 @@ from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed -from ..utils._feature_names import _make_feature_names from ._online_lda_fast import (mean_change, _dirichlet_expectation_1d, _dirichlet_expectation_2d) @@ -839,19 +838,3 @@ def perplexity(self, X, sub_sampling=False): Perplexity score. """ return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. 
- """ - return _make_feature_names(n_features=self.n_components, - prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 816e4682e43e9..4eaf9c29e5703 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -17,7 +17,6 @@ from ..base import BaseEstimator, TransformerMixin from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array -from ..utils._feature_names import _make_feature_names from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm from ..utils.validation import check_is_fitted, check_non_negative from ..utils.validation import _deprecate_positional_args @@ -1377,19 +1376,3 @@ def inverse_transform(self, W): """ check_is_fitted(self) return np.dot(W, self.components_) - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. 
- """ - return _make_feature_names(n_features=self.n_components_, - prefix=type(self).__name__.lower()) diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 3bce8e953175e..b850182c24200 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -7,7 +7,6 @@ from ..utils import check_random_state, check_array from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args -from ..utils._feature_names import _make_feature_names from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin from ._dict_learning import dict_learning, dict_learning_online @@ -206,22 +205,6 @@ def transform(self, X): return U - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. 
- """ - return _make_feature_names(n_features=self.n_components_, - prefix=type(self).__name__.lower()) - class MiniBatchSparsePCA(SparsePCA): """Mini-batch Sparse Principal Components Analysis diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index bd2b89261d7d6..558dcbc69b38b 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -12,7 +12,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, check_random_state -from ..utils._feature_names import _make_feature_names from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import _deprecate_positional_args @@ -236,19 +235,3 @@ def inverse_transform(self, X): def _more_tags(self): return {'preserves_dtype': [np.float64, np.float32]} - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. 
- """ - return _make_feature_names(n_features=self.components_.shape[0], - prefix=type(self).__name__.lower()) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 9c4a0ffe7ec09..1e82578e2693b 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -24,7 +24,6 @@ from .utils.extmath import softmax from .preprocessing import StandardScaler from .utils.validation import _deprecate_positional_args -from .utils._feature_names import _make_feature_names __all__ = ['LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis'] @@ -657,24 +656,6 @@ def decision_function(self, X): # Only override for the doc return super().decision_function(X) - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - n_components = self.n_components or np.inf - n_features = min(self._max_components, n_components) - return _make_feature_names(n_features=n_features, - prefix=type(self).__name__.lower()) - class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): """Quadratic Discriminant Analysis diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index bc8d8c82a4f61..a5d752cb3f4b6 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -15,7 +15,6 @@ from ..utils import check_array from ..utils import safe_mask from ..utils import safe_sqr -from ..utils._feature_names import _make_feature_names class SelectorMixin(TransformerMixin, metaclass=ABCMeta): @@ -126,25 +125,6 @@ def inverse_transform(self, X): Xt[:, support] = X return Xt - def get_feature_names_out(self, input_features=None): - """Mask feature names according to selected features. 
- - Parameters - ---------- - input_features : list of str or None, default=None - Input features to select from. If None, they are generated as - x0, x1, ..., xn. - - Returns - ------- - output_feature_names : ndarray of str - Feature names for transformer output. - """ - mask = self.get_support() - input_features = _make_feature_names(mask.shape[0], - input_features=input_features) - return np.array(input_features)[mask] - def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1): diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 3c2744f176b60..610d5e93a6b03 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -16,7 +16,6 @@ from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask -from ..utils._feature_names import _make_feature_names from ..utils import is_scalar_nan @@ -494,30 +493,6 @@ def transform(self, X): def _more_tags(self): return {'allow_nan': True} - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str - Input feature names. - - Returns - ------- - feature_names : list of str - Transformed feature names. - """ - check_is_fitted(self) - input_features = _make_feature_names(self.statistics_.shape[0], - input_features=input_features) - output = np.array(input_features)[self._valid_mask].tolist() - if not self.add_indicator: - return output - missing_names = self.indicator_.get_feature_names_out(input_features) - missing_names = [f'missingindicator__{name}' for name in - missing_names] - return output + missing_names - def inverse_transform(self, X): """Convert the data back to the original representation. 
@@ -876,20 +851,3 @@ def _more_tags(self): "X_types": ["2darray", "string"], "preserves_dtype": [], } - - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - names = _make_feature_names(n_features=self._n_features, - input_features=input_features) - return [names[i] for i in self.features_] diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 8e5ea6572c27a..f27e5f2b05f8d 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -13,7 +13,6 @@ is_scalar_nan) from ..utils.validation import FLOAT_DTYPES, check_is_fitted from ..utils._mask import _get_mask -from ..utils._feature_names import _make_feature_names from ._base import _BaseImputer from ._base import SimpleImputer @@ -747,29 +746,3 @@ def fit(self, X, y=None): """ self.fit_transform(X) return self - - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str - Input feature names. - - Returns - ------- - feature_names : list of str - Transformed feature names. 
- """ - check_is_fitted(self) - input_features = _make_feature_names( - self.initial_imputer_.statistics_.shape[0], - input_features=input_features) - output = (np.array(input_features)[self.initial_imputer_._valid_mask] - .tolist()) - if not self.add_indicator: - return output - missing_names = self.indicator_.get_feature_names_out(input_features) - missing_names = [f'missingindicator__{name}' for name in - missing_names] - return output + missing_names diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index be8686a4d15c7..681935c4f6984 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -13,7 +13,6 @@ from ..utils import check_array from ..utils import is_scalar_nan from ..utils._mask import _get_mask -from ..utils._feature_names import _make_feature_names from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args @@ -305,27 +304,3 @@ def process_chunk(dist_chunk, start): pass return super()._concatenate_indicator(X[:, valid_mask], X_indicator) - - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str - Input feature names. - - Returns - ------- - feature_names : ndarray of str - Transformed feature names. 
- """ - check_is_fitted(self) - input_features = _make_feature_names(self._valid_mask.shape[0], - input_features=input_features) - output = np.array(input_features)[self._valid_mask].tolist() - if not self.add_indicator: - return output - missing_names = self.indicator_.get_feature_names_out(input_features) - missing_names = [f'missingindicator__{name}' for name in - missing_names] - return output + missing_names diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 614bea9f9b54f..9c666272e2f5e 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -21,7 +21,6 @@ from .base import BaseEstimator from .base import TransformerMixin from .utils import check_array, check_random_state, as_float_array -from .utils._feature_names import _make_feature_names from .utils.extmath import safe_sparse_dot from .utils.validation import check_is_fitted from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS @@ -194,22 +193,6 @@ def transform(self, X): return data_sketch - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.n_components, - prefix=type(self).__name__.lower()) - class RBFSampler(TransformerMixin, BaseEstimator): """Approximates feature map of an RBF kernel by Monte Carlo approximation @@ -326,22 +309,6 @@ def transform(self, X): projection *= np.sqrt(2.) / np.sqrt(self.n_components) return projection - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. 
- - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.n_components, - prefix=type(self).__name__.lower()) - class SkewedChi2Sampler(TransformerMixin, BaseEstimator): """Approximates feature map of the "skewed chi-squared" kernel by Monte @@ -466,21 +433,6 @@ def transform(self, X): projection *= np.sqrt(2.) / np.sqrt(self.n_components) return projection - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.n_components, - prefix=type(self).__name__.lower()) class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): """Approximate feature map for additive chi2 kernel. @@ -668,23 +620,6 @@ def _transform_sparse(self, X): return sp.hstack(X_new) - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - n_features = self.n_features_in_ * (2 * self.sample_steps - 1) - return _make_feature_names(n_features=n_features, - prefix=type(self).__name__.lower()) - def _more_tags(self): return {'stateless': True, 'requires_positive_X': True} @@ -879,22 +814,6 @@ def _get_kernel_params(self): return params - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. 
- - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.components_.shape[0], - prefix=type(self).__name__.lower()) - def _more_tags(self): return { '_xfail_checks': { diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index df517fe2f3753..d843c3ddd8462 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -9,7 +9,6 @@ from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args from ..utils.graph import graph_shortest_path -from ..utils._feature_names import _make_feature_names from ..decomposition import KernelPCA from ..preprocessing import KernelCenterer @@ -271,20 +270,3 @@ def transform(self, X): G_X *= -0.5 return self.kernel_pca_.transform(G_X) - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. 
- """ - return _make_feature_names( - n_features=self.kernel_pca_.lambdas_.shape[0], - prefix=type(self).__name__.lower()) diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index e57f1db72b0a8..d9c02416bbb68 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -15,7 +15,6 @@ from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _deprecate_positional_args -from ..utils._feature_names import _make_feature_names from ..neighbors import NearestNeighbors @@ -734,19 +733,3 @@ def transform(self, X): for i in range(X.shape[0]): X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i]) return X_new - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.n_components, - prefix=type(self).__name__.lower()) diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index e553dbf100d42..538df74ece1e4 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -9,7 +9,6 @@ from ._unsupervised import NearestNeighbors from ..base import TransformerMixin from ..utils.validation import check_is_fitted, _deprecate_positional_args -from ..utils._feature_names import _make_feature_names def _check_params(X, metric, p, metric_params): @@ -371,23 +370,6 @@ def fit_transform(self, X, y=None): """ return self.fit(X).transform(X) - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. 
- - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names( - n_features=self.n_samples_fit_, - prefix=type(self).__name__.lower()) - class RadiusNeighborsTransformer(RadiusNeighborsMixin, TransformerMixin, @@ -561,20 +543,3 @@ def fit_transform(self, X, y=None): The matrix is of CSR format. """ return self.fit(X).transform(X) - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names( - n_features=self.n_samples_fit_, - prefix=type(self).__name__.lower()) diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 3e6d7f6fe8a00..8920b2d99ed02 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -20,7 +20,6 @@ from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder from ..decomposition import PCA -from ..utils._feature_names import _make_feature_names from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state from ..utils.validation import check_is_fitted, check_array, check_scalar @@ -524,21 +523,5 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): return sign * loss, sign * gradient.ravel() - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. 
- """ - return _make_feature_names(n_features=self.components_.shape[0], - prefix=type(self).__name__.lower()) - def _more_tags(self): return {'requires_y': True} diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 07bcd5427db74..d1028911f4185 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -21,7 +21,6 @@ from ..utils.extmath import safe_sparse_dot from ..utils.extmath import log_logistic from ..utils.validation import check_is_fitted, _deprecate_positional_args -from ..utils._feature_names import _make_feature_names class BernoulliRBM(TransformerMixin, BaseEstimator): @@ -386,19 +385,3 @@ def _more_tags(self): 'fails for the decision_function method' } } - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.components_.shape[0], - prefix=type(self).__name__.lower()) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 15ab125954ddd..4623ac1ab64e4 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -36,7 +36,6 @@ from .base import BaseEstimator, TransformerMixin from .utils import check_random_state -from .utils._feature_names import _make_feature_names from .utils.extmath import safe_sparse_dot from .utils.random import sample_without_replacement from .utils.validation import check_array, check_is_fitted @@ -417,22 +416,6 @@ def transform(self, X): dense_output=self.dense_output) return X_new - def get_feature_names_out(self, input_features=None): - """Get output feature names. 
- - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.n_components_, - prefix=type(self).__name__.lower()) - class GaussianRandomProjection(BaseRandomProjection): """Reduce dimensionality through Gaussian random projection. diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index fb813938ddbbd..67a558c695442 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -275,12 +275,10 @@ def test_strict_mode_parametrize_with_checks(estimator, check): # this list to be tested get_feature_names_out_MODULES_TO_IGNORE = [ 'cluster', - 'compose', 'cross_decomposition', 'decomposition', 'discriminant_analysis', 'ensemble', - 'feature_extraction', 'feature_selection', 'impute', 'isotonic', @@ -288,8 +286,6 @@ def test_strict_mode_parametrize_with_checks(estimator, check): 'manifold', 'neighbors', 'neural_network', - 'pipeline', - 'preprocessing', 'random_projection' ] @@ -304,4 +300,4 @@ def test_strict_mode_parametrize_with_checks(estimator, check): @pytest.mark.parametrize("transformer", get_feature_names_out_ESTIMATORS) def test_transformers_get_feature_names_out(transformer): check_transformer_get_feature_names_out(type(transformer).__name__, - transformer) + transformer) From 21cbfe646df55a66044c01a173dd44c6204d50e1 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 30 Sep 2020 13:15:13 -0400 Subject: [PATCH 042/100] CLN Reduces the number of diffs --- doc/modules/feature_extraction.rst | 6 ++-- .../plot_feature_selection_pipeline.py | 9 ++--- .../compose/tests/test_column_transformer.py | 2 +- sklearn/datasets/descr/twenty_newsgroups.rst | 2 +- sklearn/decomposition/_dict_learning.py | 33 ------------------- sklearn/feature_extraction/text.py | 1 + sklearn/impute/_base.py | 5 --- sklearn/impute/_knn.py | 3 +- sklearn/impute/tests/test_common.py | 28 ---------------- sklearn/impute/tests/test_impute.py | 13 -------- sklearn/preprocessing/_encoders.py | 2 +- sklearn/tests/test_pipeline.py | 4 +-- 12 files changed, 13 insertions(+), 95 deletions(-) diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index 0db3f9f3cdc18..9307a5fb17f86 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -1,4 +1,4 @@ -.. _feature_extraction: +.. _feature_extraction: ================== Feature extraction @@ -406,8 +406,8 @@ however, similar words are useful for prediction, such as in classifying writing style or personality. There are several known issues in our provided 'english' stop word list. It -does not aim to be a general, 'one-size-fits-all' solution as some tasks -may require a more custom solution. See [NQY18]_ for more details. +does not aim to be a general, 'one-size-fits-all' solution as some tasks +may require a more custom solution. See [NQY18]_ for more details. Please take care in choosing a stop word list. 
Popular stop word lists may include words that are highly informative to diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index 44cdb1365ed72..d34375c4c2aa9 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -9,7 +9,6 @@ Using a sub-pipeline, the fitted coefficients can be mapped back into the original feature space. """ -import matplotlib.pyplot as plt from sklearn import svm from sklearn.datasets import make_classification from sklearn.feature_selection import SelectKBest, f_classif @@ -21,7 +20,7 @@ # import some data to play with X, y = make_classification( - n_features=20, n_informative=3, n_redundant=0, n_classes=2, + n_features=20, n_informative=3, n_redundant=0, n_classes=4, n_clusters_per_class=2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) @@ -37,7 +36,5 @@ y_pred = anova_svm.predict(X_test) print(classification_report(y_test, y_pred)) -# access and plot the coefficients of the fitted model -plt.barh((0, 1, 2), anova_svm[-1].coef_.ravel()) -plt.yticks((0, 1, 2), anova_svm[:-1].get_feature_names_out()) -plt.show() +coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_) +print(coef) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 7ac802375a397..f9a7ebd9bc2eb 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1423,7 +1423,7 @@ def test_output_names_empty_columns(empty_col): ct.fit(df) assert ct.get_feature_names_out() == ['ohe__col1_a', 'ohe__col1_b', - 'ohe__col2_z'] + 'ohe__col2_z'] @pytest.mark.parametrize('remainder', ["passthrough", StandardScaler()]) diff --git a/sklearn/datasets/descr/twenty_newsgroups.rst b/sklearn/datasets/descr/twenty_newsgroups.rst index eb3e11e1b6cbc..3814fb97d1e42 
100644 --- a/sklearn/datasets/descr/twenty_newsgroups.rst +++ b/sklearn/datasets/descr/twenty_newsgroups.rst @@ -116,7 +116,7 @@ components by sample in a more than 30000-dimensional space >>> vectors.nnz / float(vectors.shape[0]) 159.01327... -:func:`sklearn.datasets.fetch_20newsgroups_vectorized` is a function which +:func:`sklearn.datasets.fetch_20newsgroups_vectorized` is a function which returns ready-to-use token counts features instead of file names. .. _`20 newsgroups website`: http://people.csail.mit.edu/jrennie/20Newsgroups/ diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 0e22e8afb2008..71cbfde40d1c6 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -20,7 +20,6 @@ from ..utils.extmath import randomized_svd, row_norms from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..utils.fixes import delayed -from ..utils._feature_names import _make_feature_names from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars @@ -1358,22 +1357,6 @@ def fit(self, X, y=None): self.error_ = E return self - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.components_.shape[0], - prefix=type(self).__name__.lower()) - class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): """Mini-batch dictionary learning @@ -1665,19 +1648,3 @@ def partial_fit(self, X, y=None, iter_offset=None): self.inner_stats_ = (A, B) self.iter_offset_ = iter_offset + 1 return self - - def get_feature_names_out(self, input_features=None): - """Get output feature names. 
- - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - output_feature_names : list of str - Feature names for transformer output. - """ - return _make_feature_names(n_features=self.components_.shape[0], - prefix=type(self).__name__.lower()) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 5b370f772a2d8..52c5586ea9cdc 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1300,6 +1300,7 @@ def get_feature_names_out(self, input_features=None): Feature names for transformer output. """ self._check_vocabulary() + return [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))] diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index d46aa2f5798b3..20b22224d53c7 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -320,8 +320,6 @@ def fit(self, X, y=None): self.missing_values, fill_value) - invalid_mask = _get_mask(self.statistics_, np.nan) - self._valid_mask = np.logical_not(invalid_mask) return self def _sparse_fit(self, X, strategy, missing_values, fill_value): @@ -490,9 +488,6 @@ def transform(self, X): return super()._concatenate_indicator(X, X_indicator) - def _more_tags(self): - return {'allow_nan': True} - def inverse_transform(self, X): """Convert the data back to the original representation. 
diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 681935c4f6984..df66e4a20aff6 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -188,7 +188,6 @@ def fit(self, X, y=None): _check_weights(self.weights) self._fit_X = X self._mask_fit_X = _get_mask(self._fit_X, self.missing_values) - self._valid_mask = ~np.all(self._mask_fit_X, axis=0) super()._fit_indicator(self._mask_fit_X) @@ -223,7 +222,7 @@ def transform(self, X): mask = _get_mask(X, self.missing_values) mask_fit_X = self._mask_fit_X - valid_mask = self._valid_mask + valid_mask = ~np.all(mask_fit_X, axis=0) X_indicator = super()._transform_indicator(mask) diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index 00a8afd62d1ec..220a335c15285 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -113,31 +113,3 @@ def test_imputers_pandas_na_integer_array_support(imputer, add_indicator): X_trans = imputer.fit_transform(X_df) assert_allclose(X_trans_expected, X_trans) - - -# ConvergenceWarning will be raised by the IterativeImputer -@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") -@pytest.mark.parametrize("marker", [np.nan, -1, 0]) -@pytest.mark.parametrize("imputer", IMPUTERS) -def test_imputers_marker(marker, imputer): - X = np.array([ - [marker, 1, 5, marker, 1], - [2, marker, 1, marker, 2], - [6, 3, marker, marker, 3], - [1, 2, 9, marker, 4] - ]) - imputer.set_params(missing_values=marker, add_indicator=False) - feature_names_in = [f'feat{i}' for i in range(X.shape[1])] - expected_features_out = ['feat0', 'feat1', 'feat2', 'feat4'] - imputer.fit(X) - assert_array_equal(imputer.get_feature_names_out(feature_names_in), - expected_features_out) - - imputer.set_params(missing_values=marker, add_indicator=True) - expected_features_out = ( - ['feat0', 'feat1', 'feat2', 'feat4'] + - [f'missingindicator__feat{i}' for i in [0, 1, 2, 3]]) - - imputer.fit(X) - 
assert_array_equal(imputer.get_feature_names_out(feature_names_in), - expected_features_out) diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index dc9c0e17e5fa6..4c0918b9a3230 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -1474,16 +1474,3 @@ def test_simple_imputation_inverse_transform_exceptions(missing_value): with pytest.raises(ValueError, match=f"Got 'add_indicator={imputer.add_indicator}'"): imputer.inverse_transform(X_1_trans) - - -@pytest.mark.parametrize( - "features, expected_names", - [("missing-only", ["feat0", "feat2"]), - ("all", ["feat0", "feat1", "feat2"])]) -def test_missing_indicator_get_feature_names_out(features, expected_names): - # output names are correct for missing indicator - X = np.array([[1, 0, np.nan], - [np.nan, 1, 1]]) - indicator = MissingIndicator(features=features).fit(X) - assert_array_equal(indicator.get_feature_names_out(["feat0", "feat1", "feat2"]), - expected_names) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8c95d57d364b5..12cd22b38d23c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -592,7 +592,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names : array-like of str + feature_names : list of str Transformed feature names. 
""" check_is_fitted(self) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 4715d90f6ff0e..0fe70b1c8fb36 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -176,8 +176,8 @@ def test_pipeline_init(): clf = NoTrans() pipe = Pipeline([('svc', clf)]) assert (pipe.get_params(deep=True) == - dict(svc__a=None, svc__b=None, svc=clf, - **pipe.get_params(deep=False))) + dict(svc__a=None, svc__b=None, svc=clf, + **pipe.get_params(deep=False))) # Check that params are set pipe.set_params(svc__a=0.1) From 86887ae7afad9494b82c6f21af058b0070e48b06 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 30 Sep 2020 16:37:37 -0400 Subject: [PATCH 043/100] CLN Less diffs --- doc/modules/compose.rst | 4 +- .../plot_column_transformer_mixed_types.py | 6 +- sklearn/compose/_column_transformer.py | 71 +++++++++---------- .../compose/tests/test_column_transformer.py | 29 ++------ 4 files changed, 43 insertions(+), 67 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index efedba2376b06..b87bccf739f5f 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -139,7 +139,7 @@ or by name:: >>> pipe['reduce_dim'] PCA() -To enable model inspection, `Pipeline` has an ``get_feature_names_out()`` method, +To enable model inspection, `Pipeline` has a ``get_feature_names_out()`` method, just like all transformers. 
You can use pipeline slicing to get the feature names going into each step:: @@ -154,7 +154,7 @@ going into each step:: >>> pipe[:-1].get_feature_names_out() array(['x2', 'x3'], dtype='>> pipe[:-1].get_feature_names_out(iris.feature_names) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 56f57bd830440..0403fb1642c15 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -149,12 +149,12 @@ # %% -# Inspecting the coefficients values of the classifier +# Inspecting the coefficients of the classifier ############################################################################### # The coefficients of the final classification step of the pipeline gives an # idea how each feature impacts the likelihood of survival assuming that the # usual linear model assumptions hold (uncorrelated features, linear -# separability, homoschedastic errors...) which we do not verify in this +# separability, homoscedastic errors...) which we do not verify in this # example. # # To get error bars we perform cross-validation and compute the mean and @@ -170,7 +170,7 @@ # were the first to reach the lifeboats, and often, priority was given to women # and children. # -# Note that conditionned on the "pclass_x" one-hot features, the "fare" +# Note that conditioned on the "pclass_x" one-hot features, the "fare" # numerical feature does not seem to be significantly predictive. 
If we drop # the "pclass" feature, then higher "fare" values would appear significantly # correlated with a higher likelihood of survival as the "fare" and "pclass" diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 4191fe3461a8c..3ecffcf619b91 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -350,15 +350,10 @@ def named_transformers_(self): return Bunch(**{name: trans for name, trans, _ in self.transformers_}) - @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_feature_names_out instead") - def get_feature_names(self): - """Get feature names from all transformers. - - Returns - ------- - feature_names : list of strings - Names of the features produced by transform. + def _get_feature_names_out(self, get_names): + """Private function to be used by get_feature_names_out and + get_feature_names. This should be removed and integrated into + get_feature_names_out when get_feature_names is deprecated. """ check_is_fitted(self) feature_names = [] @@ -377,14 +372,27 @@ def get_feature_names(self): indices = np.arange(self._n_features) feature_names.extend(['x%d' % i for i in indices[column]]) continue - if not hasattr(trans, 'get_feature_names'): - raise AttributeError("Transformer %s (type %s) does not " - "provide get_feature_names." - % (str(name), type(trans).__name__)) - feature_names.extend([name + "__" + f for f in - trans.get_feature_names()]) + feature_names.extend(get_names(name, trans, column)) return feature_names + @deprecated("get_feature_names is deprecated in 0.24 and will be removed " + "in 0.26. You can use get_feature_names_out instead") + def get_feature_names(self): + """Get feature names from all transformers. + + Returns + ------- + feature_names : list of strings + Names of the features produced by transform. 
+ """ + def get_names(name, trans, column): + if not hasattr(trans, 'get_feature_names'): + raise AttributeError( + f"Transformer {name} (type {type(trans).__name__}) does " + "not provide get_feature_names.") + return [f"{name}__{f}" for f in trans.get_feature_names()] + return self._get_feature_names_out(get_names) + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. @@ -398,31 +406,16 @@ def get_feature_names_out(self, input_features=None): output_feature_names : list of str Transformed feature names. """ - check_is_fitted(self) - feature_names = [] - for name, trans, column, _ in self._iter(fitted=True): - if trans == 'drop' or ( - hasattr(column, '__len__') and not len(column)): - continue - if trans == 'passthrough': - if hasattr(self, '_df_columns'): - if ((not isinstance(column, slice)) - and all(isinstance(col, str) for col in column)): - feature_names.extend(column) - else: - feature_names.extend(self._df_columns[column]) - else: - indices = np.arange(self._n_features) - feature_names.extend(['x%d' % i for i in indices[column]]) - continue + + def get_names(name, trans, column): if not hasattr(trans, 'get_feature_names_out'): - raise AttributeError("Transformer %s (type %s) does not " - "provide get_feature_names_out." 
- % (str(name), type(trans).__name__)) - feature_names.extend( - [name + "__" + f - for f in trans.get_feature_names_out(input_features=column)]) - return feature_names + raise AttributeError( + f"Transformer {name} (type {type(trans).__name__}) does " + "not provide get_feature_names_out.") + return [f"{name}__{f}" + for f in + trans.get_feature_names_out(input_features=column)] + return self._get_feature_names_out(get_names) def _update_fitted_transformers(self, transformers): # transformers are fitted; excludes 'drop' cases diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f9a7ebd9bc2eb..cf341c823298e 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1389,27 +1389,11 @@ def test_make_column_selector_pickle(): 'empty_col', [[], np.array([], dtype=int), lambda x: []], ids=['list', 'array', 'callable'] ) -def test_feature_names_empty_columns(empty_col): - pd = pytest.importorskip('pandas') - - df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) - - ct = ColumnTransformer( - transformers=[ - ("ohe", OneHotEncoder(), ["col1", "col2"]), - ("empty_features", OneHotEncoder(), empty_col), - ], - ) - - ct.fit(df) - assert ct.get_feature_names() == ['ohe__x0_a', 'ohe__x0_b', 'ohe__x1_z'] - - -@pytest.mark.parametrize( - 'empty_col', [[], np.array([], dtype=int), lambda x: []], - ids=['list', 'array', 'callable'] -) -def test_output_names_empty_columns(empty_col): +@pytest.mark.parametrize("get_names, expected_names", [ + ("get_feature_names", ['ohe__x0_a', 'ohe__x0_b', 'ohe__x1_z']), + ("get_feature_names_out", ['ohe__col1_a', 'ohe__col1_b', 'ohe__col2_z']) +]) +def test_feature_names_empty_columns(empty_col, get_names, expected_names): pd = pytest.importorskip('pandas') df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) @@ -1422,8 +1406,7 @@ def test_output_names_empty_columns(empty_col): ) 
ct.fit(df) - assert ct.get_feature_names_out() == ['ohe__col1_a', 'ohe__col1_b', - 'ohe__col2_z'] + assert getattr(ct, get_names)() == expected_names @pytest.mark.parametrize('remainder', ["passthrough", StandardScaler()]) From 8ecb38f1a7f19c74fab503a0a84b12b511754d52 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 30 Sep 2020 17:59:16 -0400 Subject: [PATCH 044/100] CLN Refactor into _get_feature_names_out --- sklearn/compose/_column_transformer.py | 13 ++- .../compose/tests/test_column_transformer.py | 27 ++++++ sklearn/tests/test_pipeline.py | 82 ------------------- 3 files changed, 39 insertions(+), 83 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 3ecffcf619b91..d8283f52dc695 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -8,6 +8,7 @@ # License: BSD import warnings from itertools import chain +from typing import Iterable import numbers import numpy as np @@ -22,6 +23,7 @@ from ..utils import _safe_indexing from ..utils import _get_column_indices from ..utils import _determine_key_type +from ..utils._feature_names import _make_feature_names from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted from ..utils.validation import _deprecate_positional_args @@ -355,7 +357,6 @@ def _get_feature_names_out(self, get_names): get_feature_names. This should be removed and integrated into get_feature_names_out when get_feature_names is deprecated. """ - check_is_fitted(self) feature_names = [] for name, trans, column, _ in self._iter(fitted=True): if trans == 'drop' or ( @@ -385,6 +386,8 @@ def get_feature_names(self): feature_names : list of strings Names of the features produced by transform. 
""" + check_is_fitted(self) + def get_names(name, trans, column): if not hasattr(trans, 'get_feature_names'): raise AttributeError( @@ -406,12 +409,20 @@ def get_feature_names_out(self, input_features=None): output_feature_names : list of str Transformed feature names. """ + check_is_fitted(self) + if hasattr(self, '_df_columns'): + input_names = self._df_columns + else: + input_names = _make_feature_names(self.n_features_in_) def get_names(name, trans, column): if not hasattr(trans, 'get_feature_names_out'): raise AttributeError( f"Transformer {name} (type {type(trans).__name__}) does " "not provide get_feature_names_out.") + if (isinstance(column, Iterable) and + not all(isinstance(col, str) for col in column)): + column = _safe_indexing(input_names, column) return [f"{name}__{f}" for f in trans.get_feature_names_out(input_features=column)] diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index cf341c823298e..f02be6f847de2 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1409,6 +1409,33 @@ def test_feature_names_empty_columns(empty_col, get_names, expected_names): assert getattr(ct, get_names)() == expected_names +@pytest.mark.parametrize("selector", [ + [1], lambda x: [1], ["col2"], lambda x: ["col2"], + [False, True], lambda x: [False, True] +]) +def test_feature_names_out_pandas(selector): + # checks name when selecting only the second column + pd = pytest.importorskip('pandas') + df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) + ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) + ct.fit(df) + + assert ct.get_feature_names_out() == ["ohe__col2_z"] + + +@pytest.mark.parametrize("selector", [ + [1], lambda x: [1], + [False, True], lambda x: [False, True] +]) +def test_feature_names_out_non_pandas(selector): + # checks name when selecting the second column with numpy array + X = [["a", "z"], ["a", 
"z"], ["b", "z"]] + ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) + ct.fit(X) + + assert ct.get_feature_names_out() == ["ohe__x1_z"] + + @pytest.mark.parametrize('remainder', ["passthrough", StandardScaler()]) def test_sk_visual_block_remainder(remainder): # remainder='passthrough' or an estimator will be shown in repr_html diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 0fe70b1c8fb36..0b69152b7628f 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -21,14 +21,12 @@ from sklearn.utils._testing import assert_no_warnings from sklearn.utils.fixes import parse_version -from sklearn.exceptions import NotFittedError from sklearn.base import clone, BaseEstimator, TransformerMixin from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union from sklearn.svm import SVC from sklearn.neighbors import LocalOutlierFactor from sklearn.linear_model import LogisticRegression, Lasso from sklearn.linear_model import LinearRegression -from sklearn.multiclass import OneVsRestClassifier from sklearn.cluster import KMeans from sklearn.feature_selection import SelectKBest, f_classif from sklearn.dummy import DummyRegressor @@ -1141,56 +1139,6 @@ def test_make_pipeline_memory(): shutil.rmtree(cachedir) -def test_feature_names_basic(): - pipe = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='median')), - ('scaler', StandardScaler()), - ('select', SelectKBest(k=2)), - ('clf', LogisticRegression())]) - with pytest.raises(NotFittedError): - pipe.get_feature_names_out() - iris = load_iris() - pipe.fit(iris.data, iris.target) - xs = np.array(['x0', 'x1', 'x2', 'x3']) - assert_array_equal(pipe[:1].get_feature_names_out(), xs) - mask = pipe.named_steps.select.get_support() - assert_array_equal(pipe[:-1].get_feature_names_out(), xs[mask]) - with pytest.raises( - TypeError, - match="Estimator clf does provide get_feature_names_out."): - pipe.get_feature_names_out(iris.feature_names) - 
assert_array_equal(pipe[:1].get_feature_names_out(iris.feature_names), - iris.feature_names) - assert_array_equal(pipe[:-1].get_feature_names_out(iris.feature_names), - np.array(iris.feature_names)[mask]) - pipe = Pipeline(steps=[ - ('scaler', StandardScaler()), - ('pca', PCA(n_components=3)), - ('select', SelectKBest(k=2)), - ('clf', LogisticRegression())]) - pipe.fit(iris.data, iris.target) - assert_array_equal(pipe[:-1].get_feature_names_out(), ['pca0', 'pca1']) - # setting names doesn't change names after PCA - assert_array_equal(pipe[:-2].get_feature_names_out(iris.feature_names), - ['pca0', 'pca1', 'pca2']) - - -def test_input_feature_names_pandas(): - pd = pytest.importorskip("pandas") - pipe = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='median')), - ('scaler', StandardScaler()), - ('select', SelectKBest(k=2)), - ('clf', LogisticRegression())]) - iris = load_iris() - df = pd.DataFrame(iris.data, columns=iris.feature_names) - pipe.fit(df, iris.target) - mask = pipe.named_steps.select.get_support() - # for now assuming we have to pass these explicitly - assert_array_equal(pipe[:-1].get_feature_names_out(iris.feature_names), - np.array(iris.feature_names)[mask]) - - def test_features_names_passthrough(): pipe = Pipeline(steps=[ ('imputer', 'passthrough'), @@ -1217,36 +1165,6 @@ def test_feature_names_count_vectorizer(): ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) -def test_feature_names_nested(): - pipe = Pipeline(steps=[ - ('inner_pipe', Pipeline(steps=[('select', SelectKBest(k=2)), - ('clf', LogisticRegression())]))]) - iris = load_iris() - pipe.fit(iris.data, iris.target) - xs = np.array(['x0', 'x1', 'x2', 'x3']) - mask = pipe.named_steps.inner_pipe.named_steps.select.get_support() - assert_array_equal( - pipe.named_steps.inner_pipe[:1].get_feature_names_out(), xs[mask]) - assert_array_equal( - pipe.named_steps.inner_pipe[:1].get_feature_names_out(iris.feature_names), - np.array(iris.feature_names)[mask]) - - -def 
test_feature_names_meta_pipe(): - ovr = OneVsRestClassifier(Pipeline(steps=[('select', SelectKBest(k=2)), - ('clf', LogisticRegression())])) - pipe = Pipeline(steps=[('ovr', ovr)]) - iris = load_iris() - pipe.fit(iris.data, iris.target) - xs = np.array(['x0', 'x1', 'x2', 'x3']) - # check 0ths estimator in OVR only - inner_pipe = pipe['ovr'].estimators_[0] - mask = inner_pipe['select'].get_support() - assert_array_equal(inner_pipe[:-1].get_feature_names_out(), xs[mask]) - assert_array_equal(inner_pipe[:-1].get_feature_names_out(iris.feature_names), - np.array(iris.feature_names)[mask]) - - def test_pipeline_param_error(): clf = make_pipeline(LogisticRegression()) with pytest.raises(ValueError, match="Pipeline.fit does not accept " From 8b3c8569b88c7ae2d1a3972b584dbea5a9f14ce5 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 30 Sep 2020 18:00:07 -0400 Subject: [PATCH 045/100] STY Lint fixes --- sklearn/tests/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index b902426282984..c5a25bceb0e05 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -273,8 +273,8 @@ def test_strict_mode_parametrize_with_checks(estimator, check): check(estimator) -# TODO: As more modules support get_feature_names_out they should be removed from -# this list to be tested +# TODO: As more modules support get_feature_names_out they should be removed +# from this list to be tested get_feature_names_out_MODULES_TO_IGNORE = [ 'cluster', 'cross_decomposition', From cf1ec1eab66337c27e3796ebe28cb2d65d3fe3ee Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 1 Oct 2020 09:36:37 -0400 Subject: [PATCH 046/100] CLN Remove example since get_names is not implemented everywhere --- .../plot_column_transformer_mixed_types.py | 44 ------------------- 1 file changed, 44 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 0403fb1642c15..539f38a60bbe4 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -147,50 +147,6 @@ clf.fit(X_train, y_train) print("model score: %.3f" % clf.score(X_test, y_test)) - -# %% -# Inspecting the coefficients of the classifier -############################################################################### -# The coefficients of the final classification step of the pipeline gives an -# idea how each feature impacts the likelihood of survival assuming that the -# usual linear model assumptions hold (uncorrelated features, linear -# separability, homoscedastic errors...) which we do not verify in this -# example. -# -# To get error bars we perform cross-validation and compute the mean and -# standard deviation for each coefficient accross CV splits. Because we use a -# standard scaler on the numerical features, the coefficient weights gives us -# an idea on how much the log odds of surviving are impacted by a change in -# this dimension contrasted to the mean. Note that the categorical features -# here are overspecified which makes it slightly harder to interpret because of -# the information redundancy. -# -# We can see that the linear model coefficients are in agreement with the -# historical reports: people in higher classes and therefore in the upper decks -# were the first to reach the lifeboats, and often, priority was given to women -# and children. -# -# Note that conditioned on the "pclass_x" one-hot features, the "fare" -# numerical feature does not seem to be significantly predictive. 
If we drop -# the "pclass" feature, then higher "fare" values would appear significantly -# correlated with a higher likelihood of survival as the "fare" and "pclass" -# features have a strong statistical dependency. - -import matplotlib.pyplot as plt -from sklearn.model_selection import cross_validate -from sklearn.model_selection import StratifiedShuffleSplit - -cv = StratifiedShuffleSplit(n_splits=20, test_size=0.25, random_state=42) -cv_results = cross_validate(clf, X_train, y_train, cv=cv, - return_estimator=True) -cv_coefs = np.concatenate([cv_pipeline[-1].coef_ - for cv_pipeline in cv_results["estimator"]]) -fig, ax = plt.subplots() -ax.barh(clf[:-1].get_feature_names_out(), - cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0)) -plt.tight_layout() -plt.show() - # %% # The resulting score is not exactly the same as the one from the previous # pipeline becase the dtype-based selector treats the ``pclass`` columns as From a63cd14910149e7fe71e1ff5e63e8600039cf68f Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 1 Oct 2020 09:44:22 -0400 Subject: [PATCH 047/100] ENH Adds feature_selection for the example --- doc/modules/compose.rst | 4 ++-- sklearn/feature_selection/_base.py | 20 ++++++++++++++++++++ sklearn/tests/test_common.py | 9 ++++----- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index b87bccf739f5f..5f46ae65ca70d 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -152,13 +152,13 @@ going into each step:: >>> pipe.fit(iris.data, iris.target) Pipeline(steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) >>> pipe[:-1].get_feature_names_out() - array(['x2', 'x3'], dtype='>> pipe[:-1].get_feature_names_out(iris.feature_names) - array(['petal length (cm)', 'petal width (cm)'], dtype=' Date: Fri, 2 Oct 2020 14:35:51 -0400 Subject: [PATCH 048/100] TST Fixes KBins --- sklearn/preprocessing/_discretization.py | 2 +- sklearn/tests/test_common.py | 4 +--- sklearn/utils/estimator_checks.py | 4 ++++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index f6138161342cc..7735a97a24cd1 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -371,4 +371,4 @@ def get_feature_names_out(self, input_features=None): output_feature_names : list of str Feature names for transformer output. 
""" - return self._encoder.get_feature_names(input_features) + return self._encoder.get_feature_names_out(input_features) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 990577cf63053..3c851f2faa8ad 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -292,9 +292,7 @@ def test_strict_mode_parametrize_with_checks(estimator, check): GET_FEATURES_OUT_ESTIMATORS = [ est for est in _tested_estimators('transformer') - if "2darray" in est._get_tags()["X_types"] and - not est._get_tags()["no_validation"] and - est.__module__.split('.')[1] not in GET_FEATURES_OUT_MODULES_TO_IGNORE + if est.__module__.split('.')[1] not in GET_FEATURES_OUT_MODULES_TO_IGNORE ] diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 080fa3524f100..037f23e0883a4 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3129,6 +3129,10 @@ def check_requires_y_none(name, estimator_orig, strict_mode=True): def check_transformer_get_feature_names_out(name, transformer_orig, strict_mode=True): + tags = transformer_orig._get_tags() + if "2darray" not in tags["X_types"] or tags["no_validation"]: + return + X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X = StandardScaler().fit_transform(X) From 6f35c0c80c6d14779ef467858fe3c6dc07c071a1 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 5 Oct 2020 10:37:39 -0400 Subject: [PATCH 049/100] DOC Update glossary --- doc/glossary.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/glossary.rst b/doc/glossary.rst index 80583de4e8a74..1a0e5f409b7d7 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -1243,7 +1243,10 @@ Methods the estimator's :term:`transform` method. It outputs a list of strings and may take a list of strings as input, corresponding to the names of input columns from which output column names can - be generated. 
By default input features are named x0, x1, .... + be generated. If `feature_names_in` is not passed in, then the + `feature_names_in_` attribute will be used. If the + `feature_names_in_` attribute is not defined or `None`, then the + input names are named x0, x1, ..., xn_features_out. ``get_n_splits`` On a :term:`CV splitter` (not an estimator), returns the number of From f7c0062871e1a1a7761345f11cdc498bbd4a41ee Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 30 Jun 2021 12:10:30 -0400 Subject: [PATCH 050/100] STY Runs black --- sklearn/compose/_column_transformer.py | 33 ++++--- .../compose/tests/test_column_transformer.py | 72 +++++++++------- .../feature_extraction/_dict_vectorizer.py | 5 +- .../tests/test_dict_vectorizer.py | 2 +- sklearn/feature_extraction/tests/test_text.py | 19 ++-- sklearn/feature_extraction/text.py | 6 +- sklearn/pipeline.py | 25 +++--- sklearn/preprocessing/_data.py | 5 +- sklearn/preprocessing/_encoders.py | 6 +- sklearn/preprocessing/_polynomial.py | 49 ++++++++++- sklearn/preprocessing/tests/test_encoders.py | 2 +- .../preprocessing/tests/test_polynomial.py | 21 ++++- sklearn/tests/test_pipeline.py | 86 ++++++++++--------- sklearn/utils/_feature_names.py | 2 +- sklearn/utils/estimator_checks.py | 26 +++--- .../utils/tests/test_make_feature_names.py | 18 ++-- 16 files changed, 230 insertions(+), 147 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c05b0299ac69b..cfec4532f1234 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -20,6 +20,7 @@ from ..utils import Bunch from ..utils import _safe_indexing from ..utils import _get_column_indices +from ..utils.deprecation import deprecated from ..utils._feature_names import _make_feature_names from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted @@ -395,8 +396,10 @@ def _get_feature_names_out(self, get_names): 
feature_names.extend(get_names(name, trans, column)) return feature_names - @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_feature_names_out instead") + @deprecated( + "get_feature_names is deprecated in 1.0 and will be removed " + "in 1.2. You can use get_feature_names_out instead" + ) def get_feature_names(self): """Get feature names from all transformers. @@ -408,11 +411,13 @@ def get_feature_names(self): check_is_fitted(self) def get_names(name, trans, column): - if not hasattr(trans, 'get_feature_names'): + if not hasattr(trans, "get_feature_names"): raise AttributeError( f"Transformer {name} (type {type(trans).__name__}) does " - "not provide get_feature_names.") + "not provide get_feature_names." + ) return [f"{name}__{f}" for f in trans.get_feature_names()] + return self._get_feature_names_out(get_names) def get_feature_names_out(self, input_features=None): @@ -429,22 +434,26 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. """ check_is_fitted(self) - if hasattr(self, '_df_columns'): + if hasattr(self, "_df_columns"): input_names = self._df_columns else: input_names = _make_feature_names(self.n_features_in_) def get_names(name, trans, column): - if not hasattr(trans, 'get_feature_names_out'): + if not hasattr(trans, "get_feature_names_out"): raise AttributeError( f"Transformer {name} (type {type(trans).__name__}) does " - "not provide get_feature_names_out.") - if (isinstance(column, Iterable) and - not all(isinstance(col, str) for col in column)): + "not provide get_feature_names_out." 
+ ) + if isinstance(column, Iterable) and not all( + isinstance(col, str) for col in column + ): column = _safe_indexing(input_names, column) - return [f"{name}__{f}" - for f in - trans.get_feature_names_out(input_features=column)] + return [ + f"{name}__{f}" + for f in trans.get_feature_names_out(input_features=column) + ] + return self._get_feature_names_out(get_names) def _update_fitted_transformers(self, transformers): diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index b31297d8f3cf7..9bc1fd69a8c05 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -771,11 +771,10 @@ def test_column_transformer_cloning(): # TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_column_transformer_get_feature_names(get_names): - X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T - ct = ColumnTransformer([('trans', Trans(), [0, 1])]) + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + ct = ColumnTransformer([("trans", Trans(), [0, 1])]) # raise correct error when not fitted with pytest.raises(NotFittedError): getattr(ct, get_names)() @@ -804,9 +803,8 @@ def test_column_transformer_get_feature_names(get_names): ) # TODO: Remove in 1.2 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) -def test_column_transformer_get_feature_names(X, keys, get_names): +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) +def test_column_transformer_get_feature_names_pipeline(X, keys, get_names): ct = ColumnTransformer([("col" + str(i), DictVectorizer(), i) for i in range(2)]) ct.fit(X) assert getattr(ct, get_names)() == [f"col0__{key}" for key in keys[:2]] + [ @@ -852,8 +850,7 @@ def test_column_transformer_get_feature_names(X, keys, get_names): # TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_column_transformer_get_feature_names_dataframe(get_names): # passthough transformer with a dataframe pd = pytest.importorskip("pandas") @@ -864,44 +861,44 @@ def test_column_transformer_get_feature_names_dataframe(get_names): ct = ColumnTransformer([("trans", "passthrough", ["col0", "col1"])]) ct.fit(X_df) - assert getattr(ct, get_names)() == ['col0', 'col1'] + assert getattr(ct, get_names)() == ["col0", "col1"] ct = ColumnTransformer([("trans", "passthrough", [0, 1])]) ct.fit(X_df) - assert getattr(ct, get_names)() == ['col0', 'col1'] + assert getattr(ct, get_names)() == ["col0", "col1"] ct = ColumnTransformer([("col0", DictVectorizer(), 0)], remainder="passthrough") ct.fit(X_df) - assert getattr(ct, get_names)() == ['col0__a', 'col0__b', 'col1'] + assert getattr(ct, get_names)() == ["col0__a", "col0__b", "col1"] ct = ColumnTransformer( [("trans", "passthrough", ["col1"])], remainder="passthrough" ) ct.fit(X_df) - assert getattr(ct, get_names)() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ["col1", "col0"] ct = ColumnTransformer( [("trans", 
"passthrough", lambda x: x[["col1"]].columns)], remainder="passthrough", ) ct.fit(X_df) - assert getattr(ct, get_names)() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ["col1", "col0"] ct = ColumnTransformer( [("trans", "passthrough", np.array([False, True]))], remainder="passthrough" ) ct.fit(X_df) - assert getattr(ct, get_names)() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ["col1", "col0"] ct = ColumnTransformer( [("trans", "passthrough", slice(1, 2))], remainder="passthrough" ) ct.fit(X_df) - assert getattr(ct, get_names)() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ["col1", "col0"] ct = ColumnTransformer([("trans", "passthrough", [1])], remainder="passthrough") ct.fit(X_df) - assert getattr(ct, get_names)() == ['col1', 'col0'] + assert getattr(ct, get_names)() == ["col1", "col0"] def test_column_transformer_special_strings(): @@ -1452,12 +1449,15 @@ def test_make_column_selector_pickle(): [[], np.array([], dtype=int), lambda x: []], ids=["list", "array", "callable"], ) -@pytest.mark.parametrize("get_names, expected_names", [ - ("get_feature_names", ['ohe__x0_a', 'ohe__x0_b', 'ohe__x1_z']), - ("get_feature_names_out", ['ohe__col1_a', 'ohe__col1_b', 'ohe__col2_z']) -]) +@pytest.mark.parametrize( + "get_names, expected_names", + [ + ("get_feature_names", ["ohe__x0_a", "ohe__x0_b", "ohe__x1_z"]), + ("get_feature_names_out", ["ohe__col1_a", "ohe__col1_b", "ohe__col2_z"]), + ], +) def test_feature_names_empty_columns(empty_col, get_names, expected_names): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) @@ -1472,13 +1472,20 @@ def test_feature_names_empty_columns(empty_col, get_names, expected_names): assert getattr(ct, get_names)() == expected_names -@pytest.mark.parametrize("selector", [ - [1], lambda x: [1], ["col2"], lambda x: ["col2"], - [False, True], lambda x: [False, True] -]) +@pytest.mark.parametrize( + "selector", + [ + [1], 
+ lambda x: [1], + ["col2"], + lambda x: ["col2"], + [False, True], + lambda x: [False, True], + ], +) def test_feature_names_out_pandas(selector): # checks name when selecting only the second column - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) ct.fit(df) @@ -1486,10 +1493,9 @@ def test_feature_names_out_pandas(selector): assert ct.get_feature_names_out() == ["ohe__col2_z"] -@pytest.mark.parametrize("selector", [ - [1], lambda x: [1], - [False, True], lambda x: [False, True] -]) +@pytest.mark.parametrize( + "selector", [[1], lambda x: [1], [False, True], lambda x: [False, True]] +) def test_feature_names_out_non_pandas(selector): # checks name when selecting the second column with numpy array X = [["a", "z"], ["a", "z"], ["b", "z"]] @@ -1563,10 +1569,10 @@ def test_sk_visual_block_remainder_fitted_numpy(remainder): # TODO: Remove in 1.2 when get_feature_names is removed def test_column_transformers_get_feature_names_deprecated(): X = np.array([[0, 1], [2, 4]]) - ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) + ct = ColumnTransformer([("trans", "passthrough", [0, 1])]) ct.fit(X) - msg = "get_feature_names is deprecated in 0.24" + msg = "get_feature_names is deprecated in 1.0" with pytest.warns(FutureWarning, match=msg): ct.get_feature_names() diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index a328dc77b3b47..94e5eb5095721 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -12,6 +12,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, tosequence +from ..utils.deprecation import deprecated def _tosequence(X): @@ -371,8 +372,8 @@ def transform(self, X): return self._transform(X, fitting=False) @deprecated( - "get_feature_names is 
deprecated in 0.24 and will be removed " - "in 0.26. You can use get_feature_names_out instead" + "get_feature_names is deprecated in 1.0 and will be removed " + "in 1.2. You can use get_feature_names_out instead" ) def get_feature_names(self): """Returns a list of feature names, ordered by their indices. diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 5dc328d51010b..be74777aaa9dc 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -185,7 +185,7 @@ def test_feature_union_get_feature_names_deprecated(): D_in = [{"version": "1", "ham": 2}, {"version": "2", "spam": 0.3}] v = DictVectorizer().fit(D_in) - msg = "get_feature_names is deprecated in 0.24" + msg = "get_feature_names is deprecated in 1.0" with pytest.warns(FutureWarning, match=msg): v.get_feature_names() diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index b87e2fb2324f8..3225f6994e61b 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -647,8 +647,7 @@ def test_hashing_vectorizer(): # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_feature_names(get_names): cv = CountVectorizer(max_df=0.5) @@ -739,8 +738,7 @@ def test_vectorizer_max_features(Vectorizer): # TODO: Remove in 0.26 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_count_vectorizer_max_features(get_names): # Regression test: max_features didn't work correctly in 0.14. @@ -815,14 +813,13 @@ def test_vectorizer_min_df(): # TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_count_binary_occurrences(get_names): # by default multiple occurrences are counted as longs test_data = ["aaabc", "abbde"] vect = CountVectorizer(analyzer="char", max_df=1.0) X = vect.fit_transform(test_data).toarray() - assert_array_equal(['a', 'b', 'c', 'd', 'e'], getattr(vect, get_names)()) + assert_array_equal(["a", "b", "c", "d", "e"], getattr(vect, get_names)()) assert_array_equal([[3, 1, 1, 0, 0], [1, 2, 0, 1, 1]], X) # using boolean features, we can fetch the binary occurrence info @@ -1072,8 +1069,7 @@ def test_pickling_built_processors(factory): # TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_countvectorizer_vocab_sets_when_pickling(get_names): # ensure that vocabulary of type set is coerced to a list to # preserve iteration ordering after deserialization @@ -1102,8 +1098,7 @@ def test_countvectorizer_vocab_sets_when_pickling(get_names): # TODO: Remove in 0.26 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_countvectorizer_vocab_dicts_when_pickling(get_names): rng = np.random.RandomState(0) vocab_words = np.array( @@ -1606,7 +1601,7 @@ def test_tie_breaking_sample_order_invariance(): # TODO: Remove in 1.2 when get_feature_names is removed def test_get_feature_names_deprecated(): cv = CountVectorizer(max_df=0.5).fit(ALL_FOOD_DOCS) - msg = "get_feature_names is deprecated in 0.24" + msg = "get_feature_names is deprecated in 1.0" with pytest.warns(FutureWarning, match=msg): cv.get_feature_names() diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 07cabd7559729..7865d263184da 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1374,8 +1374,10 @@ def inverse_transform(self, X): for i in range(n_samples) ] - @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_feature_names_out instead") + @deprecated( + "get_feature_names is deprecated in 1.0 and will be removed " + "in 1.2. You can use get_feature_names_out instead" + ) def get_feature_names(self): """Array mapping from feature integer indices to feature name. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index fdfdf8d267986..c71dfa533e109 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -682,9 +682,11 @@ def get_feature_names_out(self, input_features=None): raise TypeError( "Estimator {} does provide get_feature_names_out. 
" "Did you mean to call Pipeline[:-1].get_feature_names_out" - "()?".format(name)) + "()?".format(name) + ) feature_names = transform.get_feature_names_out( - input_features=feature_names) + input_features=feature_names + ) return feature_names @property @@ -973,8 +975,10 @@ def _iter(self): if trans != "drop" ) - @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_feature_names_out instead") + @deprecated( + "get_feature_names is deprecated in 1.0 and will be removed " + "in 1.2. You can use get_feature_names_out instead" + ) def get_feature_names(self): """Get feature names from all transformers. @@ -1008,13 +1012,14 @@ def get_feature_names_out(self, input_features=None): """ feature_names = [] for name, trans, _ in self._iter(): - if not hasattr(trans, 'get_feature_names_out'): - raise AttributeError("Transformer %s (type %s) does not " - "provide get_feature_names_out." - % (str(name), type(trans).__name__)) + if not hasattr(trans, "get_feature_names_out"): + raise AttributeError( + "Transformer %s (type %s) does not provide get_feature_names_out." + % (str(name), type(trans).__name__) + ) feature_names.extend( - [name + "__" + f for f in - trans.get_feature_names_out(input_features)]) + [name + "__" + f for f in trans.get_feature_names_out(input_features)] + ) return feature_names def fit(self, X, y=None, **fit_params): diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index c1877dff97e46..bc2d3f36bcd98 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2241,8 +2241,9 @@ def get_feature_names_out(self, input_features=None): output_feature_names : list of str Feature names for transformer output. 
""" - return _make_feature_names(n_features=self.K_fit_rows_.shape[0], - prefix=type(self).__name__.lower()) + return _make_feature_names( + n_features=self.K_fit_rows_.shape[0], prefix=type(self).__name__.lower() + ) def _more_tags(self): return {"pairwise": True} diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a499493d1a81c..0c54f59cbe85c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -655,8 +655,10 @@ def inverse_transform(self, X): return X_tr - @deprecated("get_feature_names is deprecated in 0.24 and will be removed " - "in 0.26. You can use get_feature_names_out instead") + @deprecated( + "get_feature_names is deprecated in 1.0 and will be removed " + "in 1.2. You can use get_feature_names_out instead" + ) def get_feature_names(self, input_features=None): """ Return feature names for output features. diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index b7c53c286b493..3ab32338bf510 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -15,6 +15,7 @@ from ..utils import check_array from ..utils.deprecation import deprecated from ..utils.fixes import linspace +from ..utils._feature_names import _make_feature_names from ..utils.validation import check_is_fitted, FLOAT_DTYPES from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -185,6 +186,10 @@ def powers_(self): [np.bincount(c, minlength=self.n_features_in_) for c in combinations] ) + @deprecated( + "get_feature_names is deprecated in 1.0 and will be removed " + "in 1.2. 
You can use get_feature_names_out instead" + ) def get_feature_names(self, input_features=None): """ Return feature names for output features @@ -199,9 +204,26 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ + return self.get_feature_names_out(input_features) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, they are generated as + x0, x1, ..., xn_features. + + Returns + ------- + feature_names : array-like of str + Transformed feature names. + """ powers = self.powers_ - if input_features is None: - input_features = ["x%d" % i for i in range(powers.shape[1])] + input_features = _make_feature_names( + n_features=powers.shape[1], input_features=input_features + ) feature_names = [] for row in powers: inds = np.where(row)[0] @@ -610,8 +632,13 @@ def _get_base_knot_positions(X, n_knots=10, knots="uniform"): return knots + @deprecated( + "get_feature_names is deprecated in 1.0 and will be removed " + "in 1.2. You can use get_feature_names_out instead" + ) def get_feature_names(self, input_features=None): - """Return feature names for output features. + """ + Return feature names for output features Parameters ---------- @@ -623,6 +650,22 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ + return self.get_feature_names_out(input_features) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, they are generated as + x0, x1, ..., xn_features. + + Returns + ------- + feature_names : array-like of str + Transformed feature names. 
+ """ n_splines = self.bsplines_[0].c.shape[0] if input_features is None: input_features = ["x%d" % i for i in range(self.n_features_in_)] diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 78c76b06a0b5c..1b57ff644aa3c 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -961,7 +961,7 @@ def test_one_hot_encoder_get_feature_names_deprecated(): X = np.array([["cat", "dot"]], dtype=object).T enc = OneHotEncoder().fit(X) - msg = "get_feature_names is deprecated in 0.24" + msg = "get_feature_names is deprecated in 1.0" with pytest.warns(FutureWarning, match=msg): enc.get_feature_names() diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index f71bcdce19046..77a70c7d9a595 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -573,7 +573,10 @@ def test_polynomial_features_two_features( assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_) -def test_polynomial_feature_names(): +# TODO: Remove in 0.26 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) +def test_polynomial_feature_names(get_names): X = np.arange(30).reshape(10, 3) poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) feature_names = poly.get_feature_names() @@ -584,7 +587,7 @@ def test_polynomial_feature_names(): assert len(feature_names) == poly.transform(X).shape[1] poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) - feature_names = poly.get_feature_names(["a", "b", "c"]) + feature_names = getattr(poly, get_names)(["a", "b", "c"]) assert_array_equal( [ "a", @@ -612,7 +615,7 @@ def test_polynomial_feature_names(): assert len(feature_names) == poly.transform(X).shape[1] poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X) - feature_names = poly.get_feature_names(["a", "b", "c"]) + feature_names = getattr(poly, get_names)(["a", "b", "c"]) assert_array_equal( [ "a^2", @@ -639,7 +642,7 @@ def test_polynomial_feature_names(): poly = PolynomialFeatures( degree=(3, 3), include_bias=True, interaction_only=True ).fit(X) - feature_names = poly.get_feature_names(["a", "b", "c"]) + feature_names = getattr(poly, get_names)(["a", "b", "c"]) assert_array_equal(["1", "a b c"], feature_names) assert len(feature_names) == poly.transform(X).shape[1] @@ -855,3 +858,13 @@ def test_polynomial_features_deprecated_n_input_features(): with pytest.warns(FutureWarning, match=depr_msg): PolynomialFeatures().fit(X).n_input_features_ + + +# TODO: Remove in 1.2 when get_feature_names is removed +@pytest.mark.parametrize("Transformer", [SplineTransformer, PolynomialFeatures]) +def test_get_feature_names_deprecated(Transformer): + X = np.arange(30).reshape(10, 3) + poly = Transformer().fit(X) + msg = "get_feature_names is deprecated in 1.0" + with pytest.warns(FutureWarning, match=msg): + poly.get_feature_names() diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 
f7a8eb09e2f38..e3f1327a3e766 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -38,7 +38,6 @@ from sklearn.impute import SimpleImputer from sklearn.feature_extraction.text import CountVectorizer from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.impute import SimpleImputer iris = load_iris() @@ -883,8 +882,7 @@ def test_feature_union_parallel(): # TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_feature_union_feature_names(get_names): word_vect = CountVectorizer(analyzer="word") char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3)) @@ -920,69 +918,67 @@ def test_classes_property(): # TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_set_feature_union_steps(get_names): mult2 = Mult(2) mult3 = Mult(3) mult5 = Mult(5) if get_names == "get_feature_names": - mult3.get_feature_names = lambda: ['x3'] - mult2.get_feature_names = lambda: ['x2'] - mult5.get_feature_names = lambda: ['x5'] + mult3.get_feature_names = lambda: ["x3"] + mult2.get_feature_names = lambda: ["x2"] + mult5.get_feature_names = lambda: ["x5"] else: # get_feature_names_out - mult3.get_feature_names_out = lambda input_features: ['x3'] - mult2.get_feature_names_out = lambda input_features: ['x2'] - mult5.get_feature_names_out = lambda input_features: ['x5'] + mult3.get_feature_names_out = lambda input_features: ["x3"] + mult2.get_feature_names_out = lambda input_features: ["x2"] + mult5.get_feature_names_out = lambda input_features: ["x5"] ft = 
FeatureUnion([("m2", mult2), ("m3", mult3)]) assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]]))) - assert ['m2__x2', 'm3__x3'] == getattr(ft, get_names)() + assert ["m2__x2", "m3__x3"] == getattr(ft, get_names)() # Directly setting attr ft.transformer_list = [("m5", mult5)] assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) - assert ['m5__x5'] == getattr(ft, get_names)() + assert ["m5__x5"] == getattr(ft, get_names)() # Using set_params ft.set_params(transformer_list=[("mock", mult3)]) assert_array_equal([[3]], ft.transform(np.asarray([[1]]))) - assert ['mock__x3'] == getattr(ft, get_names)() + assert ["mock__x3"] == getattr(ft, get_names)() # Using set_params to replace single step ft.set_params(mock=mult5) assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) - assert ['mock__x5'] == getattr(ft, get_names)() + assert ["mock__x5"] == getattr(ft, get_names)() # TODO: Remove in 0.26 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") -@pytest.mark.parametrize("get_names", ["get_feature_names", - "get_feature_names_out"]) +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_set_feature_union_step_drop(get_names): mult2 = Mult(2) mult3 = Mult(3) if get_names == "get_feature_names": - mult2.get_feature_names = lambda: ['x2'] - mult3.get_feature_names = lambda: ['x3'] + mult2.get_feature_names = lambda: ["x2"] + mult3.get_feature_names = lambda: ["x3"] else: # get_feature_names_out - mult2.get_feature_names_out = lambda input_features: ['x2'] - mult3.get_feature_names_out = lambda input_features: ['x3'] + mult2.get_feature_names_out = lambda input_features: ["x2"] + mult3.get_feature_names_out = lambda input_features: ["x3"] X = np.asarray([[1]]) ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) assert_array_equal([[2, 3]], ft.fit(X).transform(X)) assert_array_equal([[2, 3]], ft.fit_transform(X)) - assert ['m2__x2', 'm3__x3'] == getattr(ft, get_names)() + assert 
["m2__x2", "m3__x3"] == getattr(ft, get_names)() with pytest.warns(None) as record: ft.set_params(m2="drop") assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) - assert ['m3__x3'] == getattr(ft, get_names)() + assert ["m3__x3"] == getattr(ft, get_names)() assert not record with pytest.warns(None) as record: @@ -1003,7 +999,7 @@ def test_set_feature_union_step_drop(get_names): ft = FeatureUnion([("m2", "drop"), ("m3", mult3)]) assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) - assert ['m3__x3'] == getattr(ft, get_names)() + assert ["m3__x3"] == getattr(ft, get_names)() assert not record @@ -1175,29 +1171,35 @@ def test_make_pipeline_memory(): def test_features_names_passthrough(): - pipe = Pipeline(steps=[ - ('imputer', 'passthrough'), - ('scaler', StandardScaler()), - ('select', 'passthrough'), - ('clf', LogisticRegression())]) + pipe = Pipeline( + steps=[ + ("imputer", "passthrough"), + ("scaler", StandardScaler()), + ("select", "passthrough"), + ("clf", LogisticRegression()), + ] + ) iris = load_iris() pipe.fit(iris.data, iris.target) - xs = ['x0', 'x1', 'x2', 'x3'] + xs = ["x0", "x1", "x2", "x3"] assert_array_equal(pipe[:-1].get_feature_names_out(), xs) - assert_array_equal(pipe[:-1].get_feature_names_out(iris.feature_names), - iris.feature_names) + assert_array_equal( + pipe[:-1].get_feature_names_out(iris.feature_names), iris.feature_names + ) def test_feature_names_count_vectorizer(): - pipe = Pipeline(steps=[ - ('vect', CountVectorizer()), - ('clf', LogisticRegression())]) + pipe = Pipeline(steps=[("vect", CountVectorizer()), ("clf", LogisticRegression())]) y = ["pizza" in x for x in JUNK_FOOD_DOCS] pipe.fit(JUNK_FOOD_DOCS, y) - assert_array_equal(pipe[:-1].get_feature_names_out(), - ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) - assert_array_equal(pipe[:-1].get_feature_names_out("nonsense_is_ignored"), - ['beer', 'burger', 'coke', 'copyright', 'pizza', 
'the']) + assert_array_equal( + pipe[:-1].get_feature_names_out(), + ["beer", "burger", "coke", "copyright", "pizza", "the"], + ) + assert_array_equal( + pipe[:-1].get_feature_names_out("nonsense_is_ignored"), + ["beer", "burger", "coke", "copyright", "pizza", "the"], + ) def test_pipeline_param_error(): @@ -1383,11 +1385,11 @@ def test_feature_union_warns_unknown_transformer_weight(): # TODO: Remove in 1.2 when get_feature_names is removed def test_feature_union_get_feature_names_deprecated(): - msg = "get_feature_names is deprecated in 0.24" + msg = "get_feature_names is deprecated in 1.0" mult2 = Mult(2) - mult2.get_feature_names = lambda: ['x2'] + mult2.get_feature_names = lambda: ["x2"] - ft = FeatureUnion([('m2', mult2)]) + ft = FeatureUnion([("m2", mult2)]) with pytest.warns(FutureWarning, match=msg): ft.get_feature_names() diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py index e8738d458f58f..6cd4caff9648d 100644 --- a/sklearn/utils/_feature_names.py +++ b/sklearn/utils/_feature_names.py @@ -1,4 +1,4 @@ -def _make_feature_names(n_features, prefix='x', input_features=None): +def _make_feature_names(n_features, prefix="x", input_features=None): """Make feature name strings from n_features. 
Either returns input_feature names if it is not None, or creates diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2e26122d36265..16b25d33b7394 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3437,14 +3437,18 @@ def check_estimator_get_tags_default_keys(name, estimator_orig): ) -def check_transformer_get_feature_names_out(name, transformer_orig, - strict_mode=True): +def check_transformer_get_feature_names_out(name, transformer_orig, strict_mode=True): tags = transformer_orig._get_tags() if "2darray" not in tags["X_types"] or tags["no_validation"]: return - X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], - random_state=0, n_features=2, cluster_std=0.1) + X, y = make_blobs( + n_samples=30, + centers=[[0, 0, 0], [1, 1, 1]], + random_state=0, + n_features=2, + cluster_std=0.1, + ) X = StandardScaler().fit_transform(X) X -= X.min() X = _pairwise_estimator_convert_X(X, transformer_orig) @@ -3461,14 +3465,14 @@ def check_transformer_get_feature_names_out(name, transformer_orig, X_pred = transformer.fit_transform(X, y=y_) - input_features = ['feature%d' % i for i in range(n_features)] + input_features = ["feature%d" % i for i in range(n_features)] feature_names = transformer.get_feature_names_out(input_features) assert feature_names is not None if isinstance(X_pred, tuple): - assert len(feature_names) == X_pred[0].shape[1], ( - f"Expected {X_pred[0].shape[1]} feature names, got " - f"{len(feature_names)}") + assert ( + len(feature_names) == X_pred[0].shape[1] + ), f"Expected {X_pred[0].shape[1]} feature names, got {len(feature_names)}" else: - assert len(feature_names) == X_pred.shape[1], ( - f"Expected {X_pred.shape[1]} feature names, got " - f"{len(feature_names)}") + assert ( + len(feature_names) == X_pred.shape[1] + ), f"Expected {X_pred.shape[1]} feature names, got {len(feature_names)}" diff --git a/sklearn/utils/tests/test_make_feature_names.py 
b/sklearn/utils/tests/test_make_feature_names.py index 852299349a57d..744aead028001 100644 --- a/sklearn/utils/tests/test_make_feature_names.py +++ b/sklearn/utils/tests/test_make_feature_names.py @@ -6,13 +6,13 @@ @pytest.mark.parametrize( "n_features, prefix, input_features, expected_names", [ - (3, 'x', None, ['x0', 'x1', 'x2']), - (4, 'x', ['cat', 'dog', 'snake'], ['cat', 'dog', 'snake']), - (4, 'pca', None, ['pca0', 'pca1', 'pca2', 'pca3']) - ]) -def test_make_feature_names(n_features, prefix, input_features, - expected_names): - feature_names = _make_feature_names(n_features=n_features, - prefix=prefix, - input_features=input_features) + (3, "x", None, ["x0", "x1", "x2"]), + (4, "x", ["cat", "dog", "snake"], ["cat", "dog", "snake"]), + (4, "pca", None, ["pca0", "pca1", "pca2", "pca3"]), + ], +) +def test_make_feature_names(n_features, prefix, input_features, expected_names): + feature_names = _make_feature_names( + n_features=n_features, prefix=prefix, input_features=input_features + ) assert_array_equal(expected_names, feature_names) From 9722c086446cb4a2dea735b333241ae649fee600 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 30 Jun 2021 12:15:04 -0400 Subject: [PATCH 051/100] CLN Adjust diff --- sklearn/compose/tests/test_column_transformer.py | 2 +- sklearn/feature_extraction/tests/test_dict_vectorizer.py | 2 +- sklearn/feature_extraction/tests/test_text.py | 6 +++--- sklearn/preprocessing/tests/test_polynomial.py | 2 +- sklearn/tests/test_pipeline.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 9bc1fd69a8c05..6d24e60881437 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1442,7 +1442,7 @@ def test_make_column_selector_pickle(): assert_array_equal(selector(X_df), selector_picked(X_df)) -# TODO: Remove in 0.26 when get_feature_names is removed. 
+# TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize( "empty_col", diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index be74777aaa9dc..5eead2b0a3855 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -39,7 +39,7 @@ def test_dictvectorizer(sparse, dtype, sort, iterable): assert v.feature_names_ == sorted(v.feature_names_) -# TODO: Remove in 0.26 when get_feature_names is removed. +# TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_feature_selection(get_names): diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 3225f6994e61b..3c1ff2c6525cb 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -645,7 +645,7 @@ def test_hashing_vectorizer(): assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0) -# TODO: Remove in 0.26 when get_feature_names is removed. +# TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_feature_names(get_names): @@ -736,7 +736,7 @@ def test_vectorizer_max_features(Vectorizer): assert vectorizer.stop_words_ == expected_stop_words -# TODO: Remove in 0.26 when get_feature_names is removed. +# TODO: Remove in 1.2 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_count_vectorizer_max_features(get_names): @@ -1096,7 +1096,7 @@ def test_countvectorizer_vocab_sets_when_pickling(get_names): assert getattr(cv, get_names)() == getattr(unpickled_cv, get_names)() -# TODO: Remove in 0.26 when get_feature_names is removed. +# TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_countvectorizer_vocab_dicts_when_pickling(get_names): diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 77a70c7d9a595..47d5533442b1f 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -573,7 +573,7 @@ def test_polynomial_features_two_features( assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_) -# TODO: Remove in 0.26 when get_feature_names is removed. +# TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_polynomial_feature_names(get_names): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index e3f1327a3e766..f18fc8d38bd88 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -953,7 +953,7 @@ def test_set_feature_union_steps(get_names): assert ["mock__x5"] == getattr(ft, get_names)() -# TODO: Remove in 0.26 when get_feature_names is removed. +# TODO: Remove in 1.2 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_set_feature_union_step_drop(get_names): From ba3aca2de42a976d428b7a5107836ec743cbb81d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 30 Jun 2021 13:10:14 -0400 Subject: [PATCH 052/100] CLN Stricter capturing --- sklearn/compose/_column_transformer.py | 4 ++-- sklearn/compose/tests/test_column_transformer.py | 12 ++++++------ .../feature_extraction/tests/test_dict_vectorizer.py | 6 +++--- sklearn/feature_extraction/tests/test_text.py | 10 +++++----- sklearn/preprocessing/tests/test_encoders.py | 8 ++++---- sklearn/preprocessing/tests/test_polynomial.py | 2 +- sklearn/tests/test_common.py | 4 ++-- sklearn/tests/test_pipeline.py | 8 ++++---- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index cfec4532f1234..4f554dc68e762 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -434,8 +434,8 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. """ check_is_fitted(self) - if hasattr(self, "_df_columns"): - input_names = self._df_columns + if hasattr(self, "_feature_names_in") and self._feature_names_in is not None: + input_names = self._feature_names_in else: input_names = _make_feature_names(self.n_features_in_) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 6d24e60881437..656cf666042a6 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -770,7 +770,7 @@ def test_column_transformer_cloning(): # TODO: Remove in 1.2 when get_feature_names is removed. 
-@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_column_transformer_get_feature_names(get_names): X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T @@ -780,9 +780,9 @@ def test_column_transformer_get_feature_names(get_names): getattr(ct, get_names)() # raise correct error when no feature names are available ct.fit(X_array) - msg = f"Transformer trans \\(type Trans\\) does not provide {get_names}" + msg = re.escape(f"Transformer trans (type Trans) does not provide {get_names}") with pytest.raises(AttributeError, match=msg): - ct.get_feature_names() + getattr(ct, get_names)() @pytest.mark.parametrize( @@ -802,7 +802,7 @@ def test_column_transformer_get_feature_names(get_names): ], ) # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_column_transformer_get_feature_names_pipeline(X, keys, get_names): ct = ColumnTransformer([("col" + str(i), DictVectorizer(), i) for i in range(2)]) @@ -849,7 +849,7 @@ def test_column_transformer_get_feature_names_pipeline(X, keys, get_names): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_column_transformer_get_feature_names_dataframe(get_names): # passthough transformer with a dataframe @@ -1443,7 +1443,7 @@ def test_make_column_selector_pickle(): # TODO: Remove in 1.2 when get_feature_names is removed. 
-@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize( "empty_col", [[], np.array([], dtype=int), lambda x: []], diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 5eead2b0a3855..2ac9a4406a20f 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -40,7 +40,7 @@ def test_dictvectorizer(sparse, dtype, sort, iterable): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_feature_selection(get_names): # make two feature dicts with two useful features and a bunch of useless @@ -58,7 +58,7 @@ def test_feature_selection(get_names): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_one_of_k(get_names): D_in = [ @@ -79,7 +79,7 @@ def test_one_of_k(get_names): # TODO: Remove in 1.2 when get_feature_names is removed. 
-@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_iterable_value(get_names): D_names = ["ham", "spam", "version=1", "version=2", "version=3"] diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 3c1ff2c6525cb..aa8ea7eae17e8 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -646,7 +646,7 @@ def test_hashing_vectorizer(): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_feature_names(get_names): cv = CountVectorizer(max_df=0.5) @@ -737,7 +737,7 @@ def test_vectorizer_max_features(Vectorizer): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_count_vectorizer_max_features(get_names): # Regression test: max_features didn't work correctly in 0.14. @@ -812,7 +812,7 @@ def test_vectorizer_min_df(): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_count_binary_occurrences(get_names): # by default multiple occurrences are counted as longs @@ -1068,7 +1068,7 @@ def test_pickling_built_processors(factory): # TODO: Remove in 1.2 when get_feature_names is removed. 
-@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_countvectorizer_vocab_sets_when_pickling(get_names): # ensure that vocabulary of type set is coerced to a list to @@ -1097,7 +1097,7 @@ def test_countvectorizer_vocab_sets_when_pickling(get_names): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_countvectorizer_vocab_dicts_when_pickling(get_names): rng = np.random.RandomState(0) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 1b57ff644aa3c..cb99f0e6c53ab 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -142,7 +142,7 @@ def test_one_hot_encoder_dtype_pandas(output_dtype): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_one_hot_encoder_feature_names(get_names): enc = OneHotEncoder() @@ -209,7 +209,7 @@ def test_one_hot_encoder_feature_names(get_names): # TODO: Remove in 1.2 when get_feature_names is removed. 
-@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_one_hot_encoder_feature_names_unicode(get_names): enc = OneHotEncoder() @@ -368,7 +368,7 @@ def test_one_hot_encoder_inverse_if_binary(): # check that resetting drop option without refitting does not throw an error # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) @pytest.mark.parametrize("drop", ["if_binary", "first", None]) @pytest.mark.parametrize("reset_drop", ["if_binary", "first", None]) @@ -578,7 +578,7 @@ def test_one_hot_encoder_pandas(): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) @pytest.mark.parametrize( "drop, expected_names", diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 47d5533442b1f..5204840f1a9c3 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -574,7 +574,7 @@ def test_polynomial_features_two_features( # TODO: Remove in 1.2 when get_feature_names is removed. 
-@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_polynomial_feature_names(get_names): X = np.arange(30).reshape(10, 3) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index a5b9037f524ca..84dcda0a1a4f6 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -88,8 +88,8 @@ def test_get_check_estimator_ids(val, expected): assert _get_check_estimator_ids(val) == expected -def _tested_estimators(): - for name, Estimator in all_estimators(): +def _tested_estimators(type_filter=None): + for name, Estimator in all_estimators(type_filter=type_filter): try: estimator = _construct_instance(Estimator) except SkipTest: diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index f18fc8d38bd88..4a3c235442159 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -881,7 +881,7 @@ def test_feature_union_parallel(): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_feature_union_feature_names(get_names): word_vect = CountVectorizer(analyzer="word") @@ -897,7 +897,7 @@ def test_feature_union_feature_names(get_names): msg = re.escape(f"Transformer tr1 (type Transf) does not provide {get_names}") with pytest.raises(AttributeError, match=msg): - getattr(ft, get_names) + getattr(ft, get_names)() def test_classes_property(): @@ -917,7 +917,7 @@ def test_classes_property(): # TODO: Remove in 1.2 when get_feature_names is removed. 
-@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_set_feature_union_steps(get_names): mult2 = Mult(2) @@ -954,7 +954,7 @@ def test_set_feature_union_steps(get_names): # TODO: Remove in 1.2 when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) def test_set_feature_union_step_drop(get_names): mult2 = Mult(2) From c78967a6e9cdfd442c8905e969101d579f93f99f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 30 Jun 2021 13:50:44 -0400 Subject: [PATCH 053/100] DOC Adds whats new --- doc/whats_new/v1.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 9689cd8789a7a..ef82dd4bff720 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -130,6 +130,9 @@ Changelog - |API| `np.matrix` usage is deprecated in 1.0 and will raise a `TypeError` in 1.2. :pr:`20165` by `Thomas Fan`_. +- |API| :term:`get_feature_names_out` has been added to the transformer API + to get the names of the output features. :pr:`18444` by `Thomas Fan`_. + :mod:`sklearn.base` ................... From f022a1b38deae3cb29ea5bb6b70e2e4fd3a01790 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 30 Jun 2021 14:53:37 -0400 Subject: [PATCH 054/100] TST Fixes errors --- sklearn/compose/tests/test_column_transformer.py | 7 +++++-- sklearn/feature_extraction/tests/test_text.py | 7 +++++-- sklearn/preprocessing/_data.py | 2 +- sklearn/preprocessing/_encoders.py | 5 ++--- sklearn/preprocessing/tests/test_encoders.py | 7 +++++-- sklearn/preprocessing/tests/test_polynomial.py | 9 ++++++--- 6 files changed, 24 insertions(+), 13 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 656cf666042a6..ced17c78f013f 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1649,11 +1649,14 @@ def test_feature_name_validation_missing_columns_drop_passthough(): assert_allclose(df_dropped_trans, df_fit_trans) +# TODO: Remove in 1.2 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) @pytest.mark.parametrize("selector", [[], [False, False]]) -def test_get_feature_names_empty_selection(selector): +def test_get_feature_names_empty_selection(selector, get_names): """Test that get_feature_names is only called for transformers that were selected. Non-regression test for #19550. 
""" ct = ColumnTransformer([("ohe", OneHotEncoder(drop="first"), selector)]) ct.fit([[1, 2], [3, 4]]) - assert ct.get_feature_names() == [] + assert getattr(ct, get_names)() == [] diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index aa8ea7eae17e8..1215a92969142 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -394,7 +394,10 @@ def test_fit_countvectorizer_twice(): assert X1.shape[1] != X2.shape[1] -def test_countvectorizer_custom_token_pattern(): +# TODO: Remove in 1.2 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) +def test_countvectorizer_custom_token_pattern(get_names): """Check `get_feature_names()` when a custom token pattern is passed. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/12971 @@ -409,7 +412,7 @@ def test_countvectorizer_custom_token_pattern(): vectorizer = CountVectorizer(token_pattern=token_pattern) vectorizer.fit_transform(corpus) expected = ["document", "one", "sample"] - assert vectorizer.get_feature_names() == expected + assert getattr(vectorizer, get_names)() == expected def test_countvectorizer_custom_token_pattern_with_several_group(): diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index bc2d3f36bcd98..1e28945d02426 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -627,7 +627,7 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): class StandardScaler(OneToOneMixin, TransformerMixin, BaseEstimator): - """Standardize features by removing the mean and scaling to unit variance + """Standardize features by removing the mean and scaling to unit variance. 
The standard score of a sample `x` is calculated as: diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0c54f59cbe85c..de7db5bdabb0d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -657,11 +657,10 @@ def inverse_transform(self, X): @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead" + "in 1.2. You can use get_feature_names_out instead." ) def get_feature_names(self, input_features=None): - """ - Return feature names for output features. + """Return feature names for output features. Parameters ---------- diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index cb99f0e6c53ab..289ad9e012ce1 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -998,12 +998,15 @@ def test_encoders_string_categories(input_dtype, category_dtype, array_type): assert_array_equal(X_trans, expected) +# TODO: Remove in 1.2 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) @pytest.mark.parametrize("missing_value", [np.nan, None]) -def test_ohe_missing_values_get_feature_names(missing_value): +def test_ohe_missing_values_get_feature_names(get_names, missing_value): # encoder with missing values with object dtypes X = np.array([["a", "b", missing_value, "a", missing_value]], dtype=object).T ohe = OneHotEncoder(sparse=False, handle_unknown="ignore").fit(X) - names = ohe.get_feature_names() + names = getattr(ohe, get_names)() assert_array_equal(names, ["x0_a", "x0_b", f"x0_{missing_value}"]) diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 5204840f1a9c3..bad8de7b333b0 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -113,11 +113,14 @@ def test_spline_transformer_integer_knots(extrapolation): ).fit_transform(X) -def test_spline_transformer_feature_names(): +# TODO: Remove in 1.2 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) +def test_spline_transformer_feature_names(get_names): """Test that SplineTransformer generates correct features name.""" X = np.arange(20).reshape(10, 2) splt = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X) - feature_names = splt.get_feature_names() + feature_names = getattr(splt, get_names)() assert_array_equal( feature_names, [ @@ -135,7 +138,7 @@ def test_spline_transformer_feature_names(): ) splt = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X) - feature_names = splt.get_feature_names(["a", "b"]) + feature_names = getattr(splt, get_names)(["a", "b"]) assert_array_equal( feature_names, [ From f10da1075c127fb8c4c1de2df1ea1cb92a22af06 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 1 Jul 2021 13:02:42 -0400 Subject: [PATCH 055/100] CLN Address comments --- doc/modules/compose.rst | 6 +++--- sklearn/base.py | 10 +++++----- sklearn/compose/_column_transformer.py | 9 ++++----- sklearn/feature_selection/_base.py | 6 +++--- sklearn/preprocessing/_data.py | 18 +++++++++--------- sklearn/preprocessing/_encoders.py | 4 ++-- sklearn/utils/_feature_names.py | 2 +- 7 files changed, 27 insertions(+), 28 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 52f610652b8a3..8a851815fc66f 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -139,9 +139,9 @@ or by name:: >>> pipe['reduce_dim'] PCA() -To enable model inspection, `Pipeline` has a ``get_feature_names_out()`` method, -just like all transformers. You can use pipeline slicing to get the feature names -going into each step:: +To enable model inspection, :class:`~sklearn.pipeline.Pipeline` has a +``get_feature_names_out()`` method, just like all transformers. 
You can use +pipeline slicing to get the feature names going into each step:: >>> from sklearn.datasets import load_iris >>> from sklearn.feature_selection import SelectKBest diff --git a/sklearn/base.py b/sklearn/base.py index 85e262fa76d9d..3bb16e6fd0a8f 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -805,8 +805,8 @@ def fit_predict(self, X, y=None): return self.fit(X).predict(X) -class OneToOneMixin: - """Provides get_feature_names_out for simple transformers +class OneToOneFeatureMixin: + """Provides `get_feature_names_out` for simple transformers. Assumes there's a 1-to-1 correspondence between input features and output features. @@ -815,14 +815,14 @@ class OneToOneMixin: def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. - Returns input_features as this transformation - doesn't add or drop features. + Returns `input_features` as this transformation doesn't add or drop + features. Parameters ---------- input_features : array-like of str or None, default=None Input features. If None, they are generated as - x0, x1, ..., xn_features. + `[x0, x1, ..., xn_features]`. Returns ------- diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 4f554dc68e762..5eeb56158b1ec 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -373,10 +373,9 @@ def named_transformers_(self): return Bunch(**{name: trans for name, trans, _ in self.transformers_}) def _get_feature_names_out(self, get_names): - """Private function to be used by get_feature_names_out and - get_feature_names. This should be removed and integrated into - get_feature_names_out when get_feature_names is deprecated. - """ + """Private function to be used by get_feature_names*.""" + # TODO(1.2): This should be removed and integrated into + # get_feature_names_out when get_feature_names is deprecated. 
feature_names = [] for name, trans, column, _ in self._iter(fitted=True): if trans == "drop" or _is_empty_column_selection(column): @@ -405,7 +404,7 @@ def get_feature_names(self): Returns ------- - feature_names : list of strings + feature_names : list of str Names of the features produced by transform. """ check_is_fitted(self) diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 344106398c361..2edb2df9d9de1 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -145,9 +145,9 @@ def get_feature_names_out(self, input_features=None): Parameters ---------- - input_features : list of str or None, default=None - Input features to select from. If None, they are generated as - x0, x1, ..., xn. + input_features : array-like of str or None, default=None + Input features. If None, they are generated as + `[x0, x1, ..., xn_features]`. Returns ------- diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 1e28945d02426..981e43f45a7ea 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -16,7 +16,7 @@ from scipy import optimize from scipy.special import boxcox -from ..base import BaseEstimator, OneToOneMixin, TransformerMixin +from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin from ..utils import check_array from ..utils.deprecation import deprecated from ..utils.extmath import row_norms @@ -263,7 +263,7 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): return X -class MinMaxScaler(OneToOneMixin, TransformerMixin, BaseEstimator): +class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Transform features by scaling each feature to a given range. 
This estimator scales and translates each feature individually such @@ -626,7 +626,7 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): return X -class StandardScaler(OneToOneMixin, TransformerMixin, BaseEstimator): +class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Standardize features by removing the mean and scaling to unit variance. The standard score of a sample `x` is calculated as: @@ -1035,7 +1035,7 @@ def _more_tags(self): return {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]} -class MaxAbsScaler(OneToOneMixin, TransformerMixin, BaseEstimator): +class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such @@ -1331,7 +1331,7 @@ def maxabs_scale(X, *, axis=0, copy=True): return X -class RobustScaler(OneToOneMixin, TransformerMixin, BaseEstimator): +class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Scale features using statistics that are robust to outliers. This Scaler removes the median and scales the data according to @@ -1818,7 +1818,7 @@ def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False): return X -class Normalizer(OneToOneMixin, TransformerMixin, BaseEstimator): +class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Normalize samples individually to unit norm. Each sample (i.e. each row of the data matrix) with at least one @@ -1983,7 +1983,7 @@ def binarize(X, *, threshold=0.0, copy=True): return X -class Binarizer(OneToOneMixin, TransformerMixin, BaseEstimator): +class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Binarize data (set feature values to 0 or 1) according to a threshold. 
Values greater than the threshold map to 1, while values less than @@ -2316,7 +2316,7 @@ def add_dummy_feature(X, value=1.0): return np.hstack((np.full((n_samples, 1), value), X)) -class QuantileTransformer(OneToOneMixin, TransformerMixin, BaseEstimator): +class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Transform features using quantiles information. This method transforms the features to follow a uniform or a normal @@ -2895,7 +2895,7 @@ def quantile_transform( ) -class PowerTransformer(OneToOneMixin, TransformerMixin, BaseEstimator): +class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Apply a power transform featurewise to make data more Gaussian-like. Power transforms are a family of parametric, monotonic transformations diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index de7db5bdabb0d..efefca1619c9b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -679,8 +679,8 @@ def get_feature_names(self, input_features=None): def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. - Returns input_features as this transformation - doesn't add or drop features. + Returns `input_features` as this transformation doesn't add or drop + features. Parameters ---------- diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py index 6cd4caff9648d..163c59d523f28 100644 --- a/sklearn/utils/_feature_names.py +++ b/sklearn/utils/_feature_names.py @@ -8,7 +8,7 @@ def _make_feature_names(n_features, prefix="x", input_features=None): Parameters ---------- n_features : int - Number of feature names to generate + Number of feature names to generate. prefix : str, default='x' Prefix for each feature name. input_features : array-like of str From d8bafb3a2a6158d307359a396b7b84b764311609 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 1 Jul 2021 13:02:59 -0400 Subject: [PATCH 056/100] TST Increases test coverage --- sklearn/tests/test_pipeline.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 4a3c235442159..b0b347e1e4e47 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1202,6 +1202,18 @@ def test_feature_names_count_vectorizer(): ) +def test_pipeline_feature_names_out_trans_no_get_feature_names_out(): + """Check that error is raised when a tarnsformer does not define + `get_feature_names_out`.""" + pipe = Pipeline(steps=[("notrans", NoTrans())]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + + msg = "does not provide get_feature_names_out" + with pytest.raises(AttributeError, match=msg): + pipe.get_feature_names_out() + + def test_pipeline_param_error(): clf = make_pipeline(LogisticRegression()) with pytest.raises( From 87512967bd4d96d08dfd26055e3de4cf5fcebad5 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 1 Jul 2021 13:07:57 -0400 Subject: [PATCH 057/100] DOC More docstrings --- sklearn/compose/tests/test_column_transformer.py | 5 +++-- sklearn/feature_extraction/tests/test_dict_vectorizer.py | 1 + sklearn/tests/test_pipeline.py | 5 ++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ced17c78f013f..14407fd2196c4 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1484,7 +1484,7 @@ def test_feature_names_empty_columns(empty_col, get_names, expected_names): ], ) def test_feature_names_out_pandas(selector): - # checks name when selecting only the second column + """Checks name when selecting only the second column""" pd = pytest.importorskip("pandas") df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) @@ -1497,7 +1497,7 @@ def test_feature_names_out_pandas(selector): "selector", [[1], lambda x: [1], [False, True], lambda x: [False, True]] ) def test_feature_names_out_non_pandas(selector): - # checks name when selecting the second column with numpy array + """Checks name when selecting the second column with numpy array""" X = [["a", "z"], ["a", "z"], ["b", "z"]] ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) ct.fit(X) @@ -1568,6 +1568,7 @@ def test_sk_visual_block_remainder_fitted_numpy(remainder): # TODO: Remove in 1.2 when get_feature_names is removed def test_column_transformers_get_feature_names_deprecated(): + """Check that get_feature_names is deprecated""" X = np.array([[0, 1], [2, 4]]) ct = ColumnTransformer([("trans", "passthrough", [0, 1])]) ct.fit(X) diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 2ac9a4406a20f..64973aa62d1f5 100644 --- 
a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -182,6 +182,7 @@ def test_n_features_in(): # TODO: Remove in 1.2 when get_feature_names is removed def test_feature_union_get_feature_names_deprecated(): + """Check that get_feature_names is deprecated""" D_in = [{"version": "1", "ham": 2}, {"version": "2", "spam": 0.3}] v = DictVectorizer().fit(D_in) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index b0b347e1e4e47..e607b2b264980 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -35,9 +35,9 @@ from sklearn.decomposition import PCA, TruncatedSVD from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler -from sklearn.impute import SimpleImputer from sklearn.feature_extraction.text import CountVectorizer from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.impute import SimpleImputer iris = load_iris() @@ -1171,6 +1171,7 @@ def test_make_pipeline_memory(): def test_features_names_passthrough(): + """Check pipeline.get_feature_names_out with passthrough""" pipe = Pipeline( steps=[ ("imputer", "passthrough"), @@ -1189,6 +1190,7 @@ def test_features_names_passthrough(): def test_feature_names_count_vectorizer(): + """Check pipeline.get_feature_names_out with vectorizers""" pipe = Pipeline(steps=[("vect", CountVectorizer()), ("clf", LogisticRegression())]) y = ["pizza" in x for x in JUNK_FOOD_DOCS] pipe.fit(JUNK_FOOD_DOCS, y) @@ -1397,6 +1399,7 @@ def test_feature_union_warns_unknown_transformer_weight(): # TODO: Remove in 1.2 when get_feature_names is removed def test_feature_union_get_feature_names_deprecated(): + """Check that get_feature_names is deprecated""" msg = "get_feature_names is deprecated in 1.0" mult2 = Mult(2) mult2.get_feature_names = lambda: ["x2"] From 84dc208d8bc4dd61befc7c7ede2d3842b8e43000 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Fri, 9 Jul 2021 16:31:09 -0400 Subject: [PATCH 058/100] TST Fixes error message --- sklearn/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index c71dfa533e109..27a0c1389bb97 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -680,7 +680,7 @@ def get_feature_names_out(self, input_features=None): for _, name, transform in self._iter(): if not hasattr(transform, "get_feature_names_out"): raise TypeError( - "Estimator {} does provide get_feature_names_out. " + "Estimator {} does not provide get_feature_names_out. " "Did you mean to call Pipeline[:-1].get_feature_names_out" "()?".format(name) ) From 149c4e3120a8023f1092afd763959c77c0980882 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 17 Aug 2021 23:46:22 -0400 Subject: [PATCH 059/100] CLN Improves test --- sklearn/tests/test_common.py | 7 ++++++- sklearn/utils/estimator_checks.py | 10 ++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index c7d244d18fabe..50188946094c5 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -381,4 +381,9 @@ def test_pandas_column_name_consistency(estimator): @pytest.mark.parametrize("transformer", GET_FEATURES_OUT_ESTIMATORS) def test_transformers_get_feature_names_out(transformer): - check_transformer_get_feature_names_out(type(transformer).__name__, transformer) + _set_checking_parameters(transformer) + + with ignore_warnings(category=(FutureWarning)): + check_transformer_get_feature_names_out( + transformer.__class__.__name__, transformer + ) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index da66e93db6105..b6660c3150512 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3774,7 +3774,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): estimator.partial_fit(X_bad, y) -def 
check_transformer_get_feature_names_out(name, transformer_orig, strict_mode=True): +def check_transformer_get_feature_names_out(name, transformer_orig): tags = transformer_orig._get_tags() if "2darray" not in tags["X_types"] or tags["no_validation"]: return @@ -3788,11 +3788,12 @@ def check_transformer_get_feature_names_out(name, transformer_orig, strict_mode= ) X = StandardScaler().fit_transform(X) X -= X.min() - X = _pairwise_estimator_convert_X(X, transformer_orig) - n_samples, n_features = np.asarray(X).shape transformer = clone(transformer_orig) - _set_checking_parameters(transformer) + X = _enforce_estimator_tags_x(transformer, X) + X = _pairwise_estimator_convert_X(X, transformer) + + n_features = X.shape[1] set_random_state(transformer) y_ = y @@ -3805,6 +3806,7 @@ def check_transformer_get_feature_names_out(name, transformer_orig, strict_mode= input_features = ["feature%d" % i for i in range(n_features)] feature_names = transformer.get_feature_names_out(input_features) assert feature_names is not None + if isinstance(X_pred, tuple): assert ( len(feature_names) == X_pred[0].shape[1] From 628a2b3b52a39904de303bc54bb0590e58f91a99 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 23 Aug 2021 22:17:50 -0400 Subject: [PATCH 060/100] TST Fix exception type --- sklearn/pipeline.py | 2 +- sklearn/tests/test_pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 2a757bee3b3a8..04046ad88e083 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -687,7 +687,7 @@ def get_feature_names_out(self, input_features=None): feature_names = input_features for _, name, transform in self._iter(): if not hasattr(transform, "get_feature_names_out"): - raise TypeError( + raise AttributeError( "Estimator {} does not provide get_feature_names_out. 
" "Did you mean to call Pipeline[:-1].get_feature_names_out" "()?".format(name) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 4a9bc7b8f675e..978f96ca112b2 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1205,8 +1205,8 @@ def test_feature_names_count_vectorizer(): ) -def test_pipeline_feature_names_out_trans_no_get_feature_names_out(): - """Check that error is raised when a tarnsformer does not define +def test_pipeline_feature_names_out_error_without_definition(): + """Check that error is raised when a transformer does not define `get_feature_names_out`.""" pipe = Pipeline(steps=[("notrans", NoTrans())]) iris = load_iris() From faae55705a04edb28ae1917526bc59d39ec0ac02 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 28 Aug 2021 11:34:33 +0200 Subject: [PATCH 061/100] Fix remaining occurrence of _feature_names_in --- sklearn/compose/_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 0a1578b629b7e..84dfea8713eaa 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -433,8 +433,8 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. 
""" check_is_fitted(self) - if hasattr(self, "_feature_names_in") and self._feature_names_in is not None: - input_names = self._feature_names_in + if hasattr(self, "feature_names_in_"): + input_names = self.feature_names_in_ else: input_names = _make_feature_names(self.n_features_in_) From 02a25be4f2b45795171822cf7bcd421c4224395f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 28 Aug 2021 12:02:30 +0200 Subject: [PATCH 062/100] cosmit --- doc/modules/compose.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 8a851815fc66f..d3d34bb17c0b5 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -447,7 +447,7 @@ By default, the remaining rating columns are ignored (``remainder='drop'``):: >>> from sklearn.feature_extraction.text import CountVectorizer >>> from sklearn.preprocessing import OneHotEncoder >>> column_trans = ColumnTransformer( - ... [('categories', OneHotEncoder(dtype='int'),['city']), + ... [('categories', OneHotEncoder(dtype='int'), ['city']), ... ('title_bow', CountVectorizer(), 'title')], ... remainder='drop') From 20ecd70eb6d3a09dd9a6335fd91ba89f27269641 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 28 Aug 2021 13:17:34 +0200 Subject: [PATCH 063/100] Attempt to fix numpydoc failure --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index ed7dee232b022..2cf78eebf54e7 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1389,7 +1389,7 @@ def inverse_transform(self, X): @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead" + "in 1.2. You can use get_feature_names_out instead." ) def get_feature_names(self): """Array mapping from feature integer indices to feature name. 
From c6bc0ce7cfeefce635d5582e457fca743d399608 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 28 Aug 2021 09:12:53 -0400 Subject: [PATCH 064/100] DOC Use ndarray of string --- sklearn/base.py | 2 +- sklearn/compose/_column_transformer.py | 2 +- sklearn/feature_extraction/_dict_vectorizer.py | 4 ++-- sklearn/feature_extraction/text.py | 4 ++-- sklearn/feature_selection/_base.py | 4 ++-- sklearn/pipeline.py | 6 +++--- sklearn/preprocessing/_data.py | 1 + sklearn/preprocessing/_discretization.py | 4 ++-- sklearn/preprocessing/_encoders.py | 2 +- sklearn/preprocessing/_polynomial.py | 4 ++-- 10 files changed, 17 insertions(+), 16 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 129d5f0ad84cc..09c5913f6f91f 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -918,7 +918,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names : array-like of str + feature_names_out : ndarray of str Transformed feature names. """ return _make_feature_names(self.n_features_in_, input_features=input_features) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 84dfea8713eaa..6628e6688de23 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -429,7 +429,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - output_feature_names : list of str + feature_names_out : ndarray of str Transformed feature names. """ check_is_fitted(self) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index cb2dc0b657ab4..189428cef7a62 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -393,8 +393,8 @@ def get_feature_names_out(self, input_features=None): Returns ------- - output_feature_names : list of str - Feature names for transformer output. + feature_names_out : ndarray of str + Transformed feature names. 
""" return self.feature_names_ diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2cf78eebf54e7..b6a0c0a479c76 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1411,8 +1411,8 @@ def get_feature_names_out(self, input_features=None): Returns ------- - output_feature_names : list of str - Feature names for transformer output. + feature_names_out : ndarray of str + Transformed feature names. """ self._check_vocabulary() diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 43c983e894483..8953eeabfca93 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -150,8 +150,8 @@ def get_feature_names_out(self, input_features=None): Returns ------- - output_feature_names : list of str - Feature names for transformer output. + feature_names_out : ndarray of str + Transformed feature names. """ mask = self.get_support() input_features = _make_feature_names( diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 020a2e54e27a5..199c6f37e30db 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -687,8 +687,8 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names : array-like of string - Transformed feature names + feature_names_out : ndarray of str + Transformed feature names. """ feature_names = input_features for _, name, transform in self._iter(): @@ -1038,7 +1038,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - output_feature_names : list of str + feature_names_out : ndarray of str Transformed feature names. 
""" feature_names = [] diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 40b43868e797f..897010f6debb3 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2288,6 +2288,7 @@ def get_feature_names_out(self, input_features=None): ------- output_feature_names : list of str Feature names for transformer output. + """ return _make_feature_names( n_features=self.K_fit_rows_.shape[0], prefix=type(self).__name__.lower() diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 3dcc80b5d3fe8..0240ebe233215 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -388,7 +388,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - output_feature_names : list of str - Feature names for transformer output. + feature_names_out : ndarray of str + Transformed feature names. """ return self._encoder.get_feature_names_out(input_features) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 95fb0b5e3a251..2265462543749 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -696,7 +696,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names : list of str + feature_names_out : ndarray of str Transformed feature names. """ check_is_fitted(self) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index ea8e5c2a8a315..2c9626ac513fc 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -225,7 +225,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names : array-like of str + feature_names_out : ndarray of str Transformed feature names. 
""" powers = self.powers_ @@ -689,7 +689,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names : array-like of str + feature_names_out : ndarray of str Transformed feature names. """ n_splines = self.bsplines_[0].c.shape[0] From b76fd410db421aef0ff14a9a262a5505dbd52db2 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 28 Aug 2021 09:13:51 -0400 Subject: [PATCH 065/100] DOC Update doc to use string --- sklearn/preprocessing/_data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 897010f6debb3..d869375b43a5c 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2286,9 +2286,8 @@ def get_feature_names_out(self, input_features=None): Returns ------- - output_feature_names : list of str - Feature names for transformer output. - + feature_names_out : ndarray of str + Transformed feature names. """ return _make_feature_names( n_features=self.K_fit_rows_.shape[0], prefix=type(self).__name__.lower() From d60c4fabf3ded9827dc6793526776503bcbc1196 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 28 Aug 2021 09:15:01 -0400 Subject: [PATCH 066/100] DOC More docstring fixes --- sklearn/preprocessing/_discretization.py | 2 +- sklearn/preprocessing/_encoders.py | 2 +- sklearn/preprocessing/_polynomial.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 0240ebe233215..754e2862201c2 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -384,7 +384,7 @@ def get_feature_names_out(self, input_features=None): ---------- input_features : array-like of str or None, default=None Input features. If None, they are generated as - x0, x1, ..., xn_features. + `[x0, x1, ..., xn_features]`. 
Returns ------- diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 2265462543749..b0265264c577e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -692,7 +692,7 @@ def get_feature_names_out(self, input_features=None): ---------- input_features : array-like of str or None, default=None Input features. If None, they are generated as - x0, x1, ..., xn_features. + `[x0, x1, ..., xn_features]`. Returns ------- diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 2c9626ac513fc..e46339e656145 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -221,7 +221,7 @@ def get_feature_names_out(self, input_features=None): ---------- input_features : array-like of str or None, default=None Input features. If None, they are generated as - x0, x1, ..., xn_features. + `[x0, x1, ..., xn_features]`. Returns ------- @@ -685,7 +685,7 @@ def get_feature_names_out(self, input_features=None): ---------- input_features : array-like of str or None, default=None Input features. If None, they are generated as - x0, x1, ..., xn_features. + `[x0, x1, ..., xn_features]`. Returns ------- From 4a00562712b5a134049c4dc3ddebaf52693e73bc Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sat, 28 Aug 2021 09:36:46 -0400 Subject: [PATCH 067/100] TST Adds failing test --- sklearn/tests/test_common.py | 4 +++- sklearn/utils/estimator_checks.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index e7e6a6d3bef64..f6eee53314120 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -379,7 +379,9 @@ def test_pandas_column_name_consistency(estimator): ] -@pytest.mark.parametrize("transformer", GET_FEATURES_OUT_ESTIMATORS) +@pytest.mark.parametrize( + "transformer", GET_FEATURES_OUT_ESTIMATORS, ids=_get_check_estimator_ids +) def test_transformers_get_feature_names_out(transformer): _set_checking_parameters(transformer) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3015dd3ebe882..30641215e60c3 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3859,14 +3859,16 @@ def check_transformer_get_feature_names_out(name, transformer_orig): X_pred = transformer.fit_transform(X, y=y_) input_features = ["feature%d" % i for i in range(n_features)] - feature_names = transformer.get_feature_names_out(input_features) - assert feature_names is not None + feature_names_out = transformer.get_feature_names_out(input_features) + assert feature_names_out is not None + assert isinstance(feature_names_out, np.ndarray) + assert all(isinstance(name, str) for name in feature_names_out) if isinstance(X_pred, tuple): assert ( - len(feature_names) == X_pred[0].shape[1] - ), f"Expected {X_pred[0].shape[1]} feature names, got {len(feature_names)}" + len(feature_names_out) == X_pred[0].shape[1] + ), f"Expected {X_pred[0].shape[1]} feature names, got {len(feature_names_out)}" else: assert ( - len(feature_names) == X_pred.shape[1] - ), f"Expected {X_pred.shape[1]} feature names, got {len(feature_names)}" + len(feature_names_out) == X_pred.shape[1] + ), f"Expected {X_pred.shape[1]} 
feature names, got {len(feature_names_out)}" From d5b72de11921ad38c1970aa06d4f685a996ff517 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 28 Aug 2021 09:50:11 -0400 Subject: [PATCH 068/100] ENH Restrict to str and ndarrays --- sklearn/feature_selection/_base.py | 2 +- sklearn/preprocessing/_encoders.py | 2 +- sklearn/preprocessing/_polynomial.py | 8 ++++---- sklearn/utils/_feature_names.py | 9 ++++++--- sklearn/utils/tests/test_make_feature_names.py | 3 +++ 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 8953eeabfca93..a60161152a65f 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -157,7 +157,7 @@ def get_feature_names_out(self, input_features=None): input_features = _make_feature_names( mask.shape[0], input_features=input_features ) - return [feat for feat, m in zip(input_features, mask) if m] + return input_features[mask] def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1): diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b0265264c577e..815f8478d8bea 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -717,7 +717,7 @@ def get_feature_names_out(self, input_features=None): if self.drop_idx_ is not None and self.drop_idx_[i] is not None: names.pop(self.drop_idx_[i]) feature_names.extend(names) - return feature_names + return np.asarray(feature_names) class OrdinalEncoder(_BaseEncoder): diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index e46339e656145..fda79852f9b72 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -212,7 +212,7 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ - return self.get_feature_names_out(input_features) + return 
self.get_feature_names_out(input_features).tolist() def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. @@ -245,7 +245,7 @@ def get_feature_names_out(self, input_features=None): else: name = "1" feature_names.append(name) - return feature_names + return np.asarray(feature_names) def fit(self, X, y=None): """ @@ -676,7 +676,7 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ - return self.get_feature_names_out(input_features) + return self.get_feature_names_out(input_features).tolist() def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. @@ -699,7 +699,7 @@ def get_feature_names_out(self, input_features=None): for i in range(self.n_features_in_): for j in range(n_splines - 1 + self.include_bias): feature_names.append(f"{input_features[i]}_sp_{j}") - return feature_names + return np.asarray(feature_names) def fit(self, X, y=None, sample_weight=None): """Compute knot positions of splines. diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py index 163c59d523f28..1f55b46cd69ee 100644 --- a/sklearn/utils/_feature_names.py +++ b/sklearn/utils/_feature_names.py @@ -1,3 +1,6 @@ +import numpy as np + + def _make_feature_names(n_features, prefix="x", input_features=None): """Make feature name strings from n_features. @@ -16,9 +19,9 @@ def _make_feature_names(n_features, prefix="x", input_features=None): Returns ------- - feature_names : array-like of str + feature_names : ndarray of str Generated feature names of length n_features. 
""" if input_features is not None: - return input_features - return [f"{prefix}{i}" for i in range(n_features)] + return np.asarray(input_features) + return np.array([f"{prefix}{i}" for i in range(n_features)]) diff --git a/sklearn/utils/tests/test_make_feature_names.py b/sklearn/utils/tests/test_make_feature_names.py index 744aead028001..99904ddbec138 100644 --- a/sklearn/utils/tests/test_make_feature_names.py +++ b/sklearn/utils/tests/test_make_feature_names.py @@ -1,4 +1,6 @@ import pytest + +import numpy as np from numpy.testing import assert_array_equal from sklearn.utils._feature_names import _make_feature_names @@ -15,4 +17,5 @@ def test_make_feature_names(n_features, prefix, input_features, expected_names): feature_names = _make_feature_names( n_features=n_features, prefix=prefix, input_features=input_features ) + assert isinstance(feature_names, np.ndarray) assert_array_equal(expected_names, feature_names) From a0b7446fdad52d83ba7e24c5fbf9ffb15c583fd7 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 28 Aug 2021 12:12:15 -0400 Subject: [PATCH 069/100] ENH Convert ints to strs in dictvectorizer --- sklearn/feature_extraction/_dict_vectorizer.py | 6 +++++- .../tests/test_dict_vectorizer.py | 16 ++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 189428cef7a62..78ddb6506dc96 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -396,7 +396,11 @@ def get_feature_names_out(self, input_features=None): feature_names_out : ndarray of str Transformed feature names. 
""" - return self.feature_names_ + if any(not isinstance(name, str) for name in self.feature_names_): + feature_names = [str(name) for name in self.feature_names_] + else: + feature_names = self.feature_names_ + return np.array(feature_names) def restrict(self, support, indices=False): """Restrict the features to those in support using feature selection. diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index fa1a570228c21..0e7b9513865ad 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -54,7 +54,7 @@ def test_feature_selection(get_names): sel = SelectKBest(chi2, k=2).fit(X, [0, 1]) v.restrict(sel.get_support(indices=indices), indices=indices) - assert getattr(v, get_names)() == ["useful1", "useful2"] + assert_array_equal(getattr(v, get_names)(), ["useful1", "useful2"]) # TODO: Remove in 1.2 when get_feature_names is removed. @@ -103,7 +103,7 @@ def test_iterable_value(get_names): names = getattr(v, get_names)() - assert names == D_names + assert_array_equal(names, D_names) def test_iterable_not_string_error(): @@ -247,3 +247,15 @@ class A: err_msg = "Unsupported value Type" with pytest.raises(TypeError, match=err_msg): vectorizer.fit_transform(X) + + +def test_dict_vectorizer_get_feature_names_out(): + """Check that integer feature names are converted to strings in + feature_names_out.""" + + X = [{1: 2, 3: 4}, {2: 4}] + dv = DictVectorizer(sparse=False).fit(X) + + feature_names = dv.get_feature_names_out() + assert isinstance(feature_names, np.ndarray) + assert_array_equal(feature_names, ["1", "2", "3"]) From d8f84b393c5ea4f91ba9962af0576079f568cec8 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sat, 28 Aug 2021 12:52:31 -0400 Subject: [PATCH 070/100] ENH Uses feature_names_in_ in get_feature_names_out --- sklearn/base.py | 2 + sklearn/feature_selection/_base.py | 3 + sklearn/pipeline.py | 2 +- sklearn/preprocessing/_discretization.py | 2 + sklearn/preprocessing/_encoders.py | 7 ++- sklearn/preprocessing/_polynomial.py | 13 ++++- sklearn/tests/test_common.py | 4 ++ sklearn/utils/_feature_names.py | 2 +- sklearn/utils/estimator_checks.py | 70 +++++++++++++++++++++--- 9 files changed, 90 insertions(+), 15 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 09c5913f6f91f..6639d8f1e42ea 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -921,6 +921,8 @@ def get_feature_names_out(self, input_features=None): feature_names_out : ndarray of str Transformed feature names. """ + if input_features is None and hasattr(self, "feature_names_in_"): + input_features = self.feature_names_in_ return _make_feature_names(self.n_features_in_, input_features=input_features) diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index a60161152a65f..d01f660293501 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -153,6 +153,9 @@ def get_feature_names_out(self, input_features=None): feature_names_out : ndarray of str Transformed feature names. 
""" + if input_features is None and hasattr(self, "feature_names_in_"): + input_features = self.feature_names_in_ + mask = self.get_support() input_features = _make_feature_names( mask.shape[0], input_features=input_features diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 199c6f37e30db..f95dc39e07317 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -1051,7 +1051,7 @@ def get_feature_names_out(self, input_features=None): feature_names.extend( [name + "__" + f for f in trans.get_feature_names_out(input_features)] ) - return feature_names + return np.asarray(feature_names) def fit(self, X, y=None, **fit_params): """Fit all transformers using X. diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 754e2862201c2..9d716afd6ed35 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -391,4 +391,6 @@ def get_feature_names_out(self, input_features=None): feature_names_out : ndarray of str Transformed feature names. 
""" + if input_features is None and hasattr(self, "feature_names_in_"): + input_features = self.feature_names_in_ return self._encoder.get_feature_names_out(input_features) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 815f8478d8bea..1470cc6e74c01 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -702,7 +702,10 @@ def get_feature_names_out(self, input_features=None): check_is_fitted(self) cats = self.categories_ if input_features is None: - input_features = ["x%d" % i for i in range(len(cats))] + if hasattr(self, "feature_names_in_"): + input_features = self.feature_names_in_ + else: + input_features = ["x%d" % i for i in range(len(cats))] elif len(input_features) != len(self.categories_): raise ValueError( "input_features should have length equal to number of " @@ -717,7 +720,7 @@ def get_feature_names_out(self, input_features=None): if self.drop_idx_ is not None and self.drop_idx_[i] is not None: names.pop(self.drop_idx_[i]) feature_names.extend(names) - return np.asarray(feature_names) + return np.asarray(feature_names, dtype=object) class OrdinalEncoder(_BaseEncoder): diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index fda79852f9b72..4e8b481b1a174 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -229,6 +229,9 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. 
""" powers = self.powers_ + + if input_features is None and hasattr(self, "feature_names_in_"): + input_features = self.feature_names_in_ input_features = _make_feature_names( n_features=powers.shape[1], input_features=input_features ) @@ -245,7 +248,7 @@ def get_feature_names_out(self, input_features=None): else: name = "1" feature_names.append(name) - return np.asarray(feature_names) + return np.asarray(feature_names, dtype=object) def fit(self, X, y=None): """ @@ -694,12 +697,16 @@ def get_feature_names_out(self, input_features=None): """ n_splines = self.bsplines_[0].c.shape[0] if input_features is None: - input_features = ["x%d" % i for i in range(self.n_features_in_)] + if hasattr(self, "feature_names_in_"): + input_features = self.feature_names_in_ + else: + input_features = ["x%d" % i for i in range(self.n_features_in_)] + feature_names = [] for i in range(self.n_features_in_): for j in range(n_splines - 1 + self.include_bias): feature_names.append(f"{input_features[i]}_sp_{j}") - return np.asarray(feature_names) + return np.asarray(feature_names, dtype=object) def fit(self, X, y=None, sample_weight=None): """Compute knot positions of splines. 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index f6eee53314120..8f90ed486a858 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -50,6 +50,7 @@ check_dataframe_column_names_consistency, check_n_features_in_after_fitting, check_transformer_get_feature_names_out, + check_transformer_get_feature_names_out_pandas, ) @@ -389,3 +390,6 @@ def test_transformers_get_feature_names_out(transformer): check_transformer_get_feature_names_out( transformer.__class__.__name__, transformer ) + check_transformer_get_feature_names_out_pandas( + transformer.__class__.__name__, transformer + ) diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py index 1f55b46cd69ee..877630d8ff41f 100644 --- a/sklearn/utils/_feature_names.py +++ b/sklearn/utils/_feature_names.py @@ -24,4 +24,4 @@ def _make_feature_names(n_features, prefix="x", input_features=None): """ if input_features is not None: return np.asarray(input_features) - return np.array([f"{prefix}{i}" for i in range(n_features)]) + return np.asarray([f"{prefix}{i}" for i in range(n_features)]) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 30641215e60c3..338a6bb205e64 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3856,7 +3856,7 @@ def check_transformer_get_feature_names_out(name, transformer_orig): y_ = np.c_[np.asarray(y), np.asarray(y)] y_[::2, 1] *= 2 - X_pred = transformer.fit_transform(X, y=y_) + X_transform = transformer.fit_transform(X, y=y_) input_features = ["feature%d" % i for i in range(n_features)] feature_names_out = transformer.get_feature_names_out(input_features) @@ -3864,11 +3864,65 @@ def check_transformer_get_feature_names_out(name, transformer_orig): assert isinstance(feature_names_out, np.ndarray) assert all(isinstance(name, str) for name in feature_names_out) - if isinstance(X_pred, tuple): - assert ( - len(feature_names_out) == X_pred[0].shape[1] - 
), f"Expected {X_pred[0].shape[1]} feature names, got {len(feature_names_out)}" + if isinstance(X_transform, tuple): + n_features_out = X_transform[0].shape[1] else: - assert ( - len(feature_names_out) == X_pred.shape[1] - ), f"Expected {X_pred.shape[1]} feature names, got {len(feature_names_out)}" + n_features_out = X_transform.shape[1] + + assert ( + len(feature_names_out) == n_features_out + ), f"Expected {n_features_out} feature names, got {len(feature_names_out)}" + + +def check_transformer_get_feature_names_out_pandas(name, transformer_orig): + try: + import pandas as pd + except ImportError: + raise SkipTest( + "pandas is not installed: not checking column name consistency for pandas" + ) + + tags = transformer_orig._get_tags() + if "2darray" not in tags["X_types"] or tags["no_validation"]: + return + + X, y = make_blobs( + n_samples=30, + centers=[[0, 0, 0], [1, 1, 1]], + random_state=0, + n_features=2, + cluster_std=0.1, + ) + X = StandardScaler().fit_transform(X) + X -= X.min() + + transformer = clone(transformer_orig) + X = _enforce_estimator_tags_x(transformer, X) + X = _pairwise_estimator_convert_X(X, transformer) + + n_features = X.shape[1] + set_random_state(transformer) + + y_ = y + if name in CROSS_DECOMPOSITION: + y_ = np.c_[np.asarray(y), np.asarray(y)] + y_[::2, 1] *= 2 + + feature_names_in = ["col%d" % i for i in range(n_features)] + df = pd.DataFrame(X, columns=feature_names_in) + X_transform = transformer.fit_transform(df, y=y_) + + feature_names_out_default = transformer.get_feature_names_out() + feature_names_in_explicit_names = transformer.get_feature_names_out( + feature_names_in + ) + assert_array_equal(feature_names_out_default, feature_names_in_explicit_names) + + if isinstance(X_transform, tuple): + n_features_out = X_transform[0].shape[1] + else: + n_features_out = X_transform.shape[1] + + assert ( + len(feature_names_out_default) == n_features_out + ), f"Expected {n_features_out} feature names, got {len(feature_names_out_default)}" 
From f1090df65d41b7baf2db360f9eb0a0562aca6bac Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 28 Aug 2021 12:53:31 -0400 Subject: [PATCH 071/100] TST Typo --- sklearn/preprocessing/tests/test_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index e4333f768ab53..4f87a09e056dd 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -958,7 +958,7 @@ def test_encoders_has_categorical_tags(Encoder): # TODO: Remove in 1.2 when get_feature_names is removed def test_one_hot_encoder_get_feature_names_deprecated(): - X = np.array([["cat", "dot"]], dtype=object).T + X = np.array([["cat", "dog"]], dtype=object).T enc = OneHotEncoder().fit(X) msg = "get_feature_names is deprecated in 1.0" From ae464664ec378158bc8d1f5619f1c6c8751d7a2d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 28 Aug 2021 13:22:38 -0400 Subject: [PATCH 072/100] TST Include transformers that define get_feature_names_out --- sklearn/tests/test_common.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 8f90ed486a858..b01eb3a726041 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -373,10 +373,18 @@ def test_pandas_column_name_consistency(estimator): "random_projection", ] + +def _include_in_get_feature_names_out_check(transformer): + if hasattr(transformer, "get_feature_names_out"): + return True + module = transformer.__module__.split(".")[1] + return module not in GET_FEATURES_OUT_MODULES_TO_IGNORE + + GET_FEATURES_OUT_ESTIMATORS = [ est for est in _tested_estimators("transformer") - if est.__module__.split(".")[1] not in GET_FEATURES_OUT_MODULES_TO_IGNORE + if _include_in_get_feature_names_out_check(est) ] From 2e2bdd8849baac9a5366e33ebde6d04a6e4c39cf Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 29 Aug 2021 16:41:28 -0400 Subject: [PATCH 073/100] BUG Fixes test for all array outputs --- sklearn/feature_extraction/_dict_vectorizer.py | 4 ++-- sklearn/preprocessing/_encoders.py | 2 +- sklearn/tests/test_pipeline.py | 16 ++++++++-------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 78ddb6506dc96..1148b4333736d 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -428,11 +428,11 @@ def restrict(self, support, indices=False): >>> X = v.fit_transform(D) >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1]) >>> v.get_feature_names_out() - ['bar', 'baz', 'foo'] + array(['bar', 'baz', 'foo'], ...) >>> v.restrict(support.get_support()) DictVectorizer() >>> v.get_feature_names_out() - ['bar', 'foo'] + array(['bar', 'foo'], ...) """ if not indices: support = np.where(support)[0] diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1470cc6e74c01..f4099a26bbf90 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -326,7 +326,7 @@ class OneHotEncoder(_BaseEncoder): array([['Male', 1], [None, 2]], dtype=object) >>> enc.get_feature_names_out(['gender', 'group']) - ['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'] + array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...) 
One can always drop the first column for each feature: diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 978f96ca112b2..214a8259fe9f2 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -936,22 +936,22 @@ def test_set_feature_union_steps(get_names): ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]]))) - assert ["m2__x2", "m3__x3"] == getattr(ft, get_names)() + assert_array_equal(["m2__x2", "m3__x3"], getattr(ft, get_names)()) # Directly setting attr ft.transformer_list = [("m5", mult5)] assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) - assert ["m5__x5"] == getattr(ft, get_names)() + assert_array_equal(["m5__x5"], getattr(ft, get_names)()) # Using set_params ft.set_params(transformer_list=[("mock", mult3)]) assert_array_equal([[3]], ft.transform(np.asarray([[1]]))) - assert ["mock__x3"] == getattr(ft, get_names)() + assert_array_equal(["mock__x3"], getattr(ft, get_names)()) # Using set_params to replace single step ft.set_params(mock=mult5) assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) - assert ["mock__x5"] == getattr(ft, get_names)() + assert_array_equal(["mock__x5"], getattr(ft, get_names)()) # TODO: Remove in 1.2 when get_feature_names is removed. 
@@ -973,20 +973,20 @@ def test_set_feature_union_step_drop(get_names): ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) assert_array_equal([[2, 3]], ft.fit(X).transform(X)) assert_array_equal([[2, 3]], ft.fit_transform(X)) - assert ["m2__x2", "m3__x3"] == getattr(ft, get_names)() + assert_array_equal(["m2__x2", "m3__x3"], getattr(ft, get_names)()) with pytest.warns(None) as record: ft.set_params(m2="drop") assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) - assert ["m3__x3"] == getattr(ft, get_names)() + assert_array_equal(["m3__x3"], getattr(ft, get_names)()) assert not record with pytest.warns(None) as record: ft.set_params(m3="drop") assert_array_equal([[]], ft.fit(X).transform(X)) assert_array_equal([[]], ft.fit_transform(X)) - assert [] == getattr(ft, get_names)() + assert_array_equal([], getattr(ft, get_names)()) assert not record with pytest.warns(None) as record: @@ -1000,7 +1000,7 @@ def test_set_feature_union_step_drop(get_names): ft = FeatureUnion([("m2", "drop"), ("m3", mult3)]) assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) - assert ["m3__x3"] == getattr(ft, get_names)() + assert_array_equal(["m3__x3"], getattr(ft, get_names)()) assert not record From 557585753ba2316d1074904b7fa1fe5c80c67c5e Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 29 Aug 2021 18:02:31 -0400 Subject: [PATCH 074/100] ENH Adds prefix_feature_names_out='when_colliding' --- doc/modules/compose.rst | 12 +- ...linear_model_coefficient_interpretation.py | 1 + sklearn/compose/_column_transformer.py | 131 ++++++++++++++---- .../compose/tests/test_column_transformer.py | 96 +++++++++---- 4 files changed, 175 insertions(+), 65 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index d3d34bb17c0b5..c92ab9c0e4abb 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -152,13 +152,13 @@ pipeline slicing to get the feature names going into each step:: >>> pipe.fit(iris.data, iris.target) Pipeline(steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) >>> pipe[:-1].get_feature_names_out() - ['x2', 'x3'] + array(['x2', 'x3'], ...) You can also provide custom feature names for the input data using ``get_feature_names_out``:: >>> pipe[:-1].get_feature_names_out(iris.feature_names) - ['petal length (cm)', 'petal width (cm)'] + array(['petal length (cm)', 'petal width (cm)'], ...) .. topic:: Examples: @@ -457,11 +457,9 @@ By default, the remaining rating columns are ignored (``remainder='drop'``):: ('title_bow', CountVectorizer(), 'title')]) >>> column_trans.get_feature_names_out() - ['categories__city_London', 'categories__city_Paris', - 'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast', - 'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last', - 'title_bow__learned', 'title_bow__moveable', 'title_bow__of', 'title_bow__the', - 'title_bow__trick', 'title_bow__watson', 'title_bow__wrath'] + array(['city_London', 'city_Paris', 'city_Sallisaw', 'bow', 'feast', + 'grapes', 'his', 'how', 'last', 'learned', 'moveable', 'of', 'the', + 'trick', 'watson', 'wrath'], ...) 
>>> column_trans.transform(X).toarray() array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0], diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 044ffea8a9920..2736e358f7871 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -206,6 +206,7 @@ .get_feature_names_out(input_features=categorical_columns) ) feature_names = np.concatenate([feature_names, numerical_columns]) + coefs = pd.DataFrame( model.named_steps["transformedtargetregressor"].regressor_.coef_, columns=["Coefficients"], diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 6628e6688de23..f4b7b88aca734 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -8,6 +8,7 @@ # License: BSD from itertools import chain from typing import Iterable +from collections import Counter import numpy as np from scipy import sparse @@ -112,6 +113,15 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): If True, the time elapsed while fitting each transformer will be printed as it is completed. + prefix_feature_names_out : {"when_colliding", "always"}, default="when_colliding" + Configures how :meth:`get_feature_names_out` adds prefixes to + feature names out: + + - `"when_colliding"` : Adds the transformer name as a prefix only when + the feature names out are collidating. + - `"always"` : Always add the transformer name as a prefix. 
+ + Attributes ---------- transformers_ : list @@ -197,6 +207,7 @@ def __init__( n_jobs=None, transformer_weights=None, verbose=False, + prefix_feature_names_out="when_colliding", ): self.transformers = transformers self.remainder = remainder @@ -204,6 +215,7 @@ def __init__( self.n_jobs = n_jobs self.transformer_weights = transformer_weights self.verbose = verbose + self.prefix_feature_names_out = prefix_feature_names_out @property def _transformers(self): @@ -372,6 +384,43 @@ def named_transformers_(self): # Use Bunch object to improve autocomplete return Bunch(**{name: trans for name, trans, _ in self.transformers_}) + @deprecated( + "get_feature_names is deprecated in 1.0 and will be removed " + "in 1.2. You can use get_feature_names_out instead" + ) + def get_feature_names(self): + """Get feature names from all transformers. + + Returns + ------- + feature_names : list of strings + Names of the features produced by transform. + """ + check_is_fitted(self) + feature_names = [] + for name, trans, column, _ in self._iter(fitted=True): + if trans == "drop" or _is_empty_column_selection(column): + continue + if trans == "passthrough": + if hasattr(self, "feature_names_in_"): + if (not isinstance(column, slice)) and all( + isinstance(col, str) for col in column + ): + feature_names.extend(column) + else: + feature_names.extend(self.feature_names_in_[column]) + else: + indices = np.arange(self._n_features) + feature_names.extend(["x%d" % i for i in indices[column]]) + continue + if not hasattr(trans, "get_feature_names"): + raise AttributeError( + "Transformer %s (type %s) does not provide get_feature_names." 
+ % (str(name), type(trans).__name__) + ) + feature_names.extend([f"{name}__{f}" for f in trans.get_feature_names()]) + return feature_names + def _get_feature_names_out(self, get_names): """Private function to be used by get_feature_names*.""" # TODO(1.2): This should be removed and integrated into @@ -395,30 +444,6 @@ def _get_feature_names_out(self, get_names): feature_names.extend(get_names(name, trans, column)) return feature_names - @deprecated( - "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead" - ) - def get_feature_names(self): - """Get feature names from all transformers. - - Returns - ------- - feature_names : list of str - Names of the features produced by transform. - """ - check_is_fitted(self) - - def get_names(name, trans, column): - if not hasattr(trans, "get_feature_names"): - raise AttributeError( - f"Transformer {name} (type {type(trans).__name__}) does " - "not provide get_feature_names." - ) - return [f"{name}__{f}" for f in trans.get_feature_names()] - - return self._get_feature_names_out(get_names) - def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. 
@@ -438,7 +463,22 @@ def get_feature_names_out(self, input_features=None): else: input_names = _make_feature_names(self.n_features_in_) - def get_names(name, trans, column): + def _get_feature_name_out_for_transformer(name, trans, column): + if trans == "drop" or _is_empty_column_selection(column): + return + elif trans == "passthrough": + if hasattr(self, "feature_names_in_"): + if (not isinstance(column, slice)) and all( + isinstance(col, str) for col in column + ): + return column + else: + return self.feature_names_in_[column] + else: + indices = np.arange(self.n_features_in_) + return ["x%d" % i for i in indices[column]] + + # An actual transformer if not hasattr(trans, "get_feature_names_out"): raise AttributeError( f"Transformer {name} (type {type(trans).__name__}) does " @@ -448,12 +488,43 @@ def get_names(name, trans, column): isinstance(col, str) for col in column ): column = _safe_indexing(input_names, column) - return [ - f"{name}__{f}" - for f in trans.get_feature_names_out(input_features=column) - ] + return trans.get_feature_names_out(input_features=column) - return self._get_feature_names_out(get_names) + # List of tuples (name, feature_names_out) + transformer_with_feature_names_out = [] + for name, trans, column, _ in self._iter(fitted=True): + feature_names_out = _get_feature_name_out_for_transformer( + name, trans, column + ) + if feature_names_out is None: + continue + transformer_with_feature_names_out.append((name, feature_names_out)) + + # always prefix the feature names out with the transformers name + if self.prefix_feature_names_out == "always": + names = list( + chain.from_iterable( + (f"{name}__{i}" for i in feature_names_out) + for name, feature_names_out in transformer_with_feature_names_out + ) + ) + return np.asarray(names, dtype=object) + + # prefix_feature_names_out == "when_colliding" + feature_names_count = Counter( + chain.from_iterable(s for _, s in transformer_with_feature_names_out) + ) + + output = [] + for 
transformer_name, feature_names in transformer_with_feature_names_out: + for feat_name in feature_names: + if feature_names_count[feat_name] == 1: + # unique + output.append(f"{feat_name}") + else: + # not unique + output.append(f"{transformer_name}__{feat_name}") + return np.asarray(output, dtype=object) def _update_fitted_transformers(self, transformers): # transformers are fitted; excludes 'drop' cases diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index deed2ca0fa13f..d49682c184a39 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -709,6 +709,7 @@ def test_column_transformer_get_set_params(): "trans2__with_std": True, "transformers": ct.transformers, "transformer_weights": None, + "prefix_feature_names_out": "when_colliding", "verbose": False, } @@ -729,6 +730,7 @@ def test_column_transformer_get_set_params(): "trans2__with_std": True, "transformers": ct.transformers, "transformer_weights": None, + "prefix_feature_names_out": "when_colliding", "verbose": False, } @@ -803,55 +805,53 @@ def test_column_transformer_get_feature_names(get_names): ) # TODO: Remove in 1.2 when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") -@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) -def test_column_transformer_get_feature_names_pipeline(X, keys, get_names): +def test_column_transformer_get_feature_names_pipeline(X, keys): ct = ColumnTransformer([("col" + str(i), DictVectorizer(), i) for i in range(2)]) ct.fit(X) - assert getattr(ct, get_names)() == [f"col0__{key}" for key in keys[:2]] + [ + assert ct.get_feature_names() == [f"col0__{key}" for key in keys[:2]] + [ f"col1__{keys[2]}" ] # drop transformer ct = ColumnTransformer([("col0", DictVectorizer(), 0), ("col1", "drop", 1)]) ct.fit(X) - assert getattr(ct, get_names)() == [f"col0__{key}" for key in keys[:2]] + assert ct.get_feature_names() == [f"col0__{key}" for key in keys[:2]] # passthrough transformer ct = ColumnTransformer([("trans", "passthrough", [0, 1])]) ct.fit(X) - assert getattr(ct, get_names)() == ["x0", "x1"] + assert ct.get_feature_names() == ["x0", "x1"] ct = ColumnTransformer([("trans", DictVectorizer(), 0)], remainder="passthrough") ct.fit(X) - assert getattr(ct, get_names)() == [f"trans__{key}" for key in keys[:2]] + ["x1"] + assert ct.get_feature_names() == [f"trans__{key}" for key in keys[:2]] + ["x1"] ct = ColumnTransformer([("trans", "passthrough", [1])], remainder="passthrough") ct.fit(X) - assert getattr(ct, get_names)() == ["x1", "x0"] + assert ct.get_feature_names() == ["x1", "x0"] ct = ColumnTransformer( [("trans", "passthrough", lambda x: [1])], remainder="passthrough" ) ct.fit(X) - assert getattr(ct, get_names)() == ["x1", "x0"] + assert ct.get_feature_names() == ["x1", "x0"] ct = ColumnTransformer( [("trans", "passthrough", np.array([False, True]))], remainder="passthrough" ) ct.fit(X) - assert getattr(ct, get_names)() == ["x1", "x0"] + assert ct.get_feature_names() == ["x1", "x0"] ct = ColumnTransformer( [("trans", "passthrough", slice(1, 2))], remainder="passthrough" ) ct.fit(X) - assert getattr(ct, get_names)() == 
["x1", "x0"] + assert ct.get_feature_names() == ["x1", "x0"] # TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") -@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) -def test_column_transformer_get_feature_names_dataframe(get_names): +def test_column_transformer_get_feature_names_dataframe(): # passthough transformer with a dataframe pd = pytest.importorskip("pandas") X = np.array( @@ -861,44 +861,44 @@ def test_column_transformer_get_feature_names_dataframe(get_names): ct = ColumnTransformer([("trans", "passthrough", ["col0", "col1"])]) ct.fit(X_df) - assert getattr(ct, get_names)() == ["col0", "col1"] + assert ct.get_feature_names() == ["col0", "col1"] ct = ColumnTransformer([("trans", "passthrough", [0, 1])]) ct.fit(X_df) - assert getattr(ct, get_names)() == ["col0", "col1"] + assert ct.get_feature_names() == ["col0", "col1"] ct = ColumnTransformer([("col0", DictVectorizer(), 0)], remainder="passthrough") ct.fit(X_df) - assert getattr(ct, get_names)() == ["col0__a", "col0__b", "col1"] + assert ct.get_feature_names() == ["col0__a", "col0__b", "col1"] ct = ColumnTransformer( [("trans", "passthrough", ["col1"])], remainder="passthrough" ) ct.fit(X_df) - assert getattr(ct, get_names)() == ["col1", "col0"] + assert ct.get_feature_names() == ["col1", "col0"] ct = ColumnTransformer( [("trans", "passthrough", lambda x: x[["col1"]].columns)], remainder="passthrough", ) ct.fit(X_df) - assert getattr(ct, get_names)() == ["col1", "col0"] + assert ct.get_feature_names() == ["col1", "col0"] ct = ColumnTransformer( [("trans", "passthrough", np.array([False, True]))], remainder="passthrough" ) ct.fit(X_df) - assert getattr(ct, get_names)() == ["col1", "col0"] + assert ct.get_feature_names() == ["col1", "col0"] ct = ColumnTransformer( [("trans", "passthrough", slice(1, 2))], remainder="passthrough" ) ct.fit(X_df) - assert getattr(ct, get_names)() == ["col1", "col0"] + assert 
ct.get_feature_names() == ["col1", "col0"] ct = ColumnTransformer([("trans", "passthrough", [1])], remainder="passthrough") ct.fit(X_df) - assert getattr(ct, get_names)() == ["col1", "col0"] + assert ct.get_feature_names() == ["col1", "col0"] def test_column_transformer_special_strings(): @@ -1149,6 +1149,7 @@ def test_column_transformer_get_set_params_with_remainder(): "trans1__with_std": True, "transformers": ct.transformers, "transformer_weights": None, + "prefix_feature_names_out": "when_colliding", "verbose": False, } @@ -1168,9 +1169,9 @@ def test_column_transformer_get_set_params_with_remainder(): "trans1": "passthrough", "transformers": ct.transformers, "transformer_weights": None, + "prefix_feature_names_out": "when_colliding", "verbose": False, } - assert ct.get_params() == exp @@ -1453,7 +1454,7 @@ def test_make_column_selector_pickle(): "get_names, expected_names", [ ("get_feature_names", ["ohe__x0_a", "ohe__x0_b", "ohe__x1_z"]), - ("get_feature_names_out", ["ohe__col1_a", "ohe__col1_b", "ohe__col2_z"]), + ("get_feature_names_out", ["col1_a", "col1_b", "col2_z"]), ], ) def test_feature_names_empty_columns(empty_col, get_names, expected_names): @@ -1469,7 +1470,7 @@ def test_feature_names_empty_columns(empty_col, get_names, expected_names): ) ct.fit(df) - assert getattr(ct, get_names)() == expected_names + assert_array_equal(getattr(ct, get_names)(), expected_names) @pytest.mark.parametrize( @@ -1490,7 +1491,7 @@ def test_feature_names_out_pandas(selector): ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) ct.fit(df) - assert ct.get_feature_names_out() == ["ohe__col2_z"] + assert_array_equal(ct.get_feature_names_out(), ["col2_z"]) @pytest.mark.parametrize( @@ -1502,7 +1503,7 @@ def test_feature_names_out_non_pandas(selector): ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) ct.fit(X) - assert ct.get_feature_names_out() == ["ohe__x1_z"] + assert_array_equal(ct.get_feature_names_out(), ["x1_z"]) @pytest.mark.parametrize("remainder", 
["passthrough", StandardScaler()]) @@ -1652,15 +1653,14 @@ def test_feature_name_validation_missing_columns_drop_passthough(): # TODO: Remove in 1.2 when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") -@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"]) @pytest.mark.parametrize("selector", [[], [False, False]]) -def test_get_feature_names_empty_selection(selector, get_names): +def test_get_feature_names_empty_selection(selector): """Test that get_feature_names is only called for transformers that were selected. Non-regression test for #19550. """ ct = ColumnTransformer([("ohe", OneHotEncoder(drop="first"), selector)]) ct.fit([[1, 2], [3, 4]]) - assert getattr(ct, get_names)() == [] + assert ct.get_feature_names() == [] def test_feature_names_in_(): @@ -1679,3 +1679,43 @@ def test_feature_names_in_(): ct.fit(df) assert_array_equal(ct.feature_names_in_, feature_names) + + +class TransWithNames(Trans): + def get_feature_names_out(self, input_features=None): + return input_features + + +def test_feature_names_out_prefix_always(): + """Check feature_names_out for prefix_feature_names_out='always'""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3]], columns=["a", "c", "d"]) + ct = ColumnTransformer( + [("bycol1", TransWithNames(), ["a", "c"]), ("bycol2", "passthrough", ["a"])], + remainder="passthrough", + prefix_feature_names_out="always", + ) + ct.fit(df) + + expected = ["bycol1__a", "bycol1__c", "bycol2__a", "remainder__d"] + names = ct.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert_array_equal(names, expected) + + +def test_feature_names_out_prefix_when_colliding(): + """Check feature_names_out for prefix_feature_names_out='when_colliding'""" + + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3]], columns=["a", "c", "d"]) + ct = ColumnTransformer( + [("bycol1", TransWithNames(), ["a", "c"]), ("bycol2", "passthrough", ["a"])], + 
remainder="passthrough", + prefix_feature_names_out="when_colliding", + ) + ct.fit(df) + + # "a" is colliding but "c" is not + expected = ["bycol1__a", "c", "bycol2__a", "d"] + names = ct.get_feature_names_out() + assert_array_equal(names, expected) From 3d1546b5e4b7cf71db670f4926aa949b483d18be Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 29 Aug 2021 18:09:53 -0400 Subject: [PATCH 075/100] CLN Cleaner code --- sklearn/compose/_column_transformer.py | 65 ++++++++++++++------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index f4b7b88aca734..3057a91b609d8 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -444,6 +444,36 @@ def _get_feature_names_out(self, get_names): feature_names.extend(get_names(name, trans, column)) return feature_names + def _get_feature_name_out_for_transformer( + self, name, trans, column, feature_names_in + ): + """Gets feature names of transformer. + + Used conduction with self._iter(fitted=True) in get_feature_names_out.""" + + if trans == "drop" or _is_empty_column_selection(column): + return + elif trans == "passthrough": + if (not isinstance(column, slice)) and all( + isinstance(col, str) for col in column + ): + # selection was already strings + return column + else: + return feature_names_in[column] + + # An actual transformer + if not hasattr(trans, "get_feature_names_out"): + raise AttributeError( + f"Transformer {name} (type {type(trans).__name__}) does " + "not provide get_feature_names_out." + ) + if isinstance(column, Iterable) and not all( + isinstance(col, str) for col in column + ): + column = _safe_indexing(feature_names_in, column) + return trans.get_feature_names_out(input_features=column) + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. 
@@ -459,42 +489,15 @@ def get_feature_names_out(self, input_features=None): """ check_is_fitted(self) if hasattr(self, "feature_names_in_"): - input_names = self.feature_names_in_ + feature_names_in = self.feature_names_in_ else: - input_names = _make_feature_names(self.n_features_in_) - - def _get_feature_name_out_for_transformer(name, trans, column): - if trans == "drop" or _is_empty_column_selection(column): - return - elif trans == "passthrough": - if hasattr(self, "feature_names_in_"): - if (not isinstance(column, slice)) and all( - isinstance(col, str) for col in column - ): - return column - else: - return self.feature_names_in_[column] - else: - indices = np.arange(self.n_features_in_) - return ["x%d" % i for i in indices[column]] - - # An actual transformer - if not hasattr(trans, "get_feature_names_out"): - raise AttributeError( - f"Transformer {name} (type {type(trans).__name__}) does " - "not provide get_feature_names_out." - ) - if isinstance(column, Iterable) and not all( - isinstance(col, str) for col in column - ): - column = _safe_indexing(input_names, column) - return trans.get_feature_names_out(input_features=column) + feature_names_in = _make_feature_names(self.n_features_in_) # List of tuples (name, feature_names_out) transformer_with_feature_names_out = [] for name, trans, column, _ in self._iter(fitted=True): - feature_names_out = _get_feature_name_out_for_transformer( - name, trans, column + feature_names_out = self._get_feature_name_out_for_transformer( + name, trans, column, feature_names_in ) if feature_names_out is None: continue From 6e44a5246bc1e9ecaa4714f8df0598e515de13de Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 29 Aug 2021 18:14:48 -0400 Subject: [PATCH 076/100] ENH Validates prefix_feature_names_out --- sklearn/compose/_column_transformer.py | 4 ++++ sklearn/compose/tests/test_column_transformer.py | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 3057a91b609d8..c346c03a9c224 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -678,6 +678,10 @@ def fit_transform(self, X, y=None): self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) + if self.prefix_feature_names_out not in ("when_colliding", "always"): + raise ValueError( + "prefix_feature_names_out must be either 'when_colliding' or 'always'" + ) result = self._fit_transform(X, y, _fit_transform_one) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index d49682c184a39..b1c3d5444a62f 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1681,6 +1681,17 @@ def test_feature_names_in_(): assert_array_equal(ct.feature_names_in_, feature_names) +def test_feature_names_out_prefix_invalid(): + """Check error is raised for invalid prefix_feature_names_out""" + ct = ColumnTransformer( + [("bycol1", TransWithNames(), [0, 1]), ("bycol2", "passthrough", [1])], + prefix_feature_names_out="bad", + ) + msg = "prefix_feature_names_out must be either 'when_colliding' or" + with pytest.raises(ValueError, match=msg): + ct.fit(np.array([[0, 1], [2, 3]])) + + class TransWithNames(Trans): def get_feature_names_out(self, input_features=None): return input_features From b07a3bc52ab96814f4a43512558ab034d83aed06 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 29 Aug 2021 18:41:33 -0400 Subject: [PATCH 077/100] ENH convert to ndarray for vectorizers --- doc/modules/feature_extraction.rst | 38 +++++++++---------- sklearn/feature_extraction/tests/test_text.py | 10 +++-- sklearn/feature_extraction/text.py | 22 ++++++----- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index 9307a5fb17f86..a07d722defb9b 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -54,7 +54,7 @@ is a traditional numerical feature:: [ 0., 0., 1., 18.]]) >>> vec.get_feature_names_out() - ['city=Dubai', 'city=London', 'city=San Francisco', 'temperature'] + array(['city=Dubai', 'city=London', 'city=San Francisco', 'temperature'], ...) :class:`DictVectorizer` accepts multiple string values for one feature, like, e.g., multiple categories for a movie. @@ -69,10 +69,9 @@ and its year of release. array([[0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 2.003e+03], [1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.011e+03], [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.974e+03]]) - >>> vec.get_feature_names_out() == ['category=animation', 'category=drama', - ... 'category=family', 'category=thriller', - ... 'year'] - True + >>> vec.get_feature_names_out() + array(['category=animation', 'category=drama', 'category=family', + 'category=thriller', 'year'], ...) >>> vec.transform({'category': ['thriller'], ... 'unseen_feature': '3'}).toarray() array([[0., 0., 0., 1., 0.]]) @@ -112,7 +111,8 @@ suitable for feeding into a classifier (maybe after being piped into a >>> pos_vectorized.toarray() array([[1., 1., 1., 1., 1., 1.]]) >>> vec.get_feature_names_out() - ['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the'] + array(['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', + 'word-2=the'], ...) 
As you can imagine, if one extracts such a context around each individual word of a corpus of documents the resulting matrix will be very wide @@ -340,10 +340,9 @@ Each term found by the analyzer during the fit is assigned a unique integer index corresponding to a column in the resulting matrix. This interpretation of the columns can be retrieved as follows:: - >>> vectorizer.get_feature_names_out() == ( - ... ['and', 'document', 'first', 'is', 'one', - ... 'second', 'the', 'third', 'this']) - True + >>> vectorizer.get_feature_names_out() + array(['and', 'document', 'first', 'is', 'one', 'second', 'the', + 'third', 'this'], ...) >>> X.toarray() array([[0, 1, 1, 1, 0, 0, 1, 0, 1], @@ -406,8 +405,8 @@ however, similar words are useful for prediction, such as in classifying writing style or personality. There are several known issues in our provided 'english' stop word list. It -does not aim to be a general, 'one-size-fits-all' solution as some tasks -may require a more custom solution. See [NQY18]_ for more details. +does not aim to be a general, 'one-size-fits-all' solution as some tasks +may require a more custom solution. See [NQY18]_ for more details. Please take care in choosing a stop word list. Popular stop word lists may include words that are highly informative to @@ -742,9 +741,8 @@ decide better:: >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2)) >>> counts = ngram_vectorizer.fit_transform(['words', 'wprds']) - >>> ngram_vectorizer.get_feature_names_out() == ( - ... [' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp']) - True + >>> ngram_vectorizer.get_feature_names_out() + array([' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp'], ...) >>> counts.toarray().astype(int) array([[1, 1, 1, 0, 1, 1, 1, 0], [1, 1, 0, 1, 1, 1, 0, 1]]) @@ -758,17 +756,15 @@ span across words:: >>> ngram_vectorizer.fit_transform(['jumpy fox']) <1x4 sparse matrix of type '<... 'numpy.int64'>' with 4 stored elements in Compressed Sparse ... 
format> - >>> ngram_vectorizer.get_feature_names_out() == ( - ... [' fox ', ' jump', 'jumpy', 'umpy ']) - True + >>> ngram_vectorizer.get_feature_names_out() + array([' fox ', ' jump', 'jumpy', 'umpy '], ...) >>> ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5)) >>> ngram_vectorizer.fit_transform(['jumpy fox']) <1x5 sparse matrix of type '<... 'numpy.int64'>' with 5 stored elements in Compressed Sparse ... format> - >>> ngram_vectorizer.get_feature_names_out() == ( - ... ['jumpy', 'mpy f', 'py fo', 'umpy ', 'y fox']) - True + >>> ngram_vectorizer.get_feature_names_out() + array(['jumpy', 'mpy f', 'py fo', 'umpy ', 'y fox'], ...) The word boundaries-aware variant ``char_wb`` is especially interesting for languages that use white-spaces for word separation as it generates diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 1215a92969142..abb408669c0c6 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -412,7 +412,8 @@ def test_countvectorizer_custom_token_pattern(get_names): vectorizer = CountVectorizer(token_pattern=token_pattern) vectorizer.fit_transform(corpus) expected = ["document", "one", "sample"] - assert getattr(vectorizer, get_names)() == expected + feature_names_out = getattr(vectorizer, get_names)() + assert_array_equal(feature_names_out, expected) def test_countvectorizer_custom_token_pattern_with_several_group(): @@ -665,6 +666,9 @@ def test_feature_names(get_names): assert len(cv.vocabulary_) == n_features feature_names = getattr(cv, get_names)() + if get_names == "get_feature_names_out": + assert isinstance(feature_names, np.ndarray) + assert len(feature_names) == n_features assert_array_equal( [ @@ -1096,7 +1100,7 @@ def test_countvectorizer_vocab_sets_when_pickling(get_names): unpickled_cv = pickle.loads(pickle.dumps(cv)) cv.fit(ALL_FOOD_DOCS) unpickled_cv.fit(ALL_FOOD_DOCS) - assert getattr(cv, 
get_names)() == getattr(unpickled_cv, get_names)() + assert_array_equal(getattr(cv, get_names)(), getattr(unpickled_cv, get_names)()) # TODO: Remove in 1.2 when get_feature_names is removed. @@ -1126,7 +1130,7 @@ def test_countvectorizer_vocab_dicts_when_pickling(get_names): unpickled_cv = pickle.loads(pickle.dumps(cv)) cv.fit(ALL_FOOD_DOCS) unpickled_cv.fit(ALL_FOOD_DOCS) - assert getattr(cv, get_names)() == getattr(unpickled_cv, get_names)() + assert_array_equal(getattr(cv, get_names)(), getattr(unpickled_cv, get_names)()) def test_stop_words_removal(): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index b6a0c0a479c76..6483420de283b 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1049,8 +1049,9 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): ... ] >>> vectorizer = CountVectorizer() >>> X = vectorizer.fit_transform(corpus) - >>> print(vectorizer.get_feature_names_out()) - ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] + >>> vectorizer.get_feature_names_out() + array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', + 'this'], ...) >>> print(X.toarray()) [[0 1 1 1 0 0 1 0 1] [0 2 0 1 0 1 1 0 1] @@ -1058,10 +1059,10 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): [0 1 1 1 0 0 1 0 1]] >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2)) >>> X2 = vectorizer2.fit_transform(corpus) - >>> print(vectorizer2.get_feature_names_out()) - ['and this', 'document is', 'first document', 'is the', 'is this', - 'second document', 'the first', 'the second', 'the third', 'third one', - 'this document', 'this is', 'this the'] + >>> vectorizer2.get_feature_names_out() + array(['and this', 'document is', 'first document', 'is the', 'is this', + 'second document', 'the first', 'the second', 'the third', 'third one', + 'this document', 'this is', 'this the'], ...) 
>>> print(X2.toarray()) [[0 0 1 1 0 0 1 0 0 0 0 1 0] [0 1 0 1 0 1 0 1 0 0 1 0 0] @@ -1416,7 +1417,9 @@ def get_feature_names_out(self, input_features=None): """ self._check_vocabulary() - return [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))] + return np.array( + [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))] + ) def _more_tags(self): return {"X_types": ["string"]} @@ -1872,8 +1875,9 @@ class TfidfVectorizer(CountVectorizer): ... ] >>> vectorizer = TfidfVectorizer() >>> X = vectorizer.fit_transform(corpus) - >>> print(vectorizer.get_feature_names_out()) - ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] + >>> vectorizer.get_feature_names_out() + array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', + 'this'], ...) >>> print(X.shape) (4, 9) """ From fffabf0275b50f4594fc8a56817cd66f2a05df8e Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 29 Aug 2021 18:42:57 -0400 Subject: [PATCH 078/100] ENH Less restrictive ndarray dtype --- sklearn/compose/_column_transformer.py | 4 ++-- sklearn/feature_extraction/text.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c346c03a9c224..f714c66d3affc 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -511,7 +511,7 @@ def get_feature_names_out(self, input_features=None): for name, feature_names_out in transformer_with_feature_names_out ) ) - return np.asarray(names, dtype=object) + return np.asarray(names) # prefix_feature_names_out == "when_colliding" feature_names_count = Counter( @@ -527,7 +527,7 @@ def get_feature_names_out(self, input_features=None): else: # not unique output.append(f"{transformer_name}__{feat_name}") - return np.asarray(output, dtype=object) + return np.asarray(output) def _update_fitted_transformers(self, transformers): # transformers are fitted; excludes 'drop' 
cases diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 6483420de283b..77312427a15ae 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1417,7 +1417,7 @@ def get_feature_names_out(self, input_features=None): """ self._check_vocabulary() - return np.array( + return np.asarray( [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))] ) From 5def4cebdd9d9d03f53ef1aadb327be8a96d0e7f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 31 Aug 2021 15:32:11 -0400 Subject: [PATCH 079/100] ENH Adds prefix_feature_names_out as a bool --- doc/modules/compose.rst | 5 +- doc/whats_new/v1.0.rst | 4 + sklearn/compose/_column_transformer.py | 58 ++-- .../compose/tests/test_column_transformer.py | 283 +++++++++++++++--- 4 files changed, 287 insertions(+), 63 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index c92ab9c0e4abb..d4dacb6d723ea 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -449,10 +449,11 @@ By default, the remaining rating columns are ignored (``remainder='drop'``):: >>> column_trans = ColumnTransformer( ... [('categories', OneHotEncoder(dtype='int'), ['city']), ... ('title_bow', CountVectorizer(), 'title')], - ... remainder='drop') + ... remainder='drop', prefix_feature_names_out=False) >>> column_trans.fit(X) - ColumnTransformer(transformers=[('categories', OneHotEncoder(dtype='int'), + ColumnTransformer(prefix_feature_names_out=False, + transformers=[('categories', OneHotEncoder(dtype='int'), ['city']), ('title_bow', CountVectorizer(), 'title')]) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 307cab44bd080..176886b000675 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -220,6 +220,10 @@ Changelog :mod:`sklearn.compose` ...................... +- |API| Adds `prefix_feature_names_out` to :class:`compose.ColumnTransformer`. 
+ This flag controls the prefixing of feature names out in + :term:`get_feature_names_out`. :pr:`18444` by `Thomas Fan`_. + - |Enhancement| :class:`compose.ColumnTransformer` now records the output of each transformer in `output_indices_`. :pr:`18393` by :user:`Luca Bittarello `. diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c6072f09ccee4..3304e68413a5a 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -113,14 +113,13 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): If True, the time elapsed while fitting each transformer will be printed as it is completed. - prefix_feature_names_out : {"when_colliding", "always"}, default="when_colliding" - Configures how :meth:`get_feature_names_out` adds prefixes to - feature names out: - - - `"when_colliding"` : Adds the transformer name as a prefix only when - the feature names out are collidating. - - `"always"` : Always add the transformer name as a prefix. + prefix_feature_names_out : bool, default=True + If True, :meth:`get_feature_names_out` will prefix all feature names + with the name of the transformer that generated that feature. + If False, :meth:`get_feature_names_out` will not prefix any feature + names and will error if feature names collide. + .. 
versionadded:: 1.0 Attributes ---------- @@ -206,7 +205,7 @@ def __init__( n_jobs=None, transformer_weights=None, verbose=False, - prefix_feature_names_out="when_colliding", + prefix_feature_names_out=True, ): self.transformers = transformers self.remainder = remainder @@ -507,31 +506,44 @@ def get_feature_names_out(self, input_features=None): continue transformer_with_feature_names_out.append((name, feature_names_out)) + if not transformer_with_feature_names_out: + # No feature names + return np.array([], dtype=object) + # always prefix the feature names out with the transformers name - if self.prefix_feature_names_out == "always": + if self.prefix_feature_names_out: names = list( chain.from_iterable( (f"{name}__{i}" for i in feature_names_out) for name, feature_names_out in transformer_with_feature_names_out ) ) - return np.asarray(names) + return np.asarray(names, dtype=object) - # prefix_feature_names_out == "when_colliding" + # prefix_feature_names_out==False + # Check that names are all unique without a prefix feature_names_count = Counter( chain.from_iterable(s for _, s in transformer_with_feature_names_out) ) + top_6_overlap = [ + name for name, count in feature_names_count.most_common(6) if count > 1 + ] + top_6_overlap.sort() + if top_6_overlap: + if len(top_6_overlap) == 6: + # There are more than 5 overlapping names, we only show the 5 + # of the feature names + names_repr = str(top_6_overlap[:5])[:-1] + ", ...]" + else: + names_repr = str(top_6_overlap) + raise ValueError( + f"Output feature names: {names_repr} are not unique. 
Please set " + "prefix_feature_names_out=True to add prefixes to feature names" + ) - output = [] - for transformer_name, feature_names in transformer_with_feature_names_out: - for feat_name in feature_names: - if feature_names_count[feat_name] == 1: - # unique - output.append(f"{feat_name}") - else: - # not unique - output.append(f"{transformer_name}__{feat_name}") - return np.asarray(output) + return np.concatenate( + [name for _, name in transformer_with_feature_names_out], dtype=object + ) def _update_fitted_transformers(self, transformers): # transformers are fitted; excludes 'drop' cases @@ -680,10 +692,6 @@ def fit_transform(self, X, y=None): self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) - if self.prefix_feature_names_out not in ("when_colliding", "always"): - raise ValueError( - "prefix_feature_names_out must be either 'when_colliding' or 'always'" - ) result = self._fit_transform(X, y, _fit_transform_one) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index b1c3d5444a62f..8f024cd8e6c6f 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -709,7 +709,7 @@ def test_column_transformer_get_set_params(): "trans2__with_std": True, "transformers": ct.transformers, "transformer_weights": None, - "prefix_feature_names_out": "when_colliding", + "prefix_feature_names_out": True, "verbose": False, } @@ -730,7 +730,7 @@ def test_column_transformer_get_set_params(): "trans2__with_std": True, "transformers": ct.transformers, "transformer_weights": None, - "prefix_feature_names_out": "when_colliding", + "prefix_feature_names_out": True, "verbose": False, } @@ -1149,7 +1149,7 @@ def test_column_transformer_get_set_params_with_remainder(): "trans1__with_std": True, "transformers": ct.transformers, "transformer_weights": None, - "prefix_feature_names_out": "when_colliding", + 
"prefix_feature_names_out": True, "verbose": False, } @@ -1169,7 +1169,7 @@ def test_column_transformer_get_set_params_with_remainder(): "trans1": "passthrough", "transformers": ct.transformers, "transformer_weights": None, - "prefix_feature_names_out": "when_colliding", + "prefix_feature_names_out": True, "verbose": False, } assert ct.get_params() == exp @@ -1454,7 +1454,7 @@ def test_make_column_selector_pickle(): "get_names, expected_names", [ ("get_feature_names", ["ohe__x0_a", "ohe__x0_b", "ohe__x1_z"]), - ("get_feature_names_out", ["col1_a", "col1_b", "col2_z"]), + ("get_feature_names_out", ["ohe__col1_a", "ohe__col1_b", "ohe__col2_z"]), ], ) def test_feature_names_empty_columns(empty_col, get_names, expected_names): @@ -1491,7 +1491,7 @@ def test_feature_names_out_pandas(selector): ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) ct.fit(df) - assert_array_equal(ct.get_feature_names_out(), ["col2_z"]) + assert_array_equal(ct.get_feature_names_out(), ["ohe__col2_z"]) @pytest.mark.parametrize( @@ -1503,7 +1503,7 @@ def test_feature_names_out_non_pandas(selector): ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) ct.fit(X) - assert_array_equal(ct.get_feature_names_out(), ["x1_z"]) + assert_array_equal(ct.get_feature_names_out(), ["ohe__x1_z"]) @pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()]) @@ -1681,52 +1681,263 @@ def test_feature_names_in_(): assert_array_equal(ct.feature_names_in_, feature_names) -def test_feature_names_out_prefix_invalid(): - """Check error is raised for invalid prefix_feature_names_out""" - ct = ColumnTransformer( - [("bycol1", TransWithNames(), [0, 1]), ("bycol2", "passthrough", [1])], - prefix_feature_names_out="bad", - ) - msg = "prefix_feature_names_out must be either 'when_colliding' or" - with pytest.raises(ValueError, match=msg): - ct.fit(np.array([[0, 1], [2, 3]])) - - class TransWithNames(Trans): + def __init__(self, feature_names_out=None): + self.feature_names_out = 
feature_names_out + def get_feature_names_out(self, input_features=None): + if self.feature_names_out is not None: + return self.feature_names_out return input_features -def test_feature_names_out_prefix_always(): - """Check feature_names_out for prefix_feature_names_out='always'""" +@pytest.mark.parametrize( + "transformers, remainder, expected_names", + [ + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "passthrough", + ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + ["bycol1__d", "bycol1__c", "bycol2__d"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", ["d"]), + ], + "passthrough", + ["bycol1__b", "remainder__a", "remainder__c"], + ), + ( + [ + ("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]), + ], + "passthrough", + ["bycol1__pca1", "bycol1__pca2", "remainder__c"], + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), ["d"]), + ("bycol2", "passthrough", ["b"]), + ], + "drop", + ["bycol1__a", "bycol1__b", "bycol2__b"], + ), + ( + [ + ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]), + ("bycol2", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]), + ], + "passthrough", + [ + "bycol1__pca0", + "bycol1__pca1", + "bycol2__pca0", + "bycol2__pca1", + "remainder__a", + "remainder__c", + "remainder__d", + ], + ), + ( + [ + ("bycol1", "drop", ["d"]), + ], + "drop", + [], + ), + ], +) +def test_feature_names_out_prefix_true(transformers, remainder, expected_names): + """Check feature_names_out for prefix_feature_names_out==True (default)""" pd = pytest.importorskip("pandas") - df = pd.DataFrame([[1, 2, 3]], columns=["a", "c", "d"]) + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) ct = ColumnTransformer( - [("bycol1", TransWithNames(), ["a", "c"]), ("bycol2", "passthrough", ["a"])], - remainder="passthrough", - 
prefix_feature_names_out="always", + transformers, + remainder=remainder, ) ct.fit(df) - expected = ["bycol1__a", "bycol1__c", "bycol2__a", "remainder__d"] names = ct.get_feature_names_out() assert isinstance(names, np.ndarray) - assert_array_equal(names, expected) + assert names.dtype == object + assert_array_equal(names, expected_names) -def test_feature_names_out_prefix_when_colliding(): - """Check feature_names_out for prefix_feature_names_out='when_colliding'""" - +@pytest.mark.parametrize( + "transformers, remainder, expected_names", + [ + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["a"]), + ], + "passthrough", + ["d", "c", "a", "b"], + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + ["a", "d"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", ["d"]), + ], + "passthrough", + ["b", "a", "c"], + ), + ( + [ + ("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]), + ], + "passthrough", + ["pca1", "pca2", "c"], + ), + ( + [ + ("bycol1", TransWithNames(["a", "c"]), ["d"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + ["a", "c", "d"], + ), + ( + [ + ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]), + ("bycol2", TransWithNames([f"kpca{i}" for i in range(2)]), ["b"]), + ], + "passthrough", + ["pca0", "pca1", "kpca0", "kpca1", "a", "c", "d"], + ), + ( + [ + ("bycol1", "drop", ["d"]), + ], + "drop", + [], + ), + ], +) +def test_feature_names_out_prefix_false(transformers, remainder, expected_names): + """Check feature_names_out for prefix_feature_names_out==True (default)""" pd = pytest.importorskip("pandas") - df = pd.DataFrame([[1, 2, 3]], columns=["a", "c", "d"]) + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) ct = ColumnTransformer( - [("bycol1", TransWithNames(), ["a", "c"]), ("bycol2", "passthrough", ["a"])], - remainder="passthrough", - prefix_feature_names_out="when_colliding", + 
transformers, + remainder=remainder, + prefix_feature_names_out=False, ) ct.fit(df) - # "a" is colliding but "c" is not - expected = ["bycol1__a", "c", "bycol2__a", "d"] names = ct.get_feature_names_out() - assert_array_equal(names, expected) + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected_names) + + +@pytest.mark.parametrize( + "transformers, remainder, colliding_columns", + [ + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "passthrough", ["b"]), + ], + "drop", + "['b']", + ), + ( + [ + ("bycol1", TransWithNames(["c", "d"]), ["c"]), + ("bycol2", "passthrough", ["c"]), + ], + "drop", + "['c']", + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["b"]), + ("bycol2", "passthrough", ["b"]), + ], + "passthrough", + "['a']", + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["b"]), + ("bycol2", "drop", ["b"]), + ], + "passthrough", + "['a']", + ), + ( + [ + ("bycol1", TransWithNames(["c", "b"]), ["b"]), + ("bycol2", "passthrough", ["c", "b"]), + ], + "drop", + "['b', 'c']", + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["b"]), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["a"]), ["b"]), + ], + "passthrough", + "['a']", + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), ["b"]), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ( + [ + ("bycol1", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]), + ("bycol2", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]), + ], + "passthrough", + "['pca0', 'pca1', 'pca2', 'pca3', 'pca4', ...]", + ), + ], +) +def test_feature_names_out_prefix_false_errors( + transformers, remainder, colliding_columns +): + """Check feature_names_out for prefix_feature_names_out==False""" + + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) + ct = ColumnTransformer( + transformers, + remainder=remainder, + 
prefix_feature_names_out=False, + ) + ct.fit(df) + + msg = re.escape( + f"Output feature names: {colliding_columns} are not unique. Please set " + "prefix_feature_names_out=True to add prefixes to feature names" + ) + with pytest.raises(ValueError, match=msg): + ct.get_feature_names_out() From 1fda1c14071cc3d77df0ad895b4d68bd4fcce6d2 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 31 Aug 2021 15:36:42 -0400 Subject: [PATCH 080/100] DOC Remove use of deprecated api --- ...ot_document_classification_20newsgroups.py | 215 ++++++++++-------- 1 file changed, 118 insertions(+), 97 deletions(-) diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py index 0b39ee49146c1..a7c88cd96d296 100644 --- a/examples/text/plot_document_classification_20newsgroups.py +++ b/examples/text/plot_document_classification_20newsgroups.py @@ -45,40 +45,60 @@ # Display progress logs on stdout -logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") op = OptionParser() -op.add_option("--report", - action="store_true", dest="print_report", - help="Print a detailed classification report.") -op.add_option("--chi2_select", - action="store", type="int", dest="select_chi2", - help="Select some number of features using a chi-squared test") -op.add_option("--confusion_matrix", - action="store_true", dest="print_cm", - help="Print the confusion matrix.") -op.add_option("--top10", - action="store_true", dest="print_top10", - help="Print ten most discriminative terms per class" - " for every classifier.") -op.add_option("--all_categories", - action="store_true", dest="all_categories", - help="Whether to use all categories or not.") -op.add_option("--use_hashing", - action="store_true", - help="Use a hashing vectorizer.") -op.add_option("--n_features", - action="store", type=int, default=2 ** 16, - 
help="n_features when using the hashing vectorizer.") -op.add_option("--filtered", - action="store_true", - help="Remove newsgroup information that is easily overfit: " - "headers, signatures, and quoting.") +op.add_option( + "--report", + action="store_true", + dest="print_report", + help="Print a detailed classification report.", +) +op.add_option( + "--chi2_select", + action="store", + type="int", + dest="select_chi2", + help="Select some number of features using a chi-squared test", +) +op.add_option( + "--confusion_matrix", + action="store_true", + dest="print_cm", + help="Print the confusion matrix.", +) +op.add_option( + "--top10", + action="store_true", + dest="print_top10", + help="Print ten most discriminative terms per class for every classifier.", +) +op.add_option( + "--all_categories", + action="store_true", + dest="all_categories", + help="Whether to use all categories or not.", +) +op.add_option("--use_hashing", action="store_true", help="Use a hashing vectorizer.") +op.add_option( + "--n_features", + action="store", + type=int, + default=2 ** 16, + help="n_features when using the hashing vectorizer.", +) +op.add_option( + "--filtered", + action="store_true", + help=( + "Remove newsgroup information that is easily overfit: " + "headers, signatures, and quoting." 
+ ), +) def is_interactive(): - return not hasattr(sys.modules['__main__'], '__file__') + return not hasattr(sys.modules["__main__"], "__file__") # work-around for Jupyter notebook and IPython console @@ -103,44 +123,44 @@ def is_interactive(): categories = None else: categories = [ - 'alt.atheism', - 'talk.religion.misc', - 'comp.graphics', - 'sci.space', + "alt.atheism", + "talk.religion.misc", + "comp.graphics", + "sci.space", ] if opts.filtered: - remove = ('headers', 'footers', 'quotes') + remove = ("headers", "footers", "quotes") else: remove = () print("Loading 20 newsgroups dataset for categories:") print(categories if categories else "all") -data_train = fetch_20newsgroups(subset='train', categories=categories, - shuffle=True, random_state=42, - remove=remove) +data_train = fetch_20newsgroups( + subset="train", categories=categories, shuffle=True, random_state=42, remove=remove +) -data_test = fetch_20newsgroups(subset='test', categories=categories, - shuffle=True, random_state=42, - remove=remove) -print('data loaded') +data_test = fetch_20newsgroups( + subset="test", categories=categories, shuffle=True, random_state=42, remove=remove +) +print("data loaded") # order of labels in `target_names` can be different from `categories` target_names = data_train.target_names def size_mb(docs): - return sum(len(s.encode('utf-8')) for s in docs) / 1e6 + return sum(len(s.encode("utf-8")) for s in docs) / 1e6 data_train_size_mb = size_mb(data_train.data) data_test_size_mb = size_mb(data_test.data) -print("%d documents - %0.3fMB (training set)" % ( - len(data_train.data), data_train_size_mb)) -print("%d documents - %0.3fMB (test set)" % ( - len(data_test.data), data_test_size_mb)) +print( + "%d documents - %0.3fMB (training set)" % (len(data_train.data), data_train_size_mb) +) +print("%d documents - %0.3fMB (test set)" % (len(data_test.data), data_test_size_mb)) print("%d categories" % len(target_names)) print() @@ -150,12 +170,12 @@ def size_mb(docs): 
print("Extracting features from the training data using a sparse vectorizer") t0 = time() if opts.use_hashing: - vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False, - n_features=opts.n_features) + vectorizer = HashingVectorizer( + stop_words="english", alternate_sign=False, n_features=opts.n_features + ) X_train = vectorizer.transform(data_train.data) else: - vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, - stop_words='english') + vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english") X_train = vectorizer.fit_transform(data_train.data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration)) @@ -177,22 +197,17 @@ def size_mb(docs): feature_names = vectorizer.get_feature_names_out() if opts.select_chi2: - print("Extracting %d best features by a chi-squared test" % - opts.select_chi2) + print("Extracting %d best features by a chi-squared test" % opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) - if feature_names: + if feature_names is not None: # keep selected feature names - feature_names = [feature_names[i] for i - in ch2.get_support(indices=True)] + feature_names = [feature_names[i] for i in ch2.get_support(indices=True)] print("done in %fs" % (time() - t0)) print() -if feature_names: - feature_names = np.asarray(feature_names) - def trim(s): """Trim string to fit on terminal (assuming 80-column display)""" @@ -205,7 +220,7 @@ def trim(s): # We train and test the datasets with 15 different classification models # and get performance results for each model. 
def benchmark(clf): - print('_' * 80) + print("_" * 80) print("Training: ") print(clf) t0 = time() @@ -221,7 +236,7 @@ def benchmark(clf): score = metrics.accuracy_score(y_test, pred) print("accuracy: %0.3f" % score) - if hasattr(clf, 'coef_'): + if hasattr(clf, "coef_"): print("dimensionality: %d" % clf.coef_.shape[1]) print("density: %f" % density(clf.coef_)) @@ -234,67 +249,74 @@ def benchmark(clf): if opts.print_report: print("classification report:") - print(metrics.classification_report(y_test, pred, - target_names=target_names)) + print(metrics.classification_report(y_test, pred, target_names=target_names)) if opts.print_cm: print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() - clf_descr = str(clf).split('(')[0] + clf_descr = str(clf).split("(")[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ( - (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), - (Perceptron(max_iter=50), "Perceptron"), - (PassiveAggressiveClassifier(max_iter=50), - "Passive-Aggressive"), - (KNeighborsClassifier(n_neighbors=10), "kNN"), - (RandomForestClassifier(), "Random forest")): - print('=' * 80) + (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), + (Perceptron(max_iter=50), "Perceptron"), + (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"), + (KNeighborsClassifier(n_neighbors=10), "kNN"), + (RandomForestClassifier(), "Random forest"), +): + print("=" * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: - print('=' * 80) + print("=" * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model - results.append(benchmark(LinearSVC(penalty=penalty, dual=False, - tol=1e-3))) + results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3))) # Train SGD model - results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, - penalty=penalty))) + results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty))) # Train SGD with 
Elastic Net penalty -print('=' * 80) +print("=" * 80) print("Elastic-Net penalty") -results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, - penalty="elasticnet"))) +results.append( + benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet")) +) # Train NearestCentroid without threshold -print('=' * 80) +print("=" * 80) print("NearestCentroid (aka Rocchio classifier)") results.append(benchmark(NearestCentroid())) # Train sparse Naive Bayes classifiers -print('=' * 80) +print("=" * 80) print("Naive Bayes") -results.append(benchmark(MultinomialNB(alpha=.01))) -results.append(benchmark(BernoulliNB(alpha=.01))) -results.append(benchmark(ComplementNB(alpha=.1))) +results.append(benchmark(MultinomialNB(alpha=0.01))) +results.append(benchmark(BernoulliNB(alpha=0.01))) +results.append(benchmark(ComplementNB(alpha=0.1))) -print('=' * 80) +print("=" * 80) print("LinearSVC with L1-based feature selection") # The smaller C, the stronger the regularization. # The more regularization, the more sparsity. 
-results.append(benchmark(Pipeline([ - ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, - tol=1e-3))), - ('classification', LinearSVC(penalty="l2"))]))) +results.append( + benchmark( + Pipeline( + [ + ( + "feature_selection", + SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)), + ), + ("classification", LinearSVC(penalty="l2")), + ] + ) + ) +) # %% @@ -312,17 +334,16 @@ def benchmark(clf): plt.figure(figsize=(12, 8)) plt.title("Score") -plt.barh(indices, score, .2, label="score", color='navy') -plt.barh(indices + .3, training_time, .2, label="training time", - color='c') -plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange') +plt.barh(indices, score, 0.2, label="score", color="navy") +plt.barh(indices + 0.3, training_time, 0.2, label="training time", color="c") +plt.barh(indices + 0.6, test_time, 0.2, label="test time", color="darkorange") plt.yticks(()) -plt.legend(loc='best') -plt.subplots_adjust(left=.25) -plt.subplots_adjust(top=.95) -plt.subplots_adjust(bottom=.05) +plt.legend(loc="best") +plt.subplots_adjust(left=0.25) +plt.subplots_adjust(top=0.95) +plt.subplots_adjust(bottom=0.05) for i, c in zip(indices, clf_names): - plt.text(-.3, i, c) + plt.text(-0.3, i, c) plt.show() From 9b8834b4bb433572449848d1cba946fe0d7a633d Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 31 Aug 2021 15:38:03 -0400 Subject: [PATCH 081/100] DOC Update example with new api --- examples/text/plot_document_classification_20newsgroups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py index a7c88cd96d296..5351bb5bef3e3 100644 --- a/examples/text/plot_document_classification_20newsgroups.py +++ b/examples/text/plot_document_classification_20newsgroups.py @@ -204,7 +204,7 @@ def size_mb(docs): X_test = ch2.transform(X_test) if feature_names is not None: # keep selected feature names - feature_names = [feature_names[i] for i in ch2.get_support(indices=True)] + feature_names = feature_names[ch2.get_support()] print("done in %fs" % (time() - t0)) print() From 9034a5375419d9daf14e14d755425def6f26a617 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 31 Aug 2021 17:25:01 -0400 Subject: [PATCH 082/100] ENH More consistent input_features checking --- doc/glossary.rst | 2 +- ...ot_document_classification_20newsgroups.py | 205 ++++++++---------- sklearn/base.py | 16 +- sklearn/compose/_column_transformer.py | 12 +- .../compose/tests/test_column_transformer.py | 2 +- sklearn/datasets/descr/twenty_newsgroups.rst | 4 +- .../feature_extraction/_dict_vectorizer.py | 2 +- .../tests/test_dict_vectorizer.py | 1 + sklearn/feature_extraction/tests/test_text.py | 4 + sklearn/feature_extraction/text.py | 8 +- sklearn/feature_selection/_base.py | 21 +- sklearn/pipeline.py | 24 +- sklearn/preprocessing/_data.py | 18 -- sklearn/preprocessing/_discretization.py | 13 +- sklearn/preprocessing/_encoders.py | 47 ++-- sklearn/preprocessing/_polynomial.py | 60 +++-- sklearn/tests/test_common.py | 1 + sklearn/utils/_feature_names.py | 27 --- sklearn/utils/estimator_checks.py | 13 +- .../utils/tests/test_make_feature_names.py | 21 -- sklearn/utils/tests/test_validation.py | 5 + sklearn/utils/validation.py | 33 +++ 22 files 
changed, 281 insertions(+), 258 deletions(-) delete mode 100644 sklearn/utils/_feature_names.py delete mode 100644 sklearn/utils/tests/test_make_feature_names.py diff --git a/doc/glossary.rst b/doc/glossary.rst index 234ac80c654ff..7714276c9f8eb 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -1269,7 +1269,7 @@ Methods the estimator's :term:`transform` method. It outputs a list of strings and may take a list of strings as input, corresponding to the names of input columns from which output column names can - be generated. If `feature_names_in` is not passed in, then the + be generated. If `input_features` is not passed in, then the `feature_names_in_` attribute will be used. If the `feature_names_in_` attribute is not defined or `None`, then the input names are named x0, x1, ..., xn_features_out. diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py index 5351bb5bef3e3..2a250c5d0b634 100644 --- a/examples/text/plot_document_classification_20newsgroups.py +++ b/examples/text/plot_document_classification_20newsgroups.py @@ -45,60 +45,40 @@ # Display progress logs on stdout -logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s %(message)s') op = OptionParser() -op.add_option( - "--report", - action="store_true", - dest="print_report", - help="Print a detailed classification report.", -) -op.add_option( - "--chi2_select", - action="store", - type="int", - dest="select_chi2", - help="Select some number of features using a chi-squared test", -) -op.add_option( - "--confusion_matrix", - action="store_true", - dest="print_cm", - help="Print the confusion matrix.", -) -op.add_option( - "--top10", - action="store_true", - dest="print_top10", - help="Print ten most discriminative terms per class for every classifier.", -) -op.add_option( - "--all_categories", - 
action="store_true", - dest="all_categories", - help="Whether to use all categories or not.", -) -op.add_option("--use_hashing", action="store_true", help="Use a hashing vectorizer.") -op.add_option( - "--n_features", - action="store", - type=int, - default=2 ** 16, - help="n_features when using the hashing vectorizer.", -) -op.add_option( - "--filtered", - action="store_true", - help=( - "Remove newsgroup information that is easily overfit: " - "headers, signatures, and quoting." - ), -) +op.add_option("--report", + action="store_true", dest="print_report", + help="Print a detailed classification report.") +op.add_option("--chi2_select", + action="store", type="int", dest="select_chi2", + help="Select some number of features using a chi-squared test") +op.add_option("--confusion_matrix", + action="store_true", dest="print_cm", + help="Print the confusion matrix.") +op.add_option("--top10", + action="store_true", dest="print_top10", + help="Print ten most discriminative terms per class" + " for every classifier.") +op.add_option("--all_categories", + action="store_true", dest="all_categories", + help="Whether to use all categories or not.") +op.add_option("--use_hashing", + action="store_true", + help="Use a hashing vectorizer.") +op.add_option("--n_features", + action="store", type=int, default=2 ** 16, + help="n_features when using the hashing vectorizer.") +op.add_option("--filtered", + action="store_true", + help="Remove newsgroup information that is easily overfit: " + "headers, signatures, and quoting.") def is_interactive(): - return not hasattr(sys.modules["__main__"], "__file__") + return not hasattr(sys.modules['__main__'], '__file__') # work-around for Jupyter notebook and IPython console @@ -123,44 +103,44 @@ def is_interactive(): categories = None else: categories = [ - "alt.atheism", - "talk.religion.misc", - "comp.graphics", - "sci.space", + 'alt.atheism', + 'talk.religion.misc', + 'comp.graphics', + 'sci.space', ] if opts.filtered: - remove = 
("headers", "footers", "quotes") + remove = ('headers', 'footers', 'quotes') else: remove = () print("Loading 20 newsgroups dataset for categories:") print(categories if categories else "all") -data_train = fetch_20newsgroups( - subset="train", categories=categories, shuffle=True, random_state=42, remove=remove -) +data_train = fetch_20newsgroups(subset='train', categories=categories, + shuffle=True, random_state=42, + remove=remove) -data_test = fetch_20newsgroups( - subset="test", categories=categories, shuffle=True, random_state=42, remove=remove -) -print("data loaded") +data_test = fetch_20newsgroups(subset='test', categories=categories, + shuffle=True, random_state=42, + remove=remove) +print('data loaded') # order of labels in `target_names` can be different from `categories` target_names = data_train.target_names def size_mb(docs): - return sum(len(s.encode("utf-8")) for s in docs) / 1e6 + return sum(len(s.encode('utf-8')) for s in docs) / 1e6 data_train_size_mb = size_mb(data_train.data) data_test_size_mb = size_mb(data_test.data) -print( - "%d documents - %0.3fMB (training set)" % (len(data_train.data), data_train_size_mb) -) -print("%d documents - %0.3fMB (test set)" % (len(data_test.data), data_test_size_mb)) +print("%d documents - %0.3fMB (training set)" % ( + len(data_train.data), data_train_size_mb)) +print("%d documents - %0.3fMB (test set)" % ( + len(data_test.data), data_test_size_mb)) print("%d categories" % len(target_names)) print() @@ -170,12 +150,12 @@ def size_mb(docs): print("Extracting features from the training data using a sparse vectorizer") t0 = time() if opts.use_hashing: - vectorizer = HashingVectorizer( - stop_words="english", alternate_sign=False, n_features=opts.n_features - ) + vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False, + n_features=opts.n_features) X_train = vectorizer.transform(data_train.data) else: - vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english") + vectorizer 
= TfidfVectorizer(sublinear_tf=True, max_df=0.5, + stop_words='english') X_train = vectorizer.fit_transform(data_train.data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration)) @@ -197,7 +177,8 @@ def size_mb(docs): feature_names = vectorizer.get_feature_names_out() if opts.select_chi2: - print("Extracting %d best features by a chi-squared test" % opts.select_chi2) + print("Extracting %d best features by a chi-squared test" % + opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train, y_train) @@ -236,7 +217,7 @@ def benchmark(clf): score = metrics.accuracy_score(y_test, pred) print("accuracy: %0.3f" % score) - if hasattr(clf, "coef_"): + if hasattr(clf, 'coef_'): print("dimensionality: %d" % clf.coef_.shape[1]) print("density: %f" % density(clf.coef_)) @@ -249,74 +230,67 @@ def benchmark(clf): if opts.print_report: print("classification report:") - print(metrics.classification_report(y_test, pred, target_names=target_names)) + print(metrics.classification_report(y_test, pred, + target_names=target_names)) if opts.print_cm: print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() - clf_descr = str(clf).split("(")[0] + clf_descr = str(clf).split('(')[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ( - (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), - (Perceptron(max_iter=50), "Perceptron"), - (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"), - (KNeighborsClassifier(n_neighbors=10), "kNN"), - (RandomForestClassifier(), "Random forest"), -): - print("=" * 80) + (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), + (Perceptron(max_iter=50), "Perceptron"), + (PassiveAggressiveClassifier(max_iter=50), + "Passive-Aggressive"), + (KNeighborsClassifier(n_neighbors=10), "kNN"), + (RandomForestClassifier(), "Random forest")): + print('=' * 80) print(name) 
results.append(benchmark(clf)) for penalty in ["l2", "l1"]: - print("=" * 80) + print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model - results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3))) + results.append(benchmark(LinearSVC(penalty=penalty, dual=False, + tol=1e-3))) # Train SGD model - results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty))) + results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, + penalty=penalty))) # Train SGD with Elastic Net penalty -print("=" * 80) +print('=' * 80) print("Elastic-Net penalty") -results.append( - benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet")) -) +results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, + penalty="elasticnet"))) # Train NearestCentroid without threshold -print("=" * 80) +print('=' * 80) print("NearestCentroid (aka Rocchio classifier)") results.append(benchmark(NearestCentroid())) # Train sparse Naive Bayes classifiers -print("=" * 80) +print('=' * 80) print("Naive Bayes") -results.append(benchmark(MultinomialNB(alpha=0.01))) -results.append(benchmark(BernoulliNB(alpha=0.01))) -results.append(benchmark(ComplementNB(alpha=0.1))) +results.append(benchmark(MultinomialNB(alpha=.01))) +results.append(benchmark(BernoulliNB(alpha=.01))) +results.append(benchmark(ComplementNB(alpha=.1))) -print("=" * 80) +print('=' * 80) print("LinearSVC with L1-based feature selection") # The smaller C, the stronger the regularization. # The more regularization, the more sparsity. 
-results.append( - benchmark( - Pipeline( - [ - ( - "feature_selection", - SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)), - ), - ("classification", LinearSVC(penalty="l2")), - ] - ) - ) -) +results.append(benchmark(Pipeline([ + ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, + tol=1e-3))), + ('classification', LinearSVC(penalty="l2"))]))) # %% @@ -334,16 +308,17 @@ def benchmark(clf): plt.figure(figsize=(12, 8)) plt.title("Score") -plt.barh(indices, score, 0.2, label="score", color="navy") -plt.barh(indices + 0.3, training_time, 0.2, label="training time", color="c") -plt.barh(indices + 0.6, test_time, 0.2, label="test time", color="darkorange") +plt.barh(indices, score, .2, label="score", color='navy') +plt.barh(indices + .3, training_time, .2, label="training time", + color='c') +plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange') plt.yticks(()) -plt.legend(loc="best") -plt.subplots_adjust(left=0.25) -plt.subplots_adjust(top=0.95) -plt.subplots_adjust(bottom=0.05) +plt.legend(loc='best') +plt.subplots_adjust(left=.25) +plt.subplots_adjust(top=.95) +plt.subplots_adjust(bottom=.05) for i, c in zip(indices, clf_names): - plt.text(-0.3, i, c) + plt.text(-.3, i, c) plt.show() diff --git a/sklearn/base.py b/sklearn/base.py index 6639d8f1e42ea..2803299326fda 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -21,8 +21,7 @@ ) from .utils.validation import check_X_y from .utils.validation import check_array -from .utils._feature_names import _make_feature_names -from .utils.validation import _check_y +from .utils.validation import _check_y, _make_feature_names_in from .utils.validation import _num_features from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _get_feature_names @@ -913,17 +912,20 @@ def get_feature_names_out(self, input_features=None): Parameters ---------- input_features : array-like of str or None, default=None - Input features. 
If None, they are generated as - `[x0, x1, ..., xn_features]`. + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. Returns ------- feature_names_out : ndarray of str Transformed feature names. """ - if input_features is None and hasattr(self, "feature_names_in_"): - input_features = self.feature_names_in_ - return _make_feature_names(self.n_features_in_, input_features=input_features) + return _make_feature_names_in(self, input_features) class MetaEstimatorMixin: diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 3304e68413a5a..a8b474a3964e2 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -22,9 +22,8 @@ from ..utils import _safe_indexing from ..utils import _get_column_indices from ..utils.deprecation import deprecated -from ..utils._feature_names import _make_feature_names from ..utils.metaestimators import _BaseComposition -from ..utils.validation import check_array, check_is_fitted +from ..utils.validation import check_array, check_is_fitted, _make_feature_names_in from ..utils.fixes import delayed @@ -491,16 +490,13 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. 
""" check_is_fitted(self) - if hasattr(self, "feature_names_in_"): - feature_names_in = self.feature_names_in_ - else: - feature_names_in = _make_feature_names(self.n_features_in_) + input_features = _make_feature_names_in(self, input_features) # List of tuples (name, feature_names_out) transformer_with_feature_names_out = [] for name, trans, column, _ in self._iter(fitted=True): feature_names_out = self._get_feature_name_out_for_transformer( - name, trans, column, feature_names_in + name, trans, column, input_features ) if feature_names_out is None: continue @@ -542,7 +538,7 @@ def get_feature_names_out(self, input_features=None): ) return np.concatenate( - [name for _, name in transformer_with_feature_names_out], dtype=object + [name for _, name in transformer_with_feature_names_out], ) def _update_fitted_transformers(self, transformers): diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 8f024cd8e6c6f..82b2b668b4a67 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1687,7 +1687,7 @@ def __init__(self, feature_names_out=None): def get_feature_names_out(self, input_features=None): if self.feature_names_out is not None: - return self.feature_names_out + return np.asarray(self.feature_names_out, dtype=object) return input_features diff --git a/sklearn/datasets/descr/twenty_newsgroups.rst b/sklearn/datasets/descr/twenty_newsgroups.rst index 3814fb97d1e42..0acb2c8191905 100644 --- a/sklearn/datasets/descr/twenty_newsgroups.rst +++ b/sklearn/datasets/descr/twenty_newsgroups.rst @@ -116,7 +116,7 @@ components by sample in a more than 30000-dimensional space >>> vectors.nnz / float(vectors.shape[0]) 159.01327... -:func:`sklearn.datasets.fetch_20newsgroups_vectorized` is a function which +:func:`sklearn.datasets.fetch_20newsgroups_vectorized` is a function which returns ready-to-use token counts features instead of file names. .. 
_`20 newsgroups website`: http://people.csail.mit.edu/jrennie/20Newsgroups/ @@ -156,7 +156,7 @@ Let's take a look at what the most informative features are: >>> import numpy as np >>> def show_top10(classifier, vectorizer, categories): - ... feature_names = np.asarray(vectorizer.get_feature_names_out()) + ... feature_names = vectorizer.get_feature_names_out() ... for i, category in enumerate(categories): ... top10 = np.argsort(classifier.coef_[i])[-10:] ... print("%s: %s" % (category, " ".join(feature_names[top10]))) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 1148b4333736d..042a85fefb518 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -400,7 +400,7 @@ def get_feature_names_out(self, input_features=None): feature_names = [str(name) for name in self.feature_names_] else: feature_names = self.feature_names_ - return np.array(feature_names) + return np.asarray(feature_names, dtype=object) def restrict(self, support, indices=False): """Restrict the features to those in support using feature selection. 
diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 0e7b9513865ad..ebdb0f084e67d 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -258,4 +258,5 @@ def test_dict_vectorizer_get_feature_names_out(): feature_names = dv.get_feature_names_out() assert isinstance(feature_names, np.ndarray) + assert feature_names.dtype == object assert_array_equal(feature_names, ["1", "2", "3"]) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index abb408669c0c6..ed9bff59db952 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -668,6 +668,10 @@ def test_feature_names(get_names): feature_names = getattr(cv, get_names)() if get_names == "get_feature_names_out": assert isinstance(feature_names, np.ndarray) + assert feature_names.dtype == object + else: + # get_feature_names + assert isinstance(feature_names, list) assert len(feature_names) == n_features assert_array_equal( diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 77312427a15ae..8d20488f72bca 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1400,7 +1400,9 @@ def get_feature_names(self): feature_names : list A list of feature names. """ - return self.get_feature_names_out() + self._check_vocabulary() + + return [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))] def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. @@ -1416,9 +1418,9 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. 
""" self._check_vocabulary() - return np.asarray( - [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))] + [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))], + dtype=object, ) def _more_tags(self): diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index d01f660293501..6b178ced6011c 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -18,7 +18,7 @@ safe_sqr, ) from ..utils._tags import _safe_tags -from ..utils._feature_names import _make_feature_names +from ..utils.validation import _make_feature_names_in class SelectorMixin(TransformerMixin, metaclass=ABCMeta): @@ -145,22 +145,21 @@ def get_feature_names_out(self, input_features=None): Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, they are generated as - `[x0, x1, ..., xn_features]`. + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. Returns ------- feature_names_out : ndarray of str Transformed feature names. 
""" - if input_features is None and hasattr(self, "feature_names_in_"): - input_features = self.feature_names_in_ - - mask = self.get_support() - input_features = _make_feature_names( - mask.shape[0], input_features=input_features - ) - return input_features[mask] + input_features = _make_feature_names_in(self, input_features) + return input_features[self.get_support()] def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1): diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index f95dc39e07317..549fc3bc08aad 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -27,6 +27,7 @@ from .utils._tags import _safe_tags from .utils.validation import check_memory from .utils.validation import check_is_fitted +from .utils.validation import _make_feature_names_in from .utils.fixes import delayed from .exceptions import NotFittedError @@ -677,20 +678,25 @@ def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. Transform input features using the pipeline. - If the last step is a transformer, it's included - in the transformation, otherwise it's not. Parameters ---------- input_features : array-like of str or None, default=None Input features. + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + If `n_features_in_` is not defined, then `None` is used. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + Returns ------- feature_names_out : ndarray of str Transformed feature names. 
""" - feature_names = input_features + feature_names = _make_feature_names_in(self, input_features) for _, name, transform in self._iter(): if not hasattr(transform, "get_feature_names_out"): raise AttributeError( @@ -1036,11 +1042,19 @@ def get_feature_names_out(self, input_features=None): input_features : array-like of str or None, default=None Input features. + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + If `n_features_in_` is not defined, then `None` is used. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + Returns ------- feature_names_out : ndarray of str Transformed feature names. """ + input_features = _make_feature_names_in(self, input_features) feature_names = [] for name, trans, _ in self._iter(): if not hasattr(trans, "get_feature_names_out"): @@ -1049,9 +1063,9 @@ def get_feature_names_out(self, input_features=None): % (str(name), type(trans).__name__) ) feature_names.extend( - [name + "__" + f for f in trans.get_feature_names_out(input_features)] + [f"{name}__{f}" for f in trans.get_feature_names_out(input_features)] ) - return np.asarray(feature_names) + return np.asarray(feature_names, dtype=object) def fit(self, X, y=None, **fit_params): """Fit all transformers using X. diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index d869375b43a5c..98a7658905809 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -37,7 +37,6 @@ _check_sample_weight, FLOAT_DTYPES, ) -from ..utils._feature_names import _make_feature_names from ._encoders import OneHotEncoder @@ -2276,23 +2275,6 @@ def transform(self, K, copy=True): return K - def get_feature_names_out(self, input_features=None): - """Get output feature names. 
- - Parameters - ---------- - input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. - - Returns - ------- - feature_names_out : ndarray of str - Transformed feature names. - """ - return _make_feature_names( - n_features=self.K_fit_rows_.shape[0], prefix=type(self).__name__.lower() - ) - def _more_tags(self): return {"pairwise": True} diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 9d716afd6ed35..d60eb97562195 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -15,6 +15,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import check_array from ..utils.validation import check_is_fitted +from ..utils.validation import _make_feature_names_in class KBinsDiscretizer(TransformerMixin, BaseEstimator): @@ -383,14 +384,18 @@ def get_feature_names_out(self, input_features=None): Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, they are generated as - `[x0, x1, ..., xn_features]`. + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. Returns ------- feature_names_out : ndarray of str Transformed feature names. 
""" - if input_features is None and hasattr(self, "feature_names_in_"): - input_features = self.feature_names_in_ + input_features = _make_feature_names_in(self, input_features) return self._encoder.get_feature_names_out(input_features) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index f4099a26bbf90..bae95677366f3 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -11,6 +11,7 @@ from ..utils import check_array, is_scalar_nan from ..utils.deprecation import deprecated from ..utils.validation import check_is_fitted +from ..utils.validation import _make_feature_names_in from ..utils._mask import _get_mask from ..utils._encode import _encode, _check_unknown, _unique @@ -71,9 +72,11 @@ def _get_feature(self, X, feature_idx): return X[:, feature_idx] def _fit(self, X, handle_unknown="error", force_all_finite=True): + self._check_feature_names(X, reset=True) X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite ) + self.n_features_in_ = n_features if self.categories != "auto": if len(self.categories) != n_features: @@ -115,6 +118,7 @@ def _fit(self, X, handle_unknown="error", force_all_finite=True): def _transform( self, X, handle_unknown="error", force_all_finite=True, warn_on_unknown=False ): + self._check_feature_names(X, reset=False) X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite ) @@ -284,6 +288,11 @@ class OneHotEncoder(_BaseEncoder): .. versionchanged:: 0.23 Added the possibility to contain `None` values. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. @@ -679,7 +688,25 @@ def get_feature_names(self, input_features=None): output_feature_names : ndarray of shape (n_output_features,) Array of feature names. 
""" - feature_names = self.get_feature_names_out(input_features) + check_is_fitted(self) + cats = self.categories_ + if input_features is None: + input_features = ["x%d" % i for i in range(len(cats))] + elif len(input_features) != len(self.categories_): + raise ValueError( + "input_features should have length equal to number of " + "features ({}), got {}".format( + len(self.categories_), len(input_features) + ) + ) + + feature_names = [] + for i in range(len(cats)): + names = [input_features[i] + "_" + str(t) for t in cats[i]] + if self.drop_idx_ is not None and self.drop_idx_[i] is not None: + names.pop(self.drop_idx_[i]) + feature_names.extend(names) + return np.array(feature_names, dtype=object) def get_feature_names_out(self, input_features=None): @@ -701,18 +728,7 @@ def get_feature_names_out(self, input_features=None): """ check_is_fitted(self) cats = self.categories_ - if input_features is None: - if hasattr(self, "feature_names_in_"): - input_features = self.feature_names_in_ - else: - input_features = ["x%d" % i for i in range(len(cats))] - elif len(input_features) != len(self.categories_): - raise ValueError( - "input_features should have length equal to number of " - "features ({}), got {}".format( - len(self.categories_), len(input_features) - ) - ) + input_features = _make_feature_names_in(self, input_features) feature_names = [] for i in range(len(cats)): @@ -776,6 +792,11 @@ class OrdinalEncoder(_BaseEncoder): the features in X and corresponding with the output of ``transform``). This does not include categories that weren't seen during ``fit``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. 
diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 4e8b481b1a174..3b01725f395e5 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -15,8 +15,8 @@ from ..utils import check_array from ..utils.deprecation import deprecated from ..utils.fixes import linspace -from ..utils._feature_names import _make_feature_names from ..utils.validation import check_is_fitted, FLOAT_DTYPES, _check_sample_weight +from ..utils.validation import _make_feature_names_in from ..utils.stats import _weighted_percentile from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -212,7 +212,23 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ - return self.get_feature_names_out(input_features).tolist() + powers = self.powers_ + if input_features is None: + input_features = ["x%d" % i for i in range(powers.shape[1])] + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join( + "%s^%d" % (input_features[ind], exp) + if exp != 1 + else input_features[ind] + for ind, exp in zip(inds, row[inds]) + ) + else: + name = "1" + feature_names.append(name) + return feature_names def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. @@ -220,8 +236,13 @@ def get_feature_names_out(self, input_features=None): Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, they are generated as - `[x0, x1, ..., xn_features]`. + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. 
Returns ------- @@ -229,12 +250,7 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. """ powers = self.powers_ - - if input_features is None and hasattr(self, "feature_names_in_"): - input_features = self.feature_names_in_ - input_features = _make_feature_names( - n_features=powers.shape[1], input_features=input_features - ) + input_features = _make_feature_names_in(self, input_features) feature_names = [] for row in powers: inds = np.where(row)[0] @@ -679,7 +695,13 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ - return self.get_feature_names_out(input_features).tolist() + n_splines = self.bsplines_[0].c.shape[0] + input_features = _make_feature_names_in(self, input_features) + feature_names = [] + for i in range(self.n_features_in_): + for j in range(n_splines - 1 + self.include_bias): + feature_names.append(f"{input_features[i]}_sp_{j}") + return feature_names def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. @@ -687,8 +709,13 @@ def get_feature_names_out(self, input_features=None): Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, they are generated as - `[x0, x1, ..., xn_features]`. + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. Returns ------- @@ -696,12 +723,7 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. 
""" n_splines = self.bsplines_[0].c.shape[0] - if input_features is None: - if hasattr(self, "feature_names_in_"): - input_features = self.feature_names_in_ - else: - input_features = ["x%d" % i for i in range(self.n_features_in_)] - + input_features = _make_feature_names_in(self, input_features) feature_names = [] for i in range(self.n_features_in_): for j in range(n_splines - 1 + self.include_bias): diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index f1ed7f1fb6193..591514380f9ef 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -347,6 +347,7 @@ def test_pandas_column_name_consistency(estimator): "impute", "isotonic", "kernel_approximation", + "preprocessing", "manifold", "neighbors", "neural_network", diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py deleted file mode 100644 index 877630d8ff41f..0000000000000 --- a/sklearn/utils/_feature_names.py +++ /dev/null @@ -1,27 +0,0 @@ -import numpy as np - - -def _make_feature_names(n_features, prefix="x", input_features=None): - """Make feature name strings from n_features. - - Either returns input_feature names if it is not None, or creates - placeholder names based on n_features, by default, - ['x0', 'x1', ..., 'xn_features'] is generated. - - Parameters - ---------- - n_features : int - Number of feature names to generate. - prefix : str, default='x' - Prefix for each feature name. - input_features : array-like of str - Optional existing input features, returned unchanged if not None. - - Returns - ------- - feature_names : ndarray of str - Generated feature names of length n_features. 
- """ - if input_features is not None: - return np.asarray(input_features) - return np.asarray([f"{prefix}{i}" for i in range(n_features)]) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 338a6bb205e64..8c6157e9ee784 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3857,8 +3857,12 @@ def check_transformer_get_feature_names_out(name, transformer_orig): y_[::2, 1] *= 2 X_transform = transformer.fit_transform(X, y=y_) + input_features = [f"feature{i}" for i in range(n_features)] + + # input_features names is not the same length as n_features_in_ + with raises(ValueError, match="input_features should have length equal"): + transformer.get_feature_names_out(input_features[::2]) - input_features = ["feature%d" % i for i in range(n_features)] feature_names_out = transformer.get_feature_names_out(input_features) assert feature_names_out is not None assert isinstance(feature_names_out, np.ndarray) @@ -3908,10 +3912,15 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig): y_ = np.c_[np.asarray(y), np.asarray(y)] y_[::2, 1] *= 2 - feature_names_in = ["col%d" % i for i in range(n_features)] + feature_names_in = [f"col{i}" for i in range(n_features)] df = pd.DataFrame(X, columns=feature_names_in) X_transform = transformer.fit_transform(df, y=y_) + # error is raised when `input_features` do not match feature_names_in + invalid_feature_names = [f"bad{i}" for i in range(n_features)] + with raises(ValueError, match="input_features is not equal to feature_names_in_"): + transformer.get_feature_names_out(invalid_feature_names) + feature_names_out_default = transformer.get_feature_names_out() feature_names_in_explicit_names = transformer.get_feature_names_out( feature_names_in diff --git a/sklearn/utils/tests/test_make_feature_names.py b/sklearn/utils/tests/test_make_feature_names.py deleted file mode 100644 index 99904ddbec138..0000000000000 --- 
a/sklearn/utils/tests/test_make_feature_names.py +++ /dev/null @@ -1,21 +0,0 @@ -import pytest - -import numpy as np -from numpy.testing import assert_array_equal -from sklearn.utils._feature_names import _make_feature_names - - -@pytest.mark.parametrize( - "n_features, prefix, input_features, expected_names", - [ - (3, "x", None, ["x0", "x1", "x2"]), - (4, "x", ["cat", "dog", "snake"], ["cat", "dog", "snake"]), - (4, "pca", None, ["pca0", "pca1", "pca2", "pca3"]), - ], -) -def test_make_feature_names(n_features, prefix, input_features, expected_names): - feature_names = _make_feature_names( - n_features=n_features, prefix=prefix, input_features=input_features - ) - assert isinstance(feature_names, np.ndarray) - assert_array_equal(expected_names, feature_names) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 45c24d696cd40..a279e85450160 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1562,3 +1562,8 @@ def test_get_feature_names_invalid_dtypes_warns(names, dtypes): with pytest.warns(FutureWarning, match=msg): names = _get_feature_names(X) assert names is None + + +def test_make_feature_names_in(): + """Check make_features_names_in""" + pass diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4dd4556c6507c..7b696ea862842 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1668,3 +1668,36 @@ def _get_feature_names(X): # Only feature names of all strings are supported if types[0] == "str": return feature_names + + +def _make_feature_names_in(estimator, input_features): + """Make feature_names_in_ from estimator. + + Used to validate input from `get_feature_names_out(input_features=None)`. 
+ """ + feature_names_in_ = getattr(estimator, "feature_names_in_", None) + if input_features is not None: + input_features = np.asarray(input_features, dtype=object) + if feature_names_in_ is not None and not np.array_equal( + feature_names_in_, input_features + ): + raise ValueError("input_features is not equal to feature_names_in_") + + if ( + hasattr(estimator, "n_features_in_") + and len(input_features) != estimator.n_features_in_ + ): + raise ValueError( + "input_features should have length equal to number of " + f"features ({estimator.n_features_in_}), got {len(input_features)}" + ) + return input_features + + if feature_names_in_ is not None: + return feature_names_in_ + + # Only generates features if `n_features_in_` is defined + if hasattr(estimator, "n_features_in_"): + return np.asarray( + [f"x{i}" for i in range(estimator.n_features_in_)], dtype=object + ) From 9081ebd065a25b51ad22689ee50916ff1565fdc6 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 31 Aug 2021 17:26:32 -0400 Subject: [PATCH 083/100] WIP Better --- examples/text/plot_document_classification_20newsgroups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py index 2a250c5d0b634..7f7bc422808dc 100644 --- a/examples/text/plot_document_classification_20newsgroups.py +++ b/examples/text/plot_document_classification_20newsgroups.py @@ -201,7 +201,7 @@ def trim(s): # We train and test the datasets with 15 different classification models # and get performance results for each model. def benchmark(clf): - print("_" * 80) + print('_' * 80) print("Training: ") print(clf) t0 = time() From a4ce56758ba019ae1eefc039b4566b71e077b9f0 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 31 Aug 2021 17:30:03 -0400 Subject: [PATCH 084/100] ENH Add prefix_features_names_out to make_column_transformer --- sklearn/base.py | 34 +------------------------- sklearn/compose/_column_transformer.py | 16 +++++++++++- sklearn/preprocessing/_data.py | 18 +++++++------- 3 files changed, 25 insertions(+), 43 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 2803299326fda..a585b2b06c394 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -21,7 +21,7 @@ ) from .utils.validation import check_X_y from .utils.validation import check_array -from .utils.validation import _check_y, _make_feature_names_in +from .utils.validation import _check_y from .utils.validation import _num_features from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _get_feature_names @@ -896,38 +896,6 @@ def fit_predict(self, X, y=None): return self.fit(X).predict(X) -class OneToOneFeatureMixin: - """Provides `get_feature_names_out` for simple transformers. - - Assumes there's a 1-to-1 correspondence between input features - and output features. - """ - - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Returns `input_features` as this transformation doesn't add or drop - features. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Input features. - - - If `input_features` is `None`, then `feature_names_in_` is - used as feature names in. If `feature_names_in_` is not defined, - then names are generated: `[x0, x1, ..., x(n_features_in_)]`. - - If `input_features` is an array-like, then `input_features` must - match `feature_names_in_` if `feature_names_in_` is defined. - - Returns - ------- - feature_names_out : ndarray of str - Transformed feature names. 
- """ - return _make_feature_names_in(self, input_features) - - class MetaEstimatorMixin: _required_parameters = ["estimator"] """Mixin class for all meta estimators in scikit-learn.""" diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index a8b474a3964e2..7a9facfac5543 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -868,7 +868,12 @@ def _get_transformer_list(estimators): def make_column_transformer( - *transformers, remainder="drop", sparse_threshold=0.3, n_jobs=None, verbose=False + *transformers, + remainder="drop", + sparse_threshold=0.3, + n_jobs=None, + verbose=False, + prefix_feature_names_out=True, ): """Construct a ColumnTransformer from the given transformers. @@ -931,6 +936,14 @@ def make_column_transformer( If True, the time elapsed while fitting each transformer will be printed as it is completed. + prefix_feature_names_out : bool, default=True + If True, :meth:`get_feature_names_out` will prefix all feature names + with the name of the transformer that generated that feature. + If False, :meth:`get_feature_names_out` will not prefix any feature + names and will error if feature names collide. + + .. 
versionadded:: 1.0 + Returns ------- ct : ColumnTransformer @@ -963,6 +976,7 @@ def make_column_transformer( remainder=remainder, sparse_threshold=sparse_threshold, verbose=verbose, + prefix_feature_names_out=prefix_feature_names_out, ) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 98a7658905809..f7cfe3e023783 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -16,7 +16,7 @@ from scipy import optimize from scipy.special import boxcox -from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin +from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.deprecation import deprecated from ..utils.extmath import row_norms @@ -262,7 +262,7 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): return X -class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class MinMaxScaler(TransformerMixin, BaseEstimator): """Transform features by scaling each feature to a given range. This estimator scales and translates each feature individually such @@ -631,7 +631,7 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): return X -class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class StandardScaler(TransformerMixin, BaseEstimator): """Standardize features by removing the mean and scaling to unit variance. The standard score of a sample `x` is calculated as: @@ -1046,7 +1046,7 @@ def _more_tags(self): return {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]} -class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class MaxAbsScaler(TransformerMixin, BaseEstimator): """Scale each feature by its maximum absolute value. 
This estimator scales and translates each feature individually such @@ -1348,7 +1348,7 @@ def maxabs_scale(X, *, axis=0, copy=True): return X -class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class RobustScaler(TransformerMixin, BaseEstimator): """Scale features using statistics that are robust to outliers. This Scaler removes the median and scales the data according to @@ -1841,7 +1841,7 @@ def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False): return X -class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class Normalizer(TransformerMixin, BaseEstimator): """Normalize samples individually to unit norm. Each sample (i.e. each row of the data matrix) with at least one @@ -2012,7 +2012,7 @@ def binarize(X, *, threshold=0.0, copy=True): return X -class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class Binarizer(TransformerMixin, BaseEstimator): """Binarize data (set feature values to 0 or 1) according to a threshold. Values greater than the threshold map to 1, while values less than @@ -2346,7 +2346,7 @@ def add_dummy_feature(X, value=1.0): return np.hstack((np.full((n_samples, 1), value), X)) -class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class QuantileTransformer(TransformerMixin, BaseEstimator): """Transform features using quantiles information. This method transforms the features to follow a uniform or a normal @@ -2931,7 +2931,7 @@ def quantile_transform( ) -class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class PowerTransformer(TransformerMixin, BaseEstimator): """Apply a power transform featurewise to make data more Gaussian-like. Power transforms are a family of parametric, monotonic transformations From 12a2052563e983739a9b2e99779bbd3a18449807 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 31 Aug 2021 17:55:39 -0400 Subject: [PATCH 085/100] ENH Use in one example --- .../plot_linear_model_coefficient_interpretation.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 2736e358f7871..89c27110e5d16 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -133,7 +133,9 @@ numerical_columns = ["EDUCATION", "EXPERIENCE", "AGE"] preprocessor = make_column_transformer( - (OneHotEncoder(drop="if_binary"), categorical_columns), remainder="passthrough" + (OneHotEncoder(drop="if_binary"), categorical_columns), + remainder="passthrough", + prefix_feature_names_out=False, ) # %% @@ -199,13 +201,7 @@ # # First of all, we can take a look to the values of the coefficients of the # regressor we have fitted. - -feature_names = ( - model.named_steps["columntransformer"] - .named_transformers_["onehotencoder"] - .get_feature_names_out(input_features=categorical_columns) -) -feature_names = np.concatenate([feature_names, numerical_columns]) +feature_names = model[:-1].get_feature_names_out() coefs = pd.DataFrame( model.named_steps["transformedtargetregressor"].regressor_.coef_, From aece40262e1995f60a58f3d56664f857af589ff5 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 31 Aug 2021 17:57:36 -0400 Subject: [PATCH 086/100] REV Remove --- sklearn/utils/tests/test_validation.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index a279e85450160..45c24d696cd40 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1562,8 +1562,3 @@ def test_get_feature_names_invalid_dtypes_warns(names, dtypes): with pytest.warns(FutureWarning, match=msg): names = _get_feature_names(X) assert names is None - - -def test_make_feature_names_in(): - """Check make_features_names_in""" - pass From ea31c18f8012afa59dd5b5ec52c392da98a19d1e Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 31 Aug 2021 17:58:30 -0400 Subject: [PATCH 087/100] CLN Adjust name --- sklearn/compose/_column_transformer.py | 4 ++-- sklearn/feature_selection/_base.py | 4 ++-- sklearn/pipeline.py | 6 +++--- sklearn/preprocessing/_discretization.py | 4 ++-- sklearn/preprocessing/_encoders.py | 4 ++-- sklearn/preprocessing/_polynomial.py | 8 ++++---- sklearn/utils/validation.py | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 7a9facfac5543..f09a09cc0d97c 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -23,7 +23,7 @@ from ..utils import _get_column_indices from ..utils.deprecation import deprecated from ..utils.metaestimators import _BaseComposition -from ..utils.validation import check_array, check_is_fitted, _make_feature_names_in +from ..utils.validation import check_array, check_is_fitted, _check_feature_names_in from ..utils.fixes import delayed @@ -490,7 +490,7 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. 
""" check_is_fitted(self) - input_features = _make_feature_names_in(self, input_features) + input_features = _check_feature_names_in(self, input_features) # List of tuples (name, feature_names_out) transformer_with_feature_names_out = [] diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 6b178ced6011c..c1eb23bcae852 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -18,7 +18,7 @@ safe_sqr, ) from ..utils._tags import _safe_tags -from ..utils.validation import _make_feature_names_in +from ..utils.validation import _check_feature_names_in class SelectorMixin(TransformerMixin, metaclass=ABCMeta): @@ -158,7 +158,7 @@ def get_feature_names_out(self, input_features=None): feature_names_out : ndarray of str Transformed feature names. """ - input_features = _make_feature_names_in(self, input_features) + input_features = _check_feature_names_in(self, input_features) return input_features[self.get_support()] diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 549fc3bc08aad..27bfe0fcf1db4 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -27,7 +27,7 @@ from .utils._tags import _safe_tags from .utils.validation import check_memory from .utils.validation import check_is_fitted -from .utils.validation import _make_feature_names_in +from .utils.validation import _check_feature_names_in from .utils.fixes import delayed from .exceptions import NotFittedError @@ -696,7 +696,7 @@ def get_feature_names_out(self, input_features=None): feature_names_out : ndarray of str Transformed feature names. """ - feature_names = _make_feature_names_in(self, input_features) + feature_names = _check_feature_names_in(self, input_features) for _, name, transform in self._iter(): if not hasattr(transform, "get_feature_names_out"): raise AttributeError( @@ -1054,7 +1054,7 @@ def get_feature_names_out(self, input_features=None): feature_names_out : ndarray of str Transformed feature names. 
""" - input_features = _make_feature_names_in(self, input_features) + input_features = _check_feature_names_in(self, input_features) feature_names = [] for name, trans, _ in self._iter(): if not hasattr(trans, "get_feature_names_out"): diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d60eb97562195..17fc854c1a8dc 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -15,7 +15,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import check_array from ..utils.validation import check_is_fitted -from ..utils.validation import _make_feature_names_in +from ..utils.validation import _check_feature_names_in class KBinsDiscretizer(TransformerMixin, BaseEstimator): @@ -397,5 +397,5 @@ def get_feature_names_out(self, input_features=None): feature_names_out : ndarray of str Transformed feature names. """ - input_features = _make_feature_names_in(self, input_features) + input_features = _check_feature_names_in(self, input_features) return self._encoder.get_feature_names_out(input_features) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index bae95677366f3..ab1c010e68429 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -11,7 +11,7 @@ from ..utils import check_array, is_scalar_nan from ..utils.deprecation import deprecated from ..utils.validation import check_is_fitted -from ..utils.validation import _make_feature_names_in +from ..utils.validation import _check_feature_names_in from ..utils._mask import _get_mask from ..utils._encode import _encode, _check_unknown, _unique @@ -728,7 +728,7 @@ def get_feature_names_out(self, input_features=None): """ check_is_fitted(self) cats = self.categories_ - input_features = _make_feature_names_in(self, input_features) + input_features = _check_feature_names_in(self, input_features) feature_names = [] for i in range(len(cats)): diff 
--git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 3b01725f395e5..b0aef94cde1f1 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -16,7 +16,7 @@ from ..utils.deprecation import deprecated from ..utils.fixes import linspace from ..utils.validation import check_is_fitted, FLOAT_DTYPES, _check_sample_weight -from ..utils.validation import _make_feature_names_in +from ..utils.validation import _check_feature_names_in from ..utils.stats import _weighted_percentile from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -250,7 +250,7 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. """ powers = self.powers_ - input_features = _make_feature_names_in(self, input_features) + input_features = _check_feature_names_in(self, input_features) feature_names = [] for row in powers: inds = np.where(row)[0] @@ -696,7 +696,7 @@ def get_feature_names(self, input_features=None): output_feature_names : list of str of shape (n_output_features,) """ n_splines = self.bsplines_[0].c.shape[0] - input_features = _make_feature_names_in(self, input_features) + input_features = _check_feature_names_in(self, input_features) feature_names = [] for i in range(self.n_features_in_): for j in range(n_splines - 1 + self.include_bias): @@ -723,7 +723,7 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. 
""" n_splines = self.bsplines_[0].c.shape[0] - input_features = _make_feature_names_in(self, input_features) + input_features = _check_feature_names_in(self, input_features) feature_names = [] for i in range(self.n_features_in_): for j in range(n_splines - 1 + self.include_bias): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 7b696ea862842..6b18c93129114 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1670,7 +1670,7 @@ def _get_feature_names(X): return feature_names -def _make_feature_names_in(estimator, input_features): +def _check_feature_names_in(estimator, input_features): """Make feature_names_in_ from estimator. Used to validate input from `get_feature_names_out(input_features=None)`. From 13d406b6bd4f7e41b4af2668cffee58bbbb95973 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 31 Aug 2021 18:01:44 -0400 Subject: [PATCH 088/100] DOC Adjust docstring --- sklearn/compose/_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index f09a09cc0d97c..5db84315a8ba6 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -116,7 +116,7 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): If True, :meth:`get_feature_names_out` will prefix all feature names with the name of the transformer that generated that feature. If False, :meth:`get_feature_names_out` will not prefix any feature - names and will error if feature names collide. + names and will error if feature names are not unique. .. versionadded:: 1.0 @@ -940,7 +940,7 @@ def make_column_transformer( If True, :meth:`get_feature_names_out` will prefix all feature names with the name of the transformer that generated that feature. If False, :meth:`get_feature_names_out` will not prefix any feature - names and will error if feature names collide. 
+ names and will error if feature names are not unique. .. versionadded:: 1.0 From d04ecec0cc1925a9d8ae789908119672198481c9 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 31 Aug 2021 18:04:50 -0400 Subject: [PATCH 089/100] CLN Remove unneeded code --- sklearn/compose/_column_transformer.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 5db84315a8ba6..daae579563494 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -423,29 +423,6 @@ def get_feature_names(self): feature_names.extend([f"{name}__{f}" for f in trans.get_feature_names()]) return feature_names - def _get_feature_names_out(self, get_names): - """Private function to be used by get_feature_names*.""" - # TODO(1.2): This should be removed and integrated into - # get_feature_names_out when get_feature_names is deprecated. - feature_names = [] - for name, trans, column, _ in self._iter(fitted=True): - if trans == "drop" or _is_empty_column_selection(column): - continue - if trans == "passthrough": - if hasattr(self, "feature_names_in_"): - if (not isinstance(column, slice)) and all( - isinstance(col, str) for col in column - ): - feature_names.extend(column) - else: - feature_names.extend(self.feature_names_in_[column]) - else: - indices = np.arange(self._n_features) - feature_names.extend(["x%d" % i for i in indices[column]]) - continue - feature_names.extend(get_names(name, trans, column)) - return feature_names - def _get_feature_name_out_for_transformer( self, name, trans, column, feature_names_in ): From d3cc5b6aa6490b2e38f9fd14bfc2f2cd26ea2875 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 31 Aug 2021 18:10:20 -0400 Subject: [PATCH 090/100] DOC Better docstring --- doc/glossary.rst | 4 ++-- sklearn/compose/_column_transformer.py | 8 +++++++- sklearn/pipeline.py | 3 ++- sklearn/preprocessing/_encoders.py | 9 +++++++-- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/doc/glossary.rst b/doc/glossary.rst index 7714276c9f8eb..dd492def3adb9 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -1271,8 +1271,8 @@ Methods to the names of input columns from which output column names can be generated. If `input_features` is not passed in, then the `feature_names_in_` attribute will be used. If the - `feature_names_in_` attribute is not defined or `None`, then the - input names are named x0, x1, ..., xn_features_out. + `feature_names_in_` attribute is not defined, then the + input names are named `[x0, x1, ..., x(n_features_in_)]`. ``get_n_splits`` On a :term:`CV splitter` (not an estimator), returns the number of diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index daae579563494..37759da6f874d 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -459,7 +459,13 @@ def get_feature_names_out(self, input_features=None): Parameters ---------- input_features : array-like of str or None, default=None - Not used, present here for API consistency by convention. + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. 
Returns ------- diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 27bfe0fcf1db4..c4a03e8532260 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -687,7 +687,8 @@ def get_feature_names_out(self, input_features=None): - If `input_features` is `None`, then `feature_names_in_` is used as feature names in. If `feature_names_in_` is not defined, then names are generated: `[x0, x1, ..., x(n_features_in_)]`. - If `n_features_in_` is not defined, then `None` is used. + If `n_features_in_` is not defined, then `None` is used as + input_features. - If `input_features` is an array-like, then `input_features` must match `feature_names_in_` if `feature_names_in_` is defined. diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index ab1c010e68429..3a332a797797f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -718,8 +718,13 @@ def get_feature_names_out(self, input_features=None): Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, they are generated as - `[x0, x1, ..., xn_features]`. + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. Returns ------- From 76be3217ca137a215bfdae6d5334febe274372e9 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 31 Aug 2021 19:57:55 -0400 Subject: [PATCH 091/100] TST Fix --- .../inspection/plot_permutation_importance.py | 2 +- sklearn/compose/_column_transformer.py | 2 +- sklearn/pipeline.py | 4 +--- sklearn/tests/test_pipeline.py | 19 ++++++++++++++----- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 4317fa8d5bd87..766a01fbeb12d 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -120,7 +120,7 @@ # capacity). ohe = (rf.named_steps['preprocess'] .named_transformers_['cat']) -feature_names = ohe.get_feature_names_out(input_features=categorical_columns) +feature_names = ohe.get_feature_names_out(categorical_columns) feature_names = np.r_[feature_names, numerical_columns] tree_feature_importances = ( diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 37759da6f874d..adb7e251bd952 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -451,7 +451,7 @@ def _get_feature_name_out_for_transformer( isinstance(col, str) for col in column ): column = _safe_indexing(feature_names_in, column) - return trans.get_feature_names_out(input_features=column) + return trans.get_feature_names_out(column) def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. 
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index c4a03e8532260..3b1ba29a94489 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -705,9 +705,7 @@ def get_feature_names_out(self, input_features=None): "Did you mean to call Pipeline[:-1].get_feature_names_out" "()?".format(name) ) - feature_names = transform.get_feature_names_out( - input_features=feature_names - ) + feature_names = transform.get_feature_names_out(input_features) return feature_names @property diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 214a8259fe9f2..445bd9064b959 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1171,20 +1171,29 @@ def test_make_pipeline_memory(): shutil.rmtree(cachedir) +class FeatureNameSaver(BaseEstimator): + def fit(self, X, y=None): + self._check_feature_names(X, reset=True) + return self + + def transform(self, X, y=None): + return X + + def get_feature_names_out(self, input_features=None): + return input_features + + def test_features_names_passthrough(): """Check pipeline.get_feature_names_out with passthrough""" pipe = Pipeline( steps=[ - ("imputer", "passthrough"), - ("scaler", StandardScaler()), - ("select", "passthrough"), + ("names", FeatureNameSaver()), + ("pass", "passthrough"), ("clf", LogisticRegression()), ] ) iris = load_iris() pipe.fit(iris.data, iris.target) - xs = ["x0", "x1", "x2", "x3"] - assert_array_equal(pipe[:-1].get_feature_names_out(), xs) assert_array_equal( pipe[:-1].get_feature_names_out(iris.feature_names), iris.feature_names ) From caff15b2793b0ae61e2d249bc9c7ff8326265b17 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 1 Sep 2021 10:12:55 +0200 Subject: [PATCH 092/100] FIX test_docstring for deprecated get_feature_names --- sklearn/compose/_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index adb7e251bd952..88178b4f6aa93 
100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -388,7 +388,7 @@ def named_transformers_(self): @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead" + "in 1.2. You can use get_feature_names_out instead." ) def get_feature_names(self): """Get feature names from all transformers. From d930b1bf3950846a8c1e726c200a18779e72903c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 1 Sep 2021 13:01:10 -0400 Subject: [PATCH 093/100] ENH Error when n_features_in_ is not defined --- sklearn/pipeline.py | 18 ----------- sklearn/preprocessing/_polynomial.py | 6 ++-- sklearn/utils/tests/test_validation.py | 45 ++++++++++++++++++++++++++ sklearn/utils/validation.py | 21 ++++++------ 4 files changed, 58 insertions(+), 32 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 3b1ba29a94489..29f0c3548bbf6 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -27,7 +27,6 @@ from .utils._tags import _safe_tags from .utils.validation import check_memory from .utils.validation import check_is_fitted -from .utils.validation import _check_feature_names_in from .utils.fixes import delayed from .exceptions import NotFittedError @@ -684,20 +683,11 @@ def get_feature_names_out(self, input_features=None): input_features : array-like of str or None, default=None Input features. - - If `input_features` is `None`, then `feature_names_in_` is - used as feature names in. If `feature_names_in_` is not defined, - then names are generated: `[x0, x1, ..., x(n_features_in_)]`. - If `n_features_in_` is not defined, then `None` is used as - input_features. - - If `input_features` is an array-like, then `input_features` must - match `feature_names_in_` if `feature_names_in_` is defined. - Returns ------- feature_names_out : ndarray of str Transformed feature names. 
""" - feature_names = _check_feature_names_in(self, input_features) for _, name, transform in self._iter(): if not hasattr(transform, "get_feature_names_out"): raise AttributeError( @@ -1041,19 +1031,11 @@ def get_feature_names_out(self, input_features=None): input_features : array-like of str or None, default=None Input features. - - If `input_features` is `None`, then `feature_names_in_` is - used as feature names in. If `feature_names_in_` is not defined, - then names are generated: `[x0, x1, ..., x(n_features_in_)]`. - If `n_features_in_` is not defined, then `None` is used. - - If `input_features` is an array-like, then `input_features` must - match `feature_names_in_` if `feature_names_in_` is defined. - Returns ------- feature_names_out : ndarray of str Transformed feature names. """ - input_features = _check_feature_names_in(self, input_features) feature_names = [] for name, trans, _ in self._iter(): if not hasattr(trans, "get_feature_names_out"): diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index b0aef94cde1f1..6695b09d5612d 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -682,8 +682,7 @@ def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None) "in 1.2. You can use get_feature_names_out instead" ) def get_feature_names(self, input_features=None): - """ - Return feature names for output features + """Return feature names for output features. 
Parameters ---------- @@ -696,7 +695,8 @@ def get_feature_names(self, input_features=None): output_feature_names : list of str of shape (n_output_features,) """ n_splines = self.bsplines_[0].c.shape[0] - input_features = _check_feature_names_in(self, input_features) + if input_features is None: + input_features = ["x%d" % i for i in range(self.n_features_in_)] feature_names = [] for i in range(self.n_features_in_): for j in range(n_splines - 1 + self.include_bias): diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 45c24d696cd40..156790545d7b3 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -51,6 +51,7 @@ _num_features, FLOAT_DTYPES, _get_feature_names, + _check_feature_names_in, ) from sklearn.utils.validation import _check_fit_params from sklearn.base import BaseEstimator @@ -1562,3 +1563,47 @@ def test_get_feature_names_invalid_dtypes_warns(names, dtypes): with pytest.warns(FutureWarning, match=msg): names = _get_feature_names(X) assert names is None + + +class PassthroughTransformer(BaseEstimator): + def fit(self, X, y=None): + self._validate_data(X, reset=True) + return self + + def transform(self, X): + return X + + def get_feature_names_out(self, input_features=None): + return _check_feature_names_in(self, input_features) + + +def test_check_feature_names_in(): + """Check behavior of check_feature_names_in for arrays.""" + X = np.array([[0.0, 1.0, 2.0]]) + est = PassthroughTransformer().fit(X) + + names = est.get_feature_names_out() + assert_array_equal(names, ["x0", "x1", "x2"]) + + incorrect_len_names = ["x10", "x1"] + with pytest.raises(ValueError, match="input_features should have length equal to"): + est.get_feature_names_out(incorrect_len_names) + + # remove n_feature_in_ + del est.n_features_in_ + with pytest.raises(ValueError, match="Unable to generate feature names"): + est.get_feature_names_out() + + +def test_check_feature_names_in_pandas(): + 
"""Check behavior of check_feature_names_in for pandas dataframes.""" + pd = pytest.importorskip("pandas") + names = ["a", "b", "c"] + df = pd.DataFrame([[0.0, 1.0, 2.0]], columns=names) + est = PassthroughTransformer().fit(df) + + names = est.get_feature_names_out() + assert_array_equal(names, ["a", "b", "c"]) + + with pytest.raises(ValueError, match="input_features is not equal to"): + est.get_feature_names_out(["x1", "x2", "x3"]) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 412d3b6ad29f6..4aaae46553ad7 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1671,11 +1671,13 @@ def _get_feature_names(X): def _check_feature_names_in(estimator, input_features): - """Make feature_names_in_ from estimator. + """Make feature names in from `estimator.get_feature_names_out`. Used to validate input from `get_feature_names_out(input_features=None)`. """ feature_names_in_ = getattr(estimator, "feature_names_in_", None) + n_features_in_ = getattr(estimator, "n_features_in_", None) + if input_features is not None: input_features = np.asarray(input_features, dtype=object) if feature_names_in_ is not None and not np.array_equal( @@ -1683,21 +1685,18 @@ def _check_feature_names_in(estimator, input_features): ): raise ValueError("input_features is not equal to feature_names_in_") - if ( - hasattr(estimator, "n_features_in_") - and len(input_features) != estimator.n_features_in_ - ): + if n_features_in_ is not None and len(input_features) != n_features_in_: raise ValueError( "input_features should have length equal to number of " - f"features ({estimator.n_features_in_}), got {len(input_features)}" + f"features ({n_features_in_}), got {len(input_features)}" ) return input_features if feature_names_in_ is not None: return feature_names_in_ - # Only generates features if `n_features_in_` is defined - if hasattr(estimator, "n_features_in_"): - return np.asarray( - [f"x{i}" for i in range(estimator.n_features_in_)], dtype=object 
- ) + # Generates features if `n_features_in_` is defined + if n_features_in_ is None: + raise ValueError("Unable to generate feature names without n_features_in_") + + return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object) From b379cd30a4c54d27c1fd3a3f7914a45c4537dfb5 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 1 Sep 2021 13:05:24 -0400 Subject: [PATCH 094/100] DOC Update docstring --- sklearn/utils/validation.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4aaae46553ad7..c213d6c9b8a27 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1671,10 +1671,7 @@ def _get_feature_names(X): def _check_feature_names_in(estimator, input_features): - """Make feature names in from `estimator.get_feature_names_out`. - - Used to validate input from `get_feature_names_out(input_features=None)`. - """ + """Validate and make feature names in from `estimator.get_feature_names_out`.""" feature_names_in_ = getattr(estimator, "feature_names_in_", None) n_features_in_ = getattr(estimator, "n_features_in_", None) From 83d12ecf313e2e483f10ed405787154ded53c56d Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 1 Sep 2021 13:33:08 -0400 Subject: [PATCH 095/100] CLN Address comments --- sklearn/compose/_column_transformer.py | 12 +++++----- .../feature_extraction/_dict_vectorizer.py | 4 ++-- sklearn/feature_extraction/text.py | 4 ++-- sklearn/feature_selection/_base.py | 2 +- sklearn/pipeline.py | 6 ++--- sklearn/preprocessing/_discretization.py | 2 +- sklearn/preprocessing/_encoders.py | 4 ++-- sklearn/preprocessing/_polynomial.py | 8 +++---- sklearn/utils/validation.py | 24 ++++++++++++++++--- 9 files changed, 42 insertions(+), 24 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 88178b4f6aa93..f4e602cc44f3d 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -388,7 +388,7 @@ def named_transformers_(self): @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead." + "in 1.2. Please use get_feature_names_out instead." ) def get_feature_names(self): """Get feature names from all transformers. @@ -428,8 +428,8 @@ def _get_feature_name_out_for_transformer( ): """Gets feature names of transformer. - Used conduction with self._iter(fitted=True) in get_feature_names_out.""" - + Used in conjunction with self._iter(fitted=True) in get_feature_names_out. + """ if trans == "drop" or _is_empty_column_selection(column): return elif trans == "passthrough": @@ -469,7 +469,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. 
""" check_is_fitted(self) @@ -489,8 +489,8 @@ def get_feature_names_out(self, input_features=None): # No feature names return np.array([], dtype=object) - # always prefix the feature names out with the transformers name if self.prefix_feature_names_out: + # Prefix the feature names out with the transformers name names = list( chain.from_iterable( (f"{name}__{i}" for i in feature_names_out) @@ -499,7 +499,7 @@ def get_feature_names_out(self, input_features=None): ) return np.asarray(names, dtype=object) - # prefix_feature_names_out==False + # prefix_feature_names_out is False # Check that names are all unique without a prefix feature_names_count = Counter( chain.from_iterable(s for _, s in transformer_with_feature_names_out) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 7046230efa8ae..35aaf9742fc4e 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -376,7 +376,7 @@ def transform(self, X): @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead" + "in 1.2. Please use get_feature_names_out instead." ) def get_feature_names(self): """Return a list of feature names, ordered by their indices. @@ -401,7 +401,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. """ if any(not isinstance(name, str) for name in self.feature_names_): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 8d20488f72bca..36ea1efd71fb8 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1390,7 +1390,7 @@ def inverse_transform(self, X): @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead." + "in 1.2. 
Please use get_feature_names_out instead." ) def get_feature_names(self): """Array mapping from feature integer indices to feature name. @@ -1414,7 +1414,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. """ self._check_vocabulary() diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index c1eb23bcae852..6cdab9305f067 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -155,7 +155,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. """ input_features = _check_feature_names_in(self, input_features) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 29f0c3548bbf6..f3dfc87923ade 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -685,7 +685,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. """ for _, name, transform in self._iter(): @@ -1003,7 +1003,7 @@ def _iter(self): @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead" + "in 1.2. Please use get_feature_names_out instead." ) def get_feature_names(self): """Get feature names from all transformers. @@ -1033,7 +1033,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. 
""" feature_names = [] diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 17fc854c1a8dc..24e9df1050d6f 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -394,7 +394,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. """ input_features = _check_feature_names_in(self, input_features) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 3a332a797797f..45af02d93de98 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -672,7 +672,7 @@ def inverse_transform(self, X): @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead." + "in 1.2. Please use get_feature_names_out instead." ) def get_feature_names(self, input_features=None): """Return feature names for output features. @@ -728,7 +728,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. """ check_is_fitted(self) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 6695b09d5612d..cf47de062b94d 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -196,7 +196,7 @@ def powers_(self): @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead" + "in 1.2. Please use get_feature_names_out instead." ) def get_feature_names(self, input_features=None): """ @@ -246,7 +246,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. 
""" powers = self.powers_ @@ -679,7 +679,7 @@ def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None) @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " - "in 1.2. You can use get_feature_names_out instead" + "in 1.2. Please use get_feature_names_out instead." ) def get_feature_names(self, input_features=None): """Return feature names for output features. @@ -719,7 +719,7 @@ def get_feature_names_out(self, input_features=None): Returns ------- - feature_names_out : ndarray of str + feature_names_out : ndarray of str objects Transformed feature names. """ n_splines = self.bsplines_[0].c.shape[0] diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index c213d6c9b8a27..f2b77d012351a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1670,8 +1670,26 @@ def _get_feature_names(X): return feature_names -def _check_feature_names_in(estimator, input_features): - """Validate and make feature names in from `estimator.get_feature_names_out`.""" +def _check_feature_names_in(estimator, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_in : ndarray of str + Feature names in. 
+ """ + feature_names_in_ = getattr(estimator, "feature_names_in_", None) n_features_in_ = getattr(estimator, "n_features_in_", None) @@ -1692,7 +1710,7 @@ def _check_feature_names_in(estimator, input_features): if feature_names_in_ is not None: return feature_names_in_ - # Generates features if `n_features_in_` is defined + # Generates feature names if `n_features_in_` is defined if n_features_in_ is None: raise ValueError("Unable to generate feature names without n_features_in_") From 8d0b3cfbcaf678102c5dccdb73862bb476080c81 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 6 Sep 2021 19:02:34 +0200 Subject: [PATCH 096/100] Update sklearn/pipeline.py Co-authored-by: Olivier Grisel --- sklearn/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index f3dfc87923ade..c03e808ae3e23 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -692,7 +692,7 @@ def get_feature_names_out(self, input_features=None): if not hasattr(transform, "get_feature_names_out"): raise AttributeError( "Estimator {} does not provide get_feature_names_out. " - "Did you mean to call Pipeline[:-1].get_feature_names_out" + "Did you mean to call pipeline[:-1].get_feature_names_out" "()?".format(name) ) feature_names = transform.get_feature_names_out(input_features) From c35f7aaf00acd1719cc2e062eb0f4c253222010a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 6 Sep 2021 19:06:20 +0200 Subject: [PATCH 097/100] Update doc/glossary.rst --- doc/glossary.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/glossary.rst b/doc/glossary.rst index 144f626ba6888..1843e6f76f0a6 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -1266,7 +1266,7 @@ Methods ``get_feature_names_out`` Primarily for :term:`feature extractors`, but also used for other transformers to provide string names for each column in the output of - the estimator's :term:`transform` method. 
It outputs a list of + the estimator's :term:`transform` method. It outputs an array of strings and may take a list of strings as input, corresponding to the names of input columns from which output column names can be generated. If `input_features` is not passed in, then the From ec8b82523d06f0d51181cbc998a115162862c5d1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 6 Sep 2021 19:06:32 +0200 Subject: [PATCH 098/100] Update doc/glossary.rst --- doc/glossary.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/glossary.rst b/doc/glossary.rst index 1843e6f76f0a6..010f16a361531 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -1267,7 +1267,7 @@ Methods Primarily for :term:`feature extractors`, but also used for other transformers to provide string names for each column in the output of the estimator's :term:`transform` method. It outputs an array of - strings and may take a list of strings as input, corresponding + strings and may take an array-like of strings as input, corresponding to the names of input columns from which output column names can be generated. If `input_features` is not passed in, then the `feature_names_in_` attribute will be used. If the From 560c0d084c586d87efeddc5b593ca2a7ffabc0d1 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 6 Sep 2021 17:01:36 -0400 Subject: [PATCH 099/100] ENH Adds one-to-one transformers --- sklearn/base.py | 30 +++++++++++++++++++ sklearn/feature_extraction/tests/test_text.py | 10 +++++++ sklearn/feature_extraction/text.py | 4 +-- sklearn/preprocessing/_data.py | 14 ++++----- sklearn/preprocessing/tests/test_data.py | 18 +++++++++++ 5 files changed, 67 insertions(+), 9 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index a585b2b06c394..60fc82eff6088 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -23,6 +23,7 @@ from .utils.validation import check_array from .utils.validation import _check_y from .utils.validation import _num_features +from .utils.validation import _check_feature_names_in from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _get_feature_names @@ -846,6 +847,35 @@ def fit_transform(self, X, y=None, **fit_params): return self.fit(X, y, **fit_params).transform(X) +class _OneToOneFeatureMixin: + """Provides `get_feature_names_out` for simple transformers. + + Assumes there's a 1-to-1 correspondence between input features + and output features. + """ + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Same as input features. 
+ """ + return _check_feature_names_in(self, input_features) + + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index ed9bff59db952..cc4ff2ec49492 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -449,6 +449,16 @@ def test_countvectorizer_uppercase_in_vocab(): vectorizer.fit_transform(vocabulary) +def test_tf_transformer_feature_names_out(): + """Check get_feature_names_out for TfidfTransformer""" + X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]] + tr = TfidfTransformer(smooth_idf=True, norm="l2").fit(X) + + feature_names_in = ["a", "c", "b"] + feature_names_out = tr.get_feature_names_out(feature_names_in) + assert_array_equal(feature_names_in, feature_names_out) + + def test_tf_idf_smoothing(): X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=True, norm="l2") diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 36ea1efd71fb8..da0e35609252f 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -25,7 +25,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _OneToOneFeatureMixin from ..preprocessing import normalize from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS @@ -1432,7 +1432,7 @@ def _make_int_array(): return array.array(str("i")) -class TfidfTransformer(TransformerMixin, BaseEstimator): +class TfidfTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Transform a count matrix to a normalized tf or tf-idf representation. 
Tf means term-frequency while tf-idf means term-frequency times inverse diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index a7459d5e887c1..4c1bcaa0c921d 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -16,7 +16,7 @@ from scipy import optimize from scipy.special import boxcox -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _OneToOneFeatureMixin from ..utils import check_array from ..utils.deprecation import deprecated from ..utils.extmath import _incremental_mean_and_var, row_norms @@ -262,7 +262,7 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): return X -class MinMaxScaler(TransformerMixin, BaseEstimator): +class MinMaxScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Transform features by scaling each feature to a given range. This estimator scales and translates each feature individually such @@ -629,7 +629,7 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): return X -class StandardScaler(TransformerMixin, BaseEstimator): +class StandardScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Standardize features by removing the mean and scaling to unit variance. The standard score of a sample `x` is calculated as: @@ -1041,7 +1041,7 @@ def _more_tags(self): return {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]} -class MaxAbsScaler(TransformerMixin, BaseEstimator): +class MaxAbsScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such @@ -1341,7 +1341,7 @@ def maxabs_scale(X, *, axis=0, copy=True): return X -class RobustScaler(TransformerMixin, BaseEstimator): +class RobustScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Scale features using statistics that are robust to outliers. 
This Scaler removes the median and scales the data according to @@ -2337,7 +2337,7 @@ def add_dummy_feature(X, value=1.0): return np.hstack((np.full((n_samples, 1), value), X)) -class QuantileTransformer(TransformerMixin, BaseEstimator): +class QuantileTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Transform features using quantiles information. This method transforms the features to follow a uniform or a normal @@ -2921,7 +2921,7 @@ def quantile_transform( ) -class PowerTransformer(TransformerMixin, BaseEstimator): +class PowerTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): """Apply a power transform featurewise to make data more Gaussian-like. Power transforms are a family of parametric, monotonic transformations diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index bc24b2905ca5c..9cbfbb4241652 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2642,3 +2642,21 @@ def test_standard_scaler_raise_error_for_1d_input(): err_msg = "Expected 2D array, got 1D array instead" with pytest.raises(ValueError, match=err_msg): scaler.inverse_transform(X_2d[:, 0]) + + +@pytest.mark.parametrize( + "Transformer", + [ + MinMaxScaler, + MaxAbsScaler, + RobustScaler, + StandardScaler, + QuantileTransformer, + PowerTransformer, + ], +) +def test_one_to_one_features(Transformer): + """Check one-to-one transformers give correct feature names.""" + tr = Transformer().fit(iris.data) + names_out = tr.get_feature_names_out(iris.feature_names) + assert_array_equal(names_out, iris.feature_names) From 043540b86505de511b749ac9fa0eb2082f6d2106 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 7 Sep 2021 10:26:55 +0200 Subject: [PATCH 100/100] Add one more test for one-to-one feature transformers with pandas --- sklearn/preprocessing/tests/test_data.py | 31 ++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git 
a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 9cbfbb4241652..4014465ab7eab 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -7,6 +7,7 @@ import warnings import itertools +import re import numpy as np import numpy.linalg as la from scipy import sparse, stats @@ -2660,3 +2661,33 @@ def test_one_to_one_features(Transformer): tr = Transformer().fit(iris.data) names_out = tr.get_feature_names_out(iris.feature_names) assert_array_equal(names_out, iris.feature_names) + + +@pytest.mark.parametrize( + "Transformer", + [ + MinMaxScaler, + MaxAbsScaler, + RobustScaler, + StandardScaler, + QuantileTransformer, + PowerTransformer, + ], +) +def test_one_to_one_features_pandas(Transformer): + """Check one-to-one transformers give correct feature names.""" + pd = pytest.importorskip("pandas") + + df = pd.DataFrame(iris.data, columns=iris.feature_names) + tr = Transformer().fit(df) + + names_out_df_default = tr.get_feature_names_out() + assert_array_equal(names_out_df_default, iris.feature_names) + + names_out_df_valid_in = tr.get_feature_names_out(iris.feature_names) + assert_array_equal(names_out_df_valid_in, iris.feature_names) + + msg = re.escape("input_features is not equal to feature_names_in_") + with pytest.raises(ValueError, match=msg): + invalid_names = list("abcd") + tr.get_feature_names_out(invalid_names)