diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 6388c9b7d4323..2e49615c3a58f 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -139,6 +139,32 @@ or by name:: >>> pipe['reduce_dim'] PCA() +To enable model inspection, `Pipeline` sets an ``input_features_`` attribute on +all pipeline steps during fitting. This allows the user to understand how +features are transformed as they pass through a pipeline:: + + >>> from sklearn.datasets import load_iris + >>> from sklearn.feature_selection import SelectKBest + >>> iris = load_iris() + >>> pipe = Pipeline(steps=[ + ... ('select', SelectKBest(k=2)), + ... ('clf', LogisticRegression())]) + >>> pipe.fit(iris.data, iris.target) + ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + Pipeline(memory=None, + steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) + >>> pipe.named_steps.clf.input_features_ + array(['x2', 'x3'], dtype='<U2') + >>> pipe.get_feature_names(iris.feature_names) + >>> pipe.named_steps.select.input_features_ + ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] + >>> pipe.named_steps.clf.input_features_ + array(['petal length (cm)', 'petal width (cm)'], dtype='<U17') >>> from sklearn.feature_extraction.text import CountVectorizer >>> from sklearn.preprocessing import OneHotEncoder >>> column_trans = ColumnTransformer( - ... [('city_category', OneHotEncoder(dtype='int'),['city']), + ... [('categories', OneHotEncoder(dtype='int'),['city']), ... ('title_bow', CountVectorizer(), 'title')], ... remainder='drop') @@ -438,11 +464,11 @@ By default, the remaining rating columns are ignored (``remainder='drop'``):: ('title_bow', CountVectorizer(), 'title')]) >>> column_trans.get_feature_names() - ['city_category__x0_London', 'city_category__x0_Paris', 'city_category__x0_Sallisaw', - 'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his', - 'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable', - 'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson', - 'title_bow__wrath'] + ['categories__city_London', 'categories__city_Paris', + 'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast', + 'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last', + 'title_bow__learned', 'title_bow__moveable', 'title_bow__of', 'title_bow__the', + 'title_bow__trick', 'title_bow__watson', 'title_bow__wrath'] >>> column_trans.transform(X).toarray() array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0], diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index bd4ed48cabfd9..20c1416a34ffe 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -145,6 +145,50 @@ clf.fit(X_train, y_train) print("model score: %.3f" % clf.score(X_test, y_test)) + +############################################################################### +# Inspecting the coefficient values of the classifier +############################################################################### +# The coefficients of the final classification step of the pipeline give an +# idea of how each feature impacts the likelihood of survival, assuming that +# the usual linear model assumptions hold (uncorrelated features, linear +# separability, homoscedastic errors...), which we do not verify in this +# example.
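# As an illustrative aside, one quick way to eyeball the "uncorrelated
# features" assumption would be to look at the correlation matrix of the
# preprocessed training data, for instance:
#
#   Xt = clf[:-1].transform(X_train)                     # preprocessing steps only
#   Xt = Xt.toarray() if hasattr(Xt, "toarray") else Xt  # densify if sparse
#   print(np.corrcoef(Xt, rowvar=False).round(2))
#
# (Sketch only: it assumes ``clf`` is the fitted pipeline defined above and
# that ``np`` is the usual numpy alias; slicing with ``clf[:-1]`` keeps all
# but the final classification step.)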
+# +# To get error bars we perform cross-validation and compute the mean and +# standard deviation for each coefficient across CV splits. Because we use a +# standard scaler on the numerical features, the coefficient weights give us +# an idea of how much the log odds of surviving are impacted by a change in +# this dimension, contrasted with the mean. Note that the categorical features +# here are overspecified, which makes them slightly harder to interpret because +# of the information redundancy. +# +# We can see that the linear model coefficients are in agreement with the +# historical reports: people in higher classes and therefore in the upper decks +# were the first to reach the lifeboats, and often, priority was given to women +# and children. +# +# Note that conditioned on the "pclass_x" one-hot features, the "fare" +# numerical feature does not seem to be significantly predictive. If we drop +# the "pclass" feature, then higher "fare" values would appear significantly +# correlated with a higher likelihood of survival, as the "fare" and "pclass" +# features have a strong statistical dependency. + +import matplotlib.pyplot as plt +from sklearn.model_selection import cross_validate +from sklearn.model_selection import StratifiedShuffleSplit + +cv = StratifiedShuffleSplit(n_splits=20, test_size=0.25, random_state=42) +cv_results = cross_validate(clf, X_train, y_train, cv=cv, + return_estimator=True) +cv_coefs = np.concatenate([cv_pipeline.named_steps["classifier"].coef_ + for cv_pipeline in cv_results["estimator"]]) +fig, ax = plt.subplots() +ax.barh(clf.named_steps["classifier"].input_features_, + cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0)) +plt.tight_layout() +plt.show() + ############################################################################### # The resulting score is not exactly the same as the one from the previous # pipeline because the dtype-based selector treats the ``pclass`` columns as diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index b908def5c6709..5d1826fc5a511 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -9,6 +9,7 @@ Using a sub-pipeline, the fitted coefficients can be mapped back into the original feature space. """ +import matplotlib.pyplot as plt from sklearn import svm from sklearn.datasets import make_classification from sklearn.feature_selection import SelectKBest, f_regression @@ -36,5 +37,7 @@ y_pred = anova_svm.predict(X_test) print(classification_report(y_test, y_pred)) -coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_) -print(coef) +# access and plot the coefficients of the fitted model +plt.barh((0, 1, 2), anova_svm[-1].coef_.ravel()) +plt.yticks((0, 1, 2), anova_svm[-1].input_features_) +plt.show() diff --git a/sklearn/base.py b/sklearn/base.py index a4cc26acc0c9a..07b2f8cdf5b20 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -6,6 +6,7 @@ import copy import warnings from collections import defaultdict + import platform import inspect import re @@ -13,6 +14,7 @@ import numpy as np from .
import __version__ +from .exceptions import NotFittedError from ._config import get_config from .utils import _IS_32BIT from .utils.validation import check_X_y @@ -20,6 +22,7 @@ from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args + _DEFAULT_TAGS = { 'non_deterministic': False, 'requires_positive_X': False, @@ -688,6 +691,49 @@ def fit_transform(self, X, y=None, **fit_params): # fit method of arity 2 (supervised transformation) return self.fit(X, y, **fit_params).transform(X) + def get_feature_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. + """ + # OneToOneMixin is higher in the class hierarchy + # because we put mixins on the wrong side + if hasattr(super(), 'get_feature_names'): + return super().get_feature_names(input_features) + # generate feature names from class name by default + # there would be much less guessing if we stored the number + # of output features. + # Ideally this would be done in each class. + if hasattr(self, 'n_clusters'): + # this is before n_components_ + # because n_components_ means something else + # in agglomerative clustering + n_features = self.n_clusters + elif hasattr(self, '_max_components'): + # special case for LinearDiscriminantAnalysis + n_components = self.n_components or np.inf + n_features = min(self._max_components, n_components) + elif hasattr(self, 'n_components_'): + # n_components could be auto or None + # this is more likely to be an int + n_features = self.n_components_ + elif hasattr(self, 'n_components') and self.n_components is not None: + n_features = self.n_components + elif hasattr(self, 'components_'): + n_features = self.components_.shape[0] + else: + return None + return ["{}{}".format(type(self).__name__.lower(), i) + for i in range(n_features)] + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" @@ -736,10 +782,81 @@ def fit_predict(self, X, y=None): return self.fit(X).predict(X) +class OneToOneMixin(object): + """Provides get_feature_names for simple transformers. + + Assumes there's a 1-to-1 correspondence between input features + and output features. + """ + + def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Returns input_features as this transformation + doesn't add or drop features. + + Parameters + ---------- + input_features : array-like of string + Input feature names.
+ + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ + if input_features is not None: + return input_features + else: + raise ValueError("Don't know how to get" + " input feature names for {}".format(self)) + + +def _get_sub_estimators(est): + # Explicitly declare all fitted sub-estimators of existing meta-estimators + sub_ests = [] + # OHE is not really needed + sub_names = ['estimator_', 'base_estimator_', 'one_hot_encoder_', + 'best_estimator_', 'init_'] + for name in sub_names: + sub_est = getattr(est, name, None) + if sub_est is not None: + sub_ests.append(sub_est) + if hasattr(est, "estimators_"): + if hasattr(est.estimators_, 'shape'): + sub_ests.extend(est.estimators_.ravel()) + else: + sub_ests.extend(est.estimators_) + return sub_ests + + class MetaEstimatorMixin: _required_parameters = ["estimator"] """Mixin class for all meta estimators in scikit-learn.""" + def get_feature_names(self, input_features=None): + """Ensure feature names are set on sub-estimators + + Parameters + ---------- + input_features : list of string or None + Input features to the meta-estimator. + """ + sub_ests = _get_sub_estimators(self) + for est in sub_ests: + est.input_features_ = input_features + if hasattr(est, "get_feature_names"): + # using hasattr instead of a try-except on everything + # b/c catching AttributeError makes recursive code + # impossible to debug + try: + est.get_feature_names(input_features=input_features) + except TypeError: + # do we need this? + est.get_feature_names() + except NotFittedError: + pass + class MultiOutputMixin: """Mixin to mark estimators that support multioutput.""" diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index f148633021a97..f4e3d2dfd3820 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -371,8 +371,12 @@ def get_feature_names(self): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names."
% (str(name), type(trans).__name__)) + try: + more_names = trans.get_feature_names(input_features=column) + except TypeError: + more_names = trans.get_feature_names() feature_names.extend([name + "__" + f for f in - trans.get_feature_names()]) + more_names]) return feature_names def _update_fitted_transformers(self, transformers): diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index a9f1764eb97e4..93cc2debcf52d 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -23,6 +23,7 @@ from sklearn.preprocessing import FunctionTransformer from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder from sklearn.feature_extraction import DictVectorizer +from sklearn.pipeline import make_pipeline class Trans(BaseEstimator): @@ -659,6 +660,18 @@ def test_column_transformer_get_feature_names(): assert_raise_message(AttributeError, "Transformer trans (type Trans) does not provide " "get_feature_names", ct.get_feature_names) + + # if some transformers support and some don't + ct = ColumnTransformer([('trans', Trans(), [0, 1]), + ('scale', StandardScaler(), [0])]) + ct.fit(X_array) + assert_raise_message(AttributeError, + "Transformer trans (type Trans) does not provide " + "get_feature_names", ct.get_feature_names) + + # inside a pipeline + make_pipeline(ct).fit(X_array) + # working example X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 98d606961c1e1..f63ff1e0c81f4 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2357,3 +2357,12 @@ def transform(self, X): """ check_is_fitted(self) return self.one_hot_encoder_.transform(self.apply(X)) + + def get_feature_names(self, input_features=None): + """Feature names - not implemented yet. + + Parameters + ---------- + input_features : list of strings or None + """ + return None \ No newline at end of file diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 8644cf0ca2aef..babb0aa84ddbf 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -125,6 +125,21 @@ def inverse_transform(self, X): Xt[:, support] = X return Xt + def get_feature_names(self, input_features=None): + """Mask feature names according to selected features. + + Parameters + ---------- + input_features : list of string or None + Input features to select from. If none, they are generated as + x0, x1, ..., xn. 
+ """ + mask = self.get_support() + if input_features is None: + input_features = ['x%d' % i + for i in range(mask.shape[0])] + return np.array(input_features)[mask] + def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1): diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 517de982d8478..b8ad193622e4c 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -312,6 +312,15 @@ def fit(self, X, y=None): self.strategy, self.missing_values, fill_value) + + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values) + self.indicator_.fit(X) + else: + self.indicator_ = None + invalid_mask = _get_mask(self.statistics_, np.nan) + self._valid_mask = np.logical_not(invalid_mask) return self def _sparse_fit(self, X, strategy, missing_values, fill_value): @@ -462,6 +471,28 @@ def transform(self, X): return super()._concatenate_indicator(X, X_indicator) + def _more_tags(self): + return {'allow_nan': True} + + def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Parameters + ---------- + input_features : array-like of string + Input feature names. + + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ + check_is_fitted(self, 'statistics_') + if input_features is None: + input_features = ['x%d' % i + for i in range(self.statistics_.shape[0])] + return np.array(input_features)[self._valid_mask] + class MissingIndicator(TransformerMixin, BaseEstimator): """Binary indicators for missing values. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index ca9546696e94d..8c6ce36ac5d07 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -618,6 +618,36 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) + def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Transform input features using the pipeline. + If the last step is a transformer, it's included + in the transformation, otherwise it's not. + + Parameters + ---------- + input_features : array-like of string + Input feature names. 
+ + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ + feature_names = input_features + with_final = hasattr(self._final_estimator, "transform") + for i, name, transform in self._iter(with_final=with_final): + if not hasattr(transform, "get_feature_names"): + raise TypeError("Transformer {} does not provide" + " get_feature_names".format(name)) + try: + feature_names = transform.get_feature_names( + input_features=feature_names) + except TypeError: + feature_names = transform.get_feature_names() + return feature_names + @property def n_features_in_(self): # delegate to first step (which will call _check_is_fitted) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index cc8776951f114..d1b6b95b2e4ea 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -19,7 +19,7 @@ from scipy import optimize from scipy.special import boxcox -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, OneToOneMixin from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var @@ -197,7 +197,7 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): return X -class MinMaxScaler(TransformerMixin, BaseEstimator): +class MinMaxScaler(TransformerMixin, OneToOneMixin, BaseEstimator): """Transform features by scaling each feature to a given range. This estimator scales and translates each feature individually such @@ -514,13 +514,13 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): return X -class StandardScaler(TransformerMixin, BaseEstimator): +class StandardScaler(TransformerMixin, OneToOneMixin, BaseEstimator): """Standardize features by removing the mean and scaling to unit variance The standard score of a sample `x` is calculated as: z = (x - u) / s - + where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if `with_std=False`. @@ -851,7 +851,7 @@ def _more_tags(self): return {'allow_nan': True} -class MaxAbsScaler(TransformerMixin, BaseEstimator): +class MaxAbsScaler(TransformerMixin, OneToOneMixin, BaseEstimator): """Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such @@ -1089,7 +1089,7 @@ def maxabs_scale(X, *, axis=0, copy=True): return X -class RobustScaler(TransformerMixin, BaseEstimator): +class RobustScaler(TransformerMixin, OneToOneMixin, BaseEstimator): """Scale features using statistics that are robust to outliers. This Scaler removes the median and scales the data according to @@ -1746,7 +1746,7 @@ def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): return X -class Normalizer(TransformerMixin, BaseEstimator): +class Normalizer(TransformerMixin, OneToOneMixin, BaseEstimator): """Normalize samples individually to unit norm. Each sample (i.e.
each row of the data matrix) with at least one @@ -1888,7 +1888,7 @@ def binarize(X, *, threshold=0.0, copy=True): return X -class Binarizer(TransformerMixin, BaseEstimator): +class Binarizer(TransformerMixin, OneToOneMixin, BaseEstimator): """Binarize data (set feature values to 0 or 1) according to a threshold Values greater than the threshold map to 1, while values less than @@ -2141,7 +2141,7 @@ def add_dummy_feature(X, value=1.0): return np.hstack((np.full((n_samples, 1), value), X)) -class QuantileTransformer(TransformerMixin, BaseEstimator): +class QuantileTransformer(TransformerMixin, OneToOneMixin, BaseEstimator): """Transform features using quantiles information. This method transforms the features to follow a uniform or a normal @@ -2695,7 +2695,7 @@ def quantile_transform(X, *, axis=0, n_quantiles=1000, " axis={}".format(axis)) -class PowerTransformer(TransformerMixin, BaseEstimator): +class PowerTransformer(TransformerMixin, OneToOneMixin, BaseEstimator): """Apply a power transform featurewise to make data more Gaussian-like. Power transforms are a family of parametric, monotonic transformations diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index db5c88051346a..978aa9eb640fb 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -493,11 +493,109 @@ def test_tag_inheritance(): assert not redefine_tags_est._get_tags()['allow_nan'] diamond_tag_est = DiamondOverwriteTag() + with pytest.raises(TypeError, match="Inconsistent values for tag"): + diamond_tag_est._get_tags() assert diamond_tag_est._get_tags()['allow_nan'] inherit_diamond_tag_est = InheritDiamondOverwriteTag() assert inherit_diamond_tag_est._get_tags()['allow_nan'] +@ignore_warnings(category=(FutureWarning, DeprecationWarning)) +def test_sub_estimator_consistency(): + # check that _get_sub_estimators finds all fitted sub estimators + # if this breaks, you probably introduced a sub-estimator that's + # non-standard (not estimator_, base_estimator_ or estimators_) + from sklearn.utils.testing import all_estimators + from sklearn.base import (MetaEstimatorMixin, _get_sub_estimators, + ClassifierMixin, RegressorMixin) + + from sklearn.model_selection._search import BaseSearchCV + from sklearn.feature_selection.base import SelectorMixin + from sklearn.datasets import make_blobs + from sklearn.linear_model import Ridge, LogisticRegression + from sklearn.utils.estimator_checks import \ + multioutput_estimator_convert_y_2d + from collections.abc import Iterable + + def has_fitted_attr(est): + attrs = [(x, getattr(est, x, None)) + for x in dir(est) if x.endswith("_") + and not x.startswith("__")] + return len(attrs) + + def get_sub_estimators_brute(est): + # recurse through all attributes to get sub-estimators + attrs = [(x, getattr(est, x, None)) + for x in dir(est) if not x.startswith("_")] + + def _recurse_sub_ests(candidates): + sub_ests = [] + for a in candidates: + if hasattr(a, "set_params") and hasattr(a, "fit"): + sub_ests.append(a) + elif isinstance(a, Iterable) and not isinstance(a, str): + sub_ests.extend(_recurse_sub_ests(a)) + return sub_ests + ests = _recurse_sub_ests(attrs) + # we don't consider label processors child estimators + return set([e for e in ests if has_fitted_attr(e) + and e.__module__ != "sklearn.preprocessing.label"]) + + al = all_estimators() + mets = [x for x in al if issubclass(x[1], MetaEstimatorMixin)] + + X, y = make_blobs() + others = [] + + for name, Est in mets: + # instantiate and fit + try: + est = Est() + except TypeError: + if issubclass(Est, 
(ClassifierMixin, SelectorMixin)): + est = Est(LogisticRegression(solver='lbfgs', + multi_class='auto')) + elif issubclass(Est, RegressorMixin): + est = Est(Ridge()) + else: + others.append((name, Est)) + if est._get_tags()['_skip_test']: + continue + + y = multioutput_estimator_convert_y_2d(est, y) + est.fit(X, y) + # test recursive sub estimators are the same as result of + # _get_sub_estimators which uses a hard-coded list + assert (set(_get_sub_estimators(est)) == + get_sub_estimators_brute(est)) + + for name, Est in others: + # only things we couldn't instantiate are the search CV + assert issubclass(Est, BaseSearchCV) + + +# XXX: Remove in 0.23 +def test_regressormixin_score_multioutput(): + from sklearn.linear_model import LinearRegression + # no warnings when y_type is continuous + X = [[1], [2], [3]] + y = [1, 2, 3] + reg = LinearRegression().fit(X, y) + assert_no_warnings(reg.score, X, y) + # warn when y_type is continuous-multioutput + y = [[1, 2], [2, 3], [3, 4]] + reg = LinearRegression().fit(X, y) + msg = ("The default value of multioutput (not exposed in " + "score method) will change from 'variance_weighted' " + "to 'uniform_average' in 0.23 to keep consistent " + "with 'metrics.r2_score'. To specify the default " + "value manually and avoid the warning, please " + "either call 'metrics.r2_score' directly or make a " + "custom scorer with 'metrics.make_scorer' (the " + "built-in scorer 'r2' uses " + "multioutput='uniform_average').") + assert_warns_message(FutureWarning, msg, reg.score, X, y) + def test_warns_on_get_params_non_attribute(): class MyEstimator(BaseEstimator): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 9f61b12e24ca1..411eb1aaab665 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -21,18 +21,21 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_no_warnings +from sklearn.exceptions import NotFittedError from sklearn.base import clone, BaseEstimator, TransformerMixin from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union from sklearn.svm import SVC from sklearn.neighbors import LocalOutlierFactor from sklearn.linear_model import LogisticRegression, Lasso from sklearn.linear_model import LinearRegression +from sklearn.multiclass import OneVsRestClassifier from sklearn.cluster import KMeans from sklearn.feature_selection import SelectKBest, f_classif from sklearn.dummy import DummyRegressor from sklearn.decomposition import PCA, TruncatedSVD from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler +from sklearn.impute import SimpleImputer from sklearn.feature_extraction.text import CountVectorizer from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier @@ -1101,6 +1104,141 @@ def test_make_pipeline_memory(): shutil.rmtree(cachedir) +def test_set_input_features(): + pipe = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler()), + ('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]) + assert_raises(NotFittedError, pipe.get_feature_names) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + mask = pipe.named_steps.select.get_support() + assert_array_equal(pipe.named_steps.clf.input_features_, xs[mask]) + res = pipe.get_feature_names(iris.feature_names) + # 
LogisticRegression doesn't have get_feature_names + assert res is None + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + # check that empty get_feature_names() doesn't overwrite + res = pipe.get_feature_names() + assert res is None + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + pipe = Pipeline(steps=[ + ('scaler', StandardScaler()), + ('pca', PCA(n_components=3)), + ('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]) + pipe.fit(iris.data, iris.target) + assert_array_equal(pipe.named_steps.clf.input_features_, ['pca0', 'pca1']) + # setting names doesn't change names after PCA + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.named_steps.select.input_features_, + ['pca0', 'pca1', 'pca2']) + + +def test_input_feature_names_pandas(): + pd = pytest.importorskip("pandas") + pipe = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler()), + ('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]) + iris = load_iris() + df = pd.DataFrame(iris.data, columns=iris.feature_names) + pipe.fit(df, iris.target) + mask = pipe.named_steps.select.get_support() + assert_array_equal(pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + + +def test_input_features_passthrough(): + pipe = Pipeline(steps=[ + ('imputer', 'passthrough'), + ('scaler', StandardScaler()), + ('select', 'passthrough'), + ('clf', LogisticRegression())]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = ['x0', 'x1', 'x2', 'x3'] + assert_array_equal(pipe.named_steps.clf.input_features_, xs) + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.named_steps.clf.input_features_, + iris.feature_names) + + +def test_input_features_count_vectorizer(): + pipe = Pipeline(steps=[ + ('vect', CountVectorizer()), + ('clf', LogisticRegression())]) + y = ["pizza" in x for x in JUNK_FOOD_DOCS] + pipe.fit(JUNK_FOOD_DOCS, y) + assert_array_equal(pipe.named_steps.clf.input_features_, + ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) + pipe.get_feature_names(["nonsense_is_ignored"]) + assert_array_equal(pipe.named_steps.clf.input_features_, + ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) + + +def test_input_features_nested(): + pipe = Pipeline(steps=[ + ('inner_pipe', Pipeline(steps=[('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]))]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + mask = pipe.named_steps.inner_pipe.named_steps.select.get_support() + assert_array_equal( + pipe.named_steps.inner_pipe.named_steps.clf.input_features_, xs[mask]) + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal( + pipe.named_steps.inner_pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + + +def test_input_features_meta_pipe(): + ovr = OneVsRestClassifier(Pipeline(steps=[('select', SelectKBest(k=2)), + ('clf', LogisticRegression())])) + pipe = Pipeline(steps=[('ovr', ovr)]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + # check 0ths estimator in OVR only + inner_pipe = 
pipe.named_steps.ovr.estimators_[0] + mask = inner_pipe.named_steps.select.get_support() + assert_array_equal(inner_pipe.named_steps.clf.input_features_, xs[mask]) + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(inner_pipe.input_features_, iris.feature_names) + assert_array_equal(inner_pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + + +def test_input_features_meta(): + ovr = OneVsRestClassifier(LogisticRegression()) + pipe = Pipeline(steps=[('select', SelectKBest(k=2)), ('ovr', ovr)]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + # check 0ths estimator in OVR only + one_logreg = pipe.named_steps.ovr.estimators_[0] + mask = pipe.named_steps.select.get_support() + assert_array_equal(one_logreg.input_features_, xs[mask]) + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(one_logreg.input_features_, + np.array(iris.feature_names)[mask]) + + def test_pipeline_param_error(): clf = make_pipeline(LogisticRegression()) with pytest.raises(ValueError, match="Pipeline.fit does not accept " diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index bbde6264a1c77..c93e05e43e0be 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1257,6 +1257,15 @@ def _check_transformer(name, transformer_orig, X, y): transformer_clone = clone(transformer) X_pred = transformer_clone.fit_transform(X, y=y_) + input_features = ['feature%d' % i for i in range(n_features)] + if hasattr(transformer_clone, 'get_feature_names'): + feature_names = transformer_clone.get_feature_names(input_features) + if feature_names is not None: + if isinstance(X_pred, tuple): + assert len(feature_names) == X_pred[0].shape[1] + else: + assert len(feature_names) == X_pred.shape[1] + if isinstance(X_pred, tuple): for x_pred in X_pred: assert x_pred.shape[0] == n_samples
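To close, here is a minimal end-to-end sketch of the feature-name propagation that this patch documents and tests (see the narrative docs and ``test_set_input_features`` above). It is illustrative only and assumes a scikit-learn build that includes these changes; the step names and expected outputs follow the doctest added to ``doc/modules/compose.rst``.

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

iris = load_iris()
pipe = Pipeline(steps=[
    ('select', SelectKBest(k=2)),
    ('clf', LogisticRegression())])
pipe.fit(iris.data, iris.target)

# Fitting propagates generated names x0, x1, ... through the steps, so the
# final estimator only sees the two selected columns.
print(pipe.named_steps.clf.input_features_)   # expected: ['x2' 'x3']

# Passing the real column names re-runs the propagation with those names;
# the call itself returns None because LogisticRegression is not a
# transformer and has no get_feature_names.
pipe.get_feature_names(iris.feature_names)
print(pipe.named_steps.clf.input_features_)
# expected: ['petal length (cm)' 'petal width (cm)']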