Feature names with input features by amueller · Pull Request #13307 · scikit-learn/scikit-learn · GitHub
Feature names with input features #13307


Closed
wants to merge 63 commits into from
Changes from all commits

63 commits
ab2acbd
work on get_feature_names for pipeline
amueller Nov 20, 2018
3bc674b
fix SimpleImputer get_feature_names
amueller Nov 20, 2018
1c4a78f
use hasattr(transform) to check whether to use final estimator in get…
amueller Nov 20, 2018
7881930
add some docstrings
amueller Nov 20, 2018
de63353
fix docstring
amueller Nov 27, 2018
8835f3b
Merge branch 'master' into pipeline_get_feature_names
amueller Feb 27, 2019
6ca8b03
add set_feature_names to pipeline, remove hack in pipeline.get_featur…
amueller Feb 27, 2019
85db7bf
Merge branch 'master' into get_input_features
amueller Feb 27, 2019
ddd0341
fix to use new _iter, deal with last transformer
amueller Feb 27, 2019
ba053ac
always call generation of feature names, generate if X has none.
amueller Feb 27, 2019
5da2207
add get_feature_names to feature selection estimators
amueller Feb 27, 2019
58d65b1
add basic test for input features in pipeline
amueller Feb 27, 2019
8026d8d
pep8, fixup docstring
amueller Feb 27, 2019
6a61ed9
add test for count vectorizer
amueller Feb 27, 2019
e0c0a54
add test for passthrough
amueller Feb 27, 2019
968163b
add tests for pandas feature names
amueller Feb 27, 2019
3fd5f6d
add feature plot with feature names to pipeline anova example
amueller Feb 27, 2019
d7c66e1
Improve the titanic column transformer example
ogrisel Feb 27, 2019
b330841
don't error when get_feature_names is not available in pipeline
amueller Feb 27, 2019
8da4ebd
start on user guide for input_features_
amueller Feb 27, 2019
533dac3
Merge branch 'get_input_features' of github.com:amueller/scikit-learn…
amueller Feb 27, 2019
372eb71
Add example for input_features_ in pipeline userguide
amueller Feb 27, 2019
66eb4e6
use self.input_features_ in get_feature_names if available.
amueller Feb 27, 2019
0d8dc70
ignore logreg deprecations
amueller Feb 27, 2019
7550aac
remove set_feature_names, reuse get_feature_names
amueller Feb 28, 2019
4287cb8
slightly easier to debug get_feature_names recursion, better test
amueller Feb 28, 2019
eb78eac
really ugly stuff to make the last 1% usecase work
amueller Feb 28, 2019
7fa6950
Merge branch 'master' into get_input_features
amueller Feb 28, 2019
d373b87
barh instead of bar in example
amueller Feb 28, 2019
4d4e6c6
test "simple" nested meta-estimator
amueller Feb 28, 2019
003fcf3
allow None in pipelines get_feature_names, don't overwrite
amueller Feb 28, 2019
f185af3
nicer error on not fitted pipeline
amueller Feb 28, 2019
acc4c76
flake8
amueller Feb 28, 2019
eef87b6
better error message, allow call to get_feature_names with None again…
amueller Feb 28, 2019
4ed56c8
replace too-smart solution with explicit simple solution for meta-est…
amueller Feb 28, 2019
8787e04
convert feature names from pandas to numpy array
amueller Feb 28, 2019
c057599
Fix get_feature_name docstrings
amueller Feb 28, 2019
fca9ac2
fix pipeline get_feature_names docstring
amueller Feb 28, 2019
d660f92
minor fix for meta-estimators with array estimators
amueller Feb 28, 2019
a74f4c4
ignore more deprecation warnings from logistic
amueller Feb 28, 2019
eefe54c
refinement of _get_sub_estimators, add crazy test
amueller Mar 1, 2019
ad48edf
typo / make crazy test pass
amueller Mar 1, 2019
4bbd8cd
add get_feature_names to TransformerMixin, overwrite in random tree e…
amueller Mar 1, 2019
8bf4960
Merge branch 'master' into get_input_features
amueller Mar 1, 2019
eb9aa52
fix docstrings
amueller Mar 1, 2019
fe4a020
add "init_" and "best_estimator_" to list of sub estimators
amueller Mar 1, 2019
750906b
pep8
amueller Mar 1, 2019
0ca6e9d
fix class name formatting, add test for pca feature names in pipeline
amueller Mar 1, 2019
7cd3dd0
ignore warnings from changing init parameters
amueller Mar 1, 2019
a85ab5e
common test for feature name length
amueller Mar 1, 2019
8001cdb
renamed one hot encoder for more intuitive feature names
amueller Mar 1, 2019
fa00af0
LDA Special case fixes
amueller Mar 1, 2019
ccfc971
only check feature names if they exist to be nice to contrib estimators
amueller Mar 1, 2019
2dae339
hackety hack
amueller Mar 1, 2019
534c4ed
Better titanic interpretation
ogrisel Mar 6, 2019
dc1c349
Phrasing in example
ogrisel Mar 7, 2019
089c65d
Apply suggestions from code review
adrinjalali Mar 7, 2019
cf86af0
Merge branch 'master' into get_input_features
amueller Mar 7, 2019
bf1b1ad
Merge branch 'get_input_features' of github.com:amueller/scikit-learn…
amueller Mar 7, 2019
bba7b8c
Merge branch 'master' into get_input_features
amueller May 31, 2019
4c17e96
fix merge issue
amueller May 31, 2019
2733d20
fix impute feature names after file was moved. merging fun
amueller May 31, 2019
bdf6cb7
Merge branch 'master' into get_input_features
amueller May 21, 2020
38 changes: 32 additions & 6 deletions doc/modules/compose.rst
@@ -139,6 +139,32 @@ or by name::
>>> pipe['reduce_dim']
PCA()

To enable model inspection, `Pipeline` sets an ``input_features_`` attribute on
all pipeline steps during fitting. This allows the user to understand how
features are transformed as they pass through a pipeline::

>>> from sklearn.datasets import load_iris
>>> from sklearn.feature_selection import SelectKBest
>>> iris = load_iris()
>>> pipe = Pipeline(steps=[
... ('select', SelectKBest(k=2)),
... ('clf', LogisticRegression())])
>>> pipe.fit(iris.data, iris.target)
... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
Pipeline(memory=None,
steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))])
>>> pipe.named_steps.clf.input_features_
array(['x2', 'x3'], dtype='<U2')

You can also provide custom feature names for a more human readable format using
``get_feature_names``::

>>> pipe.get_feature_names(iris.feature_names)
Member:

This returning nothing is strange. It's more of "set_input_feature_names".

>>> pipe.named_steps.select.input_features_
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
>>> pipe.named_steps.clf.input_features_
array(['petal length (cm)', 'petal width (cm)'], dtype='<U17')
Comment on lines +162 to +166
Member:

Calling get_feature_names changes the attribute input_features_. Do we have another place where we update an attribute outside of fit?

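The propagation described above can be sketched in plain Python. Everything below is a hypothetical stand-in (the `Select` class is not a scikit-learn estimator); it only illustrates the idea of each step recording the names it receives and reporting the names of its outputs:

```python
class Select:
    """Hypothetical transformer that keeps only the features in `indices`."""
    def __init__(self, indices):
        self.indices = indices

    def get_feature_names(self, input_features):
        return [input_features[i] for i in self.indices]


def propagate_feature_names(steps, input_features):
    names = list(input_features)
    for name, transformer in steps:
        # mirror the PR: record what each step received ...
        transformer.input_features_ = names
        # ... and ask it for the names of its outputs
        names = transformer.get_feature_names(names)
    return names


iris_names = ['sepal length (cm)', 'sepal width (cm)',
              'petal length (cm)', 'petal width (cm)']
select = Select([2, 3])
final = propagate_feature_names([('select', select)], iris_names)
```

With the stand-in above, `select.input_features_` holds the full iris names and `final` holds the two surviving petal features, matching the doctest output shown above.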

.. topic:: Examples:

* :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py`
@@ -428,7 +454,7 @@ By default, the remaining rating columns are ignored (``remainder='drop'``)::
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> from sklearn.preprocessing import OneHotEncoder
>>> column_trans = ColumnTransformer(
... [('city_category', OneHotEncoder(dtype='int'),['city']),
... [('categories', OneHotEncoder(dtype='int'),['city']),
... ('title_bow', CountVectorizer(), 'title')],
... remainder='drop')

@@ -438,11 +464,11 @@ By default, the remaining rating columns are ignored (``remainder='drop'``)::
('title_bow', CountVectorizer(), 'title')])

>>> column_trans.get_feature_names()
['city_category__x0_London', 'city_category__x0_Paris', 'city_category__x0_Sallisaw',
'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his',
'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable',
'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson',
'title_bow__wrath']
['categories__city_London', 'categories__city_Paris',
'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast',
'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last',
'title_bow__learned', 'title_bow__moveable', 'title_bow__of', 'title_bow__the',
'title_bow__trick', 'title_bow__watson', 'title_bow__wrath']

>>> column_trans.transform(X).toarray()
array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
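The ``<transformer name>__<feature name>`` prefixing shown above can be sketched standalone; the transformer names and feature lists below are illustrative, not taken from a fitted estimator:

```python
def prefixed_feature_names(named_outputs):
    # ColumnTransformer joins each transformer's name and its output
    # feature names with a double underscore.
    return ['{}__{}'.format(name, feat)
            for name, feats in named_outputs
            for feat in feats]


names = prefixed_feature_names([
    ('categories', ['city_London', 'city_Paris']),
    ('title_bow', ['bow', 'feast']),
])
```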
44 changes: 44 additions & 0 deletions examples/compose/plot_column_transformer_mixed_types.py
@@ -145,6 +145,50 @@
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


###############################################################################
# Inspecting the coefficient values of the classifier
###############################################################################
# The coefficients of the final classification step of the pipeline give an
# idea of how each feature impacts the likelihood of survival, assuming that
# the usual linear model assumptions hold (uncorrelated features, linear
# separability, homoscedastic errors...), which we do not verify in this
# example.
#
# To get error bars we perform cross-validation and compute the mean and
# standard deviation for each coefficient across CV splits. Because we use a
# standard scaler on the numerical features, the coefficient weights give us
# an idea of how much the log odds of surviving are impacted by a change in
# this dimension contrasted to the mean. Note that the categorical features
# here are overspecified, which makes it slightly harder to interpret because
# of the information redundancy.
#
# We can see that the linear model coefficients are in agreement with the
# historical reports: people in higher classes and therefore in the upper decks
# were the first to reach the lifeboats, and often, priority was given to women
# and children.
#
# Note that conditioned on the "pclass_x" one-hot features, the "fare"
# numerical feature does not seem to be significantly predictive. If we drop
# the "pclass" feature, then higher "fare" values would appear significantly
# correlated with a higher likelihood of survival as the "fare" and "pclass"
# features have a strong statistical dependency.

import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedShuffleSplit

cv = StratifiedShuffleSplit(n_splits=20, test_size=0.25, random_state=42)
cv_results = cross_validate(clf, X_train, y_train, cv=cv,
return_estimator=True)
cv_coefs = np.concatenate([cv_pipeline.named_steps["classifier"].coef_
Member: As mentioned IRL, I think that we could expose the last estimator of the pipeline as "last_estimator_" and use it here so that this code snippet (which is a very common one) is independent of the name given to the last step:

    cv_pipeline.last_estimator_.coef_

and below

    pipeline.last_estimator_.input_features_

Member: If we do so, I guess last_step_ would be better so that we don't have to deal with the last step potentially being only a transformer and so on.

Member: A transformer is regarded as an estimator. I think last_step_ is sufficient and succinct, though.

Member (Author): we now have [-1] which I think is fine.
for cv_pipeline in cv_results["estimator"]])
fig, ax = plt.subplots()
ax.barh(clf.named_steps["classifier"].input_features_,
cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0))
plt.tight_layout()
plt.show()

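The mean/standard-deviation computation across CV splits can be sketched with the standard library alone; the coefficient values below are made up for illustration (one row per CV split, one column per feature):

```python
import statistics

# hypothetical coefficients from three CV splits, two features each
split_coefs = [
    [0.5, -1.0],
    [0.7, -0.8],
    [0.6, -0.9],
]

# transpose so each entry collects one feature's coefficient across splits
per_feature = list(zip(*split_coefs))
means = [statistics.mean(c) for c in per_feature]
stds = [statistics.stdev(c) for c in per_feature]
```

In the example, `np.concatenate` over `cv_results["estimator"]` plays the role of `split_coefs`, and `mean(axis=0)` / `std(axis=0)` play the roles of `means` / `stds`.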
###############################################################################
# The resulting score is not exactly the same as the one from the previous
# pipeline because the dtype-based selector treats the ``pclass`` columns as
7 changes: 5 additions & 2 deletions examples/feature_selection/plot_feature_selection_pipeline.py
@@ -9,6 +9,7 @@
Using a sub-pipeline, the fitted coefficients can be mapped back into
the original feature space.
"""
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_regression
@@ -36,5 +37,7 @@
y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))

coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_)
print(coef)
# access and plot the coefficients of the fitted model
plt.barh((0, 1, 2), anova_svm[-1].coef_.ravel())
plt.yticks((0, 1, 2), anova_svm[-1].input_features_)
plt.show()
117 changes: 117 additions & 0 deletions sklearn/base.py
@@ -6,20 +6,23 @@
import copy
import warnings
from collections import defaultdict

import platform
import inspect
import re

import numpy as np

from . import __version__
from .exceptions import NotFittedError
from ._config import get_config
from .utils import _IS_32BIT
from .utils.validation import check_X_y
from .utils.validation import check_array
from .utils._estimator_html_repr import estimator_html_repr
from .utils.validation import _deprecate_positional_args


_DEFAULT_TAGS = {
'non_deterministic': False,
'requires_positive_X': False,
@@ -688,6 +691,49 @@ def fit_transform(self, X, y=None, **fit_params):
# fit method of arity 2 (supervised transformation)
return self.fit(X, y, **fit_params).transform(X)

def get_feature_names(self, input_features=None):
"""Get output feature names.

Parameters
----------
input_features : list of string or None
String names of the input features.

Returns
-------
output_feature_names : list of string
Feature names for transformer output.
"""
# OneToOneMixin is higher in the class hierarchy
# because we put mixins on the wrong side
if hasattr(super(), 'get_feature_names'):
Member: use try-except? It's more efficient...

return super().get_feature_names(input_features)
# generate feature names from class name by default
# would be much less guessing if we stored the number
# of output features.
# Ideally this would be done in each class.
if hasattr(self, 'n_clusters'):
# this is before n_components_
# because n_components_ means something else
# in agglomerative clustering
n_features = self.n_clusters
elif hasattr(self, '_max_components'):
# special case for LinearDiscriminantAnalysis
n_components = self.n_components or np.inf
n_features = min(self._max_components, n_components)
elif hasattr(self, 'n_components_'):
# n_components could be auto or None
# this is more likely to be an int
n_features = self.n_components_
elif hasattr(self, 'n_components') and self.n_components is not None:
n_features = self.n_components
elif hasattr(self, 'components_'):
n_features = self.components_.shape[0]
else:
return None
return ["{}{}".format(type(self).__name__.lower(), i)
for i in range(n_features)]

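The fallback above can be exercised standalone; `FakePCA` is a hypothetical stand-in that only mimics the ``n_components_`` attribute, and only that one branch of the attribute chain is sketched here:

```python
class FakePCA:
    """Hypothetical transformer exposing only n_components_."""
    n_components_ = 2


def default_feature_names(est):
    # mirror the fallback: use the reported output width if available ...
    if hasattr(est, 'n_components_'):
        n_features = est.n_components_
    else:
        return None
    # ... and generate "<lowercased class name><index>" names
    return ['{}{}'.format(type(est).__name__.lower(), i)
            for i in range(n_features)]


names = default_feature_names(FakePCA())
```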

class DensityMixin:
"""Mixin class for all density estimators in scikit-learn."""
@@ -736,10 +782,81 @@ def fit_predict(self, X, y=None):
return self.fit(X).predict(X)


class OneToOneMixin(object):
"""Provides get_feature_names for simple transformers

Assumes there's a 1-to-1 correspondence between input features
and output features.
"""

def get_feature_names(self, input_features=None):
"""Get feature names for transformation.

Returns input_features as this transformation
doesn't add or drop features.

Parameters
----------
input_features : array-like of string
Input feature names.

Returns
-------
feature_names : array-like of string
Transformed feature names
"""
if input_features is not None:
return input_features
else:
raise ValueError("Don't know how to get"
" input feature names for {}".format(self))

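A minimal standalone sketch of the one-to-one behaviour; `OneToOne` and `FakeScaler` are hypothetical stand-ins, not the scikit-learn classes:

```python
class OneToOne:
    """Hypothetical mixin: output names are exactly the input names."""
    def get_feature_names(self, input_features=None):
        if input_features is not None:
            return input_features
        raise ValueError("Don't know how to get"
                         " input feature names for {}".format(self))


class FakeScaler(OneToOne):
    pass


names = FakeScaler().get_feature_names(['age', 'fare'])
```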

def _get_sub_estimators(est):
Member: Shouldn't we solve this issue by adding a "sub_estimators_" attribute on all meta-estimators?

Member (Author): Why is that easier / better?

Member: Well, it's the meta-estimator's responsibility to define what its sub-estimators are. The list above should be modified when we add meta-estimators. It cannot be extended by other packages implementing meta-estimators. I find that in an object-oriented design, it is more natural that each class deals with specifying its own functionality.

Member (Author): If we add meta-estimators that don't obey our conventions a test will fail. Other packages implementing meta-estimators should either obey our conventions or implement their own get_feature_names. If you wanted me to implement this on the meta-estimator, I would probably just make this a method of the MetaEstimatorMixin. Then other packages / estimators could overwrite that method instead of overwriting get_feature_names. I don't think that's better.

Member: Indeed, there are several ways to delegate the responsibility to the meta-estimator, either via delegating to a method, or to an attribute. The attribute seemed pretty light to me. With regards to avoiding premature test failures, we can simply pass when sub_estimator_ is not present.

Member (Author): A new sub-estimator was added while this discussion was happening, lol. The reason why I don't like the attribute is that it makes our API contract more verbose. I'm ok with doing it, but in the end this will make it harder for people to contribute by increasing the number of required attributes (I'd rather add n_input_features_ and n_output_features_ first ;). I'm not sure I understand your last sentence. I don't really care that much in either direction here; it's an implementation detail that's not really important to the larger feature addition, I think.

# Explicitly declare all fitted subestimators of existing meta-estimators
sub_ests = []
# OHE is not really needed
sub_names = ['estimator_', 'base_estimator_', 'one_hot_encoder_',
'best_estimator_', 'init_']
for name in sub_names:
sub_est = getattr(est, name, None)
if sub_est is not None:
sub_ests.append(sub_est)
if hasattr(est, "estimators_"):
if hasattr(est.estimators_, 'shape'):
sub_ests.extend(est.estimators_.ravel())
else:
sub_ests.extend(est.estimators_)
return sub_ests

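The attribute-based discovery can be sketched on a hypothetical meta-estimator; `FakeBagging` is made up, and the `ravel()` handling for ndarray-valued `estimators_` is omitted for brevity:

```python
SUB_NAMES = ['estimator_', 'base_estimator_', 'one_hot_encoder_',
             'best_estimator_', 'init_']


def get_sub_estimators(est):
    # collect singly-named fitted sub-estimators first ...
    subs = [getattr(est, name) for name in SUB_NAMES
            if getattr(est, name, None) is not None]
    # ... then any list of fitted estimators
    if hasattr(est, 'estimators_'):
        subs.extend(est.estimators_)
    return subs


class FakeBagging:
    """Hypothetical bagging-like meta-estimator after fitting."""
    base_estimator_ = 'template-tree'
    estimators_ = ['tree-0', 'tree-1']


subs = get_sub_estimators(FakeBagging())
```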

class MetaEstimatorMixin:
_required_parameters = ["estimator"]
"""Mixin class for all meta estimators in scikit-learn."""

def get_feature_names(self, input_features=None):
"""Ensure feature names are set on sub-estimators

Parameters
----------
input_features : list of string or None
Input features to the meta-estimator.
"""
sub_ests = _get_sub_estimators(self)
for est in sub_ests:
est.input_features_ = input_features
if hasattr(est, "get_feature_names"):
# doing hasattr instead of a try-except on everything
# b/c catching AttributeError makes recursive code
# impossible to debug
try:
est.get_feature_names(input_features=input_features)
except TypeError:
# do we need this?
est.get_feature_names()
except NotFittedError:
pass
Member: Can a meta-estimator sometimes be a transformer? If so, we should return feature names here in that specific case?

Member (Author): SelectFromModel is both.



class MultiOutputMixin:
"""Mixin to mark estimators that support multioutput."""
6 changes: 5 additions & 1 deletion sklearn/compose/_column_transformer.py
@@ -371,8 +371,12 @@ def get_feature_names(self):
raise AttributeError("Transformer %s (type %s) does not "
"provide get_feature_names."
% (str(name), type(trans).__name__))
try:
more_names = trans.get_feature_names(input_features=column)
except TypeError:
more_names = trans.get_feature_names()
feature_names.extend([name + "__" + f for f in
trans.get_feature_names()])
more_names])
return feature_names

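The try/except above keeps the call compatible with transformers whose ``get_feature_names`` does not (yet) accept ``input_features``. A standalone sketch with two hypothetical transformer classes:

```python
class NewStyle:
    """Hypothetical transformer accepting input_features."""
    def get_feature_names(self, input_features=None):
        return ['%s_enc' % f for f in input_features]


class OldStyle:
    """Hypothetical transformer with the old zero-argument signature."""
    def get_feature_names(self):
        return ['tok0', 'tok1']


def call_get_feature_names(trans, column):
    try:
        return trans.get_feature_names(input_features=column)
    except TypeError:
        # old signature: fall back to calling without input_features
        return trans.get_feature_names()


new_names = call_get_feature_names(NewStyle(), ['city'])
old_names = call_get_feature_names(OldStyle(), ['title'])
```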
def _update_fitted_transformers(self, transformers):
13 changes: 13 additions & 0 deletions sklearn/compose/tests/test_column_transformer.py
@@ -23,6 +23,7 @@
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline


class Trans(BaseEstimator):
@@ -659,6 +660,18 @@ def test_column_transformer_get_feature_names():
assert_raise_message(AttributeError,
"Transformer trans (type Trans) does not provide "
"get_feature_names", ct.get_feature_names)

# if some transformers support and some don't
ct = ColumnTransformer([('trans', Trans(), [0, 1]),
('scale', StandardScaler(), [0])])
ct.fit(X_array)
assert_raise_message(AttributeError,
"Transformer trans (type Trans) does not provide "
"get_feature_names", ct.get_feature_names)

# inside a pipeline
make_pipeline(ct).fit(X_array)


# working example
X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}],
9 changes: 9 additions & 0 deletions sklearn/ensemble/_forest.py
@@ -2357,3 +2357,12 @@ def transform(self, X):
"""
check_is_fitted(self)
return self.one_hot_encoder_.transform(self.apply(X))

def get_feature_names(self, input_features=None):
"""Feature names - not implemented yet.

Parameters
----------
input_features : list of strings or None
"""
return None
15 changes: 15 additions & 0 deletions sklearn/feature_selection/_base.py
@@ -125,6 +125,21 @@ def inverse_transform(self, X):
Xt[:, support] = X
return Xt

def get_feature_names(self, input_features=None):
"""Mask feature names according to selected features.

Parameters
----------
input_features : list of string or None
Input features to select from. If None, they are generated as
x0, x1, ..., xn.
"""
mask = self.get_support()
if input_features is None:
input_features = ['x%d' % i
for i in range(mask.shape[0])]
return np.array(input_features)[mask]

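The masking logic can be sketched without NumPy; the mask and names below are illustrative, chosen to mirror the iris example earlier in this PR:

```python
def selected_feature_names(mask, input_features=None):
    # generate x0, x1, ... when no names were provided
    if input_features is None:
        input_features = ['x%d' % i for i in range(len(mask))]
    # keep only the names whose support-mask entry is True
    return [f for f, keep in zip(input_features, mask) if keep]


kept = selected_feature_names(
    [False, False, True, True],
    ['sepal length (cm)', 'sepal width (cm)',
     'petal length (cm)', 'petal width (cm)'])
```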

def _get_feature_importances(estimator, getter, transform_func=None,
norm_order=1):
Expand Down