From ab2acbd29bcb88ef00b039dd75e5a45d1e59c17b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 10:52:05 -0500 Subject: [PATCH 01/54] work on get_feature_names for pipeline --- sklearn/base.py | 15 +++++++++++++++ sklearn/compose/_column_transformer.py | 8 ++++++-- sklearn/impute.py | 4 ++-- sklearn/pipeline.py | 15 +++++++++++++++ sklearn/preprocessing/data.py | 18 +++++++++--------- 5 files changed, 47 insertions(+), 13 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 34998270cea88..b474f774bcf1c 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -509,6 +509,21 @@ def fit_predict(self, X, y=None): return self.fit(X).predict(X) +class OneToOneMixin(object): + """Provides get_feature_names for simple transformers + + Assumes there's a 1-to-1 correspondence between input features + and output features. + """ + + def get_feature_names(self, input_features=None): + if input_features is not None: + return input_features + else: + raise ValueError("Don't know how to get" + " input feature names for {}".format(self)) + + ############################################################################### class MetaEstimatorMixin(object): """Mixin class for all meta estimators in scikit-learn.""" diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 990374c27affe..540b49fc0852b 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -325,7 +325,7 @@ def get_feature_names(self): """ check_is_fitted(self, 'transformers_') feature_names = [] - for name, trans, _, _ in self._iter(fitted=True): + for name, trans, columns, _ in self._iter(fitted=True): if trans == 'drop': continue elif trans == 'passthrough': @@ -336,8 +336,12 @@ def get_feature_names(self): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names." 
% (str(name), type(trans).__name__)) + try: + more_names = trans.get_feature_names(input_features=columns) + except TypeError: + more_names = trans.get_feature_names() feature_names.extend([name + "__" + f for f in - trans.get_feature_names()]) + more_names]) return feature_names def _update_fitted_transformers(self, transformers): diff --git a/sklearn/impute.py b/sklearn/impute.py index a10f6c9eb947f..e55a7a7e19e57 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -11,7 +11,7 @@ from scipy import sparse from scipy import stats -from .base import BaseEstimator, TransformerMixin +from .base import BaseEstimator, TransformerMixin, OneToOneMixin from .utils import check_array from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted @@ -90,7 +90,7 @@ def _most_frequent(array, extra_value, n_repeat): return extra_value -class SimpleImputer(BaseEstimator, TransformerMixin): +class SimpleImputer(BaseEstimator, TransformerMixin, OneToOneMixin): """Imputation transformer for completing missing values. Read more in the :ref:`User Guide `. 
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 3f69f5c18558f..0d370b2283a64 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -531,6 +531,21 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) + def get_feature_names(self, input_features=None): + feature_names = input_features + with_final = hasattr(self._final_estimator, "get_feature_names") + + for name, transform in self._iter(with_final=with_final): + if not hasattr(transform, "get_feature_names"): + raise TypeError("Transformer {} does provide" + " get_feature_names".format(name)) + try: + feature_names = transform.get_feature_names( + input_features=feature_names) + except TypeError: + feature_names = transform.get_feature_names() + return feature_names + def _name_estimators(estimators): """Generate names for estimators.""" diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 77c2d2cc970fc..d2167b683cdd9 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -19,7 +19,7 @@ from scipy import stats from scipy import optimize -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, OneToOneMixin from ..externals import six from ..utils import check_array from ..utils.extmath import row_norms @@ -199,7 +199,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True): return X -class MinMaxScaler(BaseEstimator, TransformerMixin): +class MinMaxScaler(BaseEstimator, TransformerMixin, OneToOneMixin): """Transforms features by scaling each feature to a given range. 
This estimator scales and translates each feature individually such @@ -477,7 +477,7 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): return X -class StandardScaler(BaseEstimator, TransformerMixin): +class StandardScaler(BaseEstimator, TransformerMixin, OneToOneMixin): """Standardize features by removing the mean and scaling to unit variance The standard score of a sample `x` is calculated as: @@ -798,7 +798,7 @@ def inverse_transform(self, X, copy=None): return X -class MaxAbsScaler(BaseEstimator, TransformerMixin): +class MaxAbsScaler(BaseEstimator, TransformerMixin, OneToOneMixin): """Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such @@ -1024,7 +1024,7 @@ def maxabs_scale(X, axis=0, copy=True): return X -class RobustScaler(BaseEstimator, TransformerMixin): +class RobustScaler(BaseEstimator, TransformerMixin, OneToOneMixin): """Scale features using statistics that are robust to outliers. This Scaler removes the median and scales the data according to @@ -1619,7 +1619,7 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): return X -class Normalizer(BaseEstimator, TransformerMixin): +class Normalizer(BaseEstimator, TransformerMixin, OneToOneMixin): """Normalize samples individually to unit norm. Each sample (i.e. 
each row of the data matrix) with at least one @@ -1754,7 +1754,7 @@ def binarize(X, threshold=0.0, copy=True): return X -class Binarizer(BaseEstimator, TransformerMixin): +class Binarizer(BaseEstimator, TransformerMixin, OneToOneMixin): """Binarize data (set feature values to 0 or 1) according to a threshold Values greater than the threshold map to 1, while values less than @@ -1988,7 +1988,7 @@ def add_dummy_feature(X, value=1.0): return np.hstack((np.full((n_samples, 1), value), X)) -class QuantileTransformer(BaseEstimator, TransformerMixin): +class QuantileTransformer(BaseEstimator, TransformerMixin, OneToOneMixin): """Transform features using quantiles information. This method transforms the features to follow a uniform or a normal @@ -2488,7 +2488,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, " axis={}".format(axis)) -class PowerTransformer(BaseEstimator, TransformerMixin): +class PowerTransformer(BaseEstimator, TransformerMixin, OneToOneMixin): """Apply a power transform featurewise to make data more Gaussian-like. 
Power transforms are a family of parametric, monotonic transformations From 3bc674b5cd24b203d04a35ca3ae552fcd8e094b9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 13:27:20 -0500 Subject: [PATCH 02/54] fix SimpleImputer get_feature_names --- sklearn/impute.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index e55a7a7e19e57..d16e7479dd3a4 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -11,7 +11,7 @@ from scipy import sparse from scipy import stats -from .base import BaseEstimator, TransformerMixin, OneToOneMixin +from .base import BaseEstimator, TransformerMixin from .utils import check_array from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted @@ -90,7 +90,7 @@ def _most_frequent(array, extra_value, n_repeat): return extra_value -class SimpleImputer(BaseEstimator, TransformerMixin, OneToOneMixin): +class SimpleImputer(BaseEstimator, TransformerMixin): """Imputation transformer for completing missing values. Read more in the :ref:`User Guide `. 
@@ -257,7 +257,8 @@ def fit(self, X, y=None): self.strategy, self.missing_values, fill_value) - + invalid_mask = _get_mask(self.statistics_, np.nan) + self._valid_mask = np.logical_not(invalid_mask) return self def _sparse_fit(self, X, strategy, missing_values, fill_value): @@ -373,8 +374,8 @@ def transform(self, X): valid_statistics = statistics else: # same as np.isnan but also works for object dtypes - invalid_mask = _get_mask(statistics, np.nan) - valid_mask = np.logical_not(invalid_mask) + valid_mask = self._valid_mask + invalid_mask = np.logical_not(valid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.flatnonzero(valid_mask) @@ -408,6 +409,11 @@ def transform(self, X): return X + def get_feature_names(self, input_features=None): + if input_features is None: + raise TypeError("Don't have input_features") + return np.array(input_features)[self._valid_mask] + class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. 
From 1c4a78f976f4498920c0c5de3530d9212f728796 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 15:01:49 -0500 Subject: [PATCH 03/54] use hasattr(transform) to check whether to use final estimator in get_feature_names --- sklearn/pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 0d370b2283a64..6388d4eea1844 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -533,8 +533,7 @@ def _pairwise(self): def get_feature_names(self, input_features=None): feature_names = input_features - with_final = hasattr(self._final_estimator, "get_feature_names") - + with_final = hasattr(self._final_estimator, "transform") for name, transform in self._iter(with_final=with_final): if not hasattr(transform, "get_feature_names"): raise TypeError("Transformer {} does provide" From 788193061f5c233cdf707c5c281cca40be5f47a7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 15:39:56 -0500 Subject: [PATCH 04/54] add some docstrings --- sklearn/base.py | 15 +++++++++++++++ sklearn/impute.py | 12 ++++++++++++ sklearn/pipeline.py | 16 ++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index b474f774bcf1c..1629fac63503e 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -517,6 +517,21 @@ class OneToOneMixin(object): """ def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Returns input_features as this transformation + doesn't add or drop features. + + Parameters + ---------- + input_feature : array-like of string + Input feature names. 
+ + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ if input_features is not None: return input_features else: diff --git a/sklearn/impute.py b/sklearn/impute.py index d16e7479dd3a4..5b23ab8f866a8 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -410,6 +410,18 @@ def transform(self, X): return X def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Parameters + ---------- + input_feature : array-like of string + Input feature names. + + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ if input_features is None: raise TypeError("Don't have input_features") return np.array(input_features)[self._valid_mask] diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6388d4eea1844..64e37e29b0365 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -532,6 +532,22 @@ def _pairwise(self): return getattr(self.steps[0][1], '_pairwise', False) def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Transform input features using the pipeline. + If the last step is a transformer, it's included + in the transformation, otherwise it's not. + + Parameters + ---------- + input_feature : array-like of string + Input feature names. 
+ + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ feature_names = input_features with_final = hasattr(self._final_estimator, "transform") for name, transform in self._iter(with_final=with_final): From de63353cd8e48d2dd14194c28ad33bf23053147f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 27 Nov 2018 15:33:51 -0500 Subject: [PATCH 05/54] fix docstring --- sklearn/base.py | 2 +- sklearn/impute.py | 2 +- sklearn/pipeline.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 1629fac63503e..392eb6ed60573 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -524,7 +524,7 @@ def get_feature_names(self, input_features=None): Parameters ---------- - input_feature : array-like of string + input_features : array-like of string Input feature names. Returns diff --git a/sklearn/impute.py b/sklearn/impute.py index 5b23ab8f866a8..c2368731e148e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -414,7 +414,7 @@ def get_feature_names(self, input_features=None): Parameters ---------- - input_feature : array-like of string + input_features : array-like of string Input feature names. Returns diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 64e37e29b0365..3fb3119941491 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -540,7 +540,7 @@ def get_feature_names(self, input_features=None): Parameters ---------- - input_feature : array-like of string + input_features : array-like of string Input feature names. 
Returns From 6ca8b0360c02d71f899cec12380ffec0b1fcb983 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 13:49:59 +0100 Subject: [PATCH 06/54] add set_feature_names to pipeline, remove hack in pipeline.get_feature_names --- sklearn/pipeline.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index eba2d42abe7fd..b2bac34ae08ae 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -246,6 +246,10 @@ def _fit(self, X, y=None, **fit_params): # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) + + if hasattr(X, 'columns'): + self.set_feature_names(X.columns) + if self._final_estimator == 'passthrough': return Xt, {} return Xt, fit_params_steps[self.steps[-1][0]] @@ -529,6 +533,20 @@ def classes_(self): def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) + + def set_feature_names(self, input_features): + self.input_features_ = input_features + feature_names = input_features + for name, transform in self._iter(with_final=True): + transform.input_features_ = feature_names + if not hasattr(transform, "get_feature_names"): + raise TypeError("Transformer {} does provide" + " get_feature_names".format(name)) + try: + feature_names = transform.get_feature_names( + input_features=feature_names) + except TypeError: + feature_names = transform.get_feature_names() def get_feature_names(self, input_features=None): """Get feature names for transformation. 
@@ -548,8 +566,7 @@ def get_feature_names(self, input_features=None): Transformed feature names """ feature_names = input_features - with_final = hasattr(self._final_estimator, "transform") - for name, transform in self._iter(with_final=with_final): + for name, transform in self._iter(with_final=True): if not hasattr(transform, "get_feature_names"): raise TypeError("Transformer {} does provide" " get_feature_names".format(name)) From ddd03413c010415b9806d519cf9ac3f730e41827 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 14:08:15 +0100 Subject: [PATCH 07/54] fix to use new _iter, deal with last transformer --- sklearn/pipeline.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index b2bac34ae08ae..1b2dd98d2180c 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -247,9 +247,6 @@ def _fit(self, X, y=None, **fit_params): # from the cache. self.steps[step_idx] = (name, fitted_transformer) - if hasattr(X, 'columns'): - self.set_feature_names(X.columns) - if self._final_estimator == 'passthrough': return Xt, {} return Xt, fit_params_steps[self.steps[-1][0]] @@ -283,6 +280,10 @@ def fit(self, X, y=None, **fit_params): Xt, fit_params = self._fit(X, y, **fit_params) if self._final_estimator != 'passthrough': self._final_estimator.fit(Xt, y, **fit_params) + + if hasattr(X, 'columns'): + self.set_feature_names(X.columns) + return self def fit_transform(self, X, y=None, **fit_params): @@ -315,11 +316,14 @@ def fit_transform(self, X, y=None, **fit_params): last_step = self._final_estimator Xt, fit_params = self._fit(X, y, **fit_params) if hasattr(last_step, 'fit_transform'): - return last_step.fit_transform(Xt, y, **fit_params) - elif last_step == 'passthrough': - return Xt - else: - return last_step.fit(Xt, y, **fit_params).transform(Xt) + Xt = last_step.fit_transform(Xt, y, **fit_params) + elif last_step != 'passthrough': + Xt = last_step.fit(Xt, y, 
**fit_params).transform(Xt) + + if hasattr(X, 'columns'): + self.set_feature_names(X.columns) + + return Xt @if_delegate_has_method(delegate='_final_estimator') def predict(self, X, **predict_params): @@ -533,11 +537,11 @@ def classes_(self): def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) - + def set_feature_names(self, input_features): self.input_features_ = input_features feature_names = input_features - for name, transform in self._iter(with_final=True): + for _, name, transform in self._iter(with_final=False): transform.input_features_ = feature_names if not hasattr(transform, "get_feature_names"): raise TypeError("Transformer {} does provide" @@ -547,6 +551,7 @@ def set_feature_names(self, input_features): input_features=feature_names) except TypeError: feature_names = transform.get_feature_names() + self._final_estimator.input_features_ = feature_names def get_feature_names(self, input_features=None): """Get feature names for transformation. @@ -566,7 +571,7 @@ def get_feature_names(self, input_features=None): Transformed feature names """ feature_names = input_features - for name, transform in self._iter(with_final=True): + for _, name, transform in self._iter(with_final=True): if not hasattr(transform, "get_feature_names"): raise TypeError("Transformer {} does provide" " get_feature_names".format(name)) From ba053acdb619ea11c1c4831eeb25746ac715fa74 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 15:12:36 +0100 Subject: [PATCH 08/54] always call generation of feature names, generate if X has none. 
--- sklearn/impute.py | 3 ++- sklearn/pipeline.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 95dfe046537a3..a201db198c384 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -437,7 +437,8 @@ def get_feature_names(self, input_features=None): Transformed feature names """ if input_features is None: - raise TypeError("Don't have input_features") + input_features = ['x%d' % i + for i in range(self.statistics_.shape[0])] return np.array(input_features)[self._valid_mask] def _more_tags(self): diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 1b2dd98d2180c..847760c64dc61 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -26,6 +26,17 @@ __all__ = ['Pipeline', 'FeatureUnion', 'make_pipeline', 'make_union'] +def _get_feature_names(X): + if hasattr(X, 'columns'): + feature_names = X.columns + elif getattr(X, 'ndim', 0) > 1: + feature_names = getattr(X, 'columns', + ['x%d' % i for i in range(X.shape[1])]) + else: + feature_names = None + return feature_names + + class Pipeline(_BaseComposition): """Pipeline of transforms with a final estimator. @@ -280,9 +291,7 @@ def fit(self, X, y=None, **fit_params): Xt, fit_params = self._fit(X, y, **fit_params) if self._final_estimator != 'passthrough': self._final_estimator.fit(Xt, y, **fit_params) - - if hasattr(X, 'columns'): - self.set_feature_names(X.columns) + self.set_feature_names(_get_feature_names(X)) return self @@ -320,8 +329,7 @@ def fit_transform(self, X, y=None, **fit_params): elif last_step != 'passthrough': Xt = last_step.fit(Xt, y, **fit_params).transform(Xt) - if hasattr(X, 'columns'): - self.set_feature_names(X.columns) + self.set_feature_names(_get_feature_names(X)) return Xt @@ -539,6 +547,20 @@ def _pairwise(self): return getattr(self.steps[0][1], '_pairwise', False) def set_feature_names(self, input_features): + """Set the input feature names for all steps. 
+ + Sets the input_features_ attribute on the pipeline and + on all pipeline steps using the provided input feature names + as input for the first step. + + Some estimators like `ColumnTransformer` and `CountVectorizer` + might ignore the provided input feature names. + + Parameters + ---------- + + + """ self.input_features_ = input_features feature_names = input_features for _, name, transform in self._iter(with_final=False): From 5da22070f98fe614edee20fec2c5bafacb0c0ac4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 15:28:29 +0100 Subject: [PATCH 09/54] add get_feature_names to feature selection estimators --- sklearn/feature_selection/base.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/feature_selection/base.py b/sklearn/feature_selection/base.py index 5add330188f78..3a92a0145a718 100644 --- a/sklearn/feature_selection/base.py +++ b/sklearn/feature_selection/base.py @@ -119,3 +119,10 @@ def inverse_transform(self, X): Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype) Xt[:, support] = X return Xt + + def get_feature_names(self, input_features=None): + mask = self.get_support() + if input_features is None: + input_features = ['x%d' % i + for i in range(mask.shape[0])] + return np.array(input_features)[mask] From 58d65b1ff8145207c4118e0d2cfe171aec1adf32 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 15:28:40 +0100 Subject: [PATCH 10/54] add basic test for input features in pipeline --- sklearn/tests/test_pipeline.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 259876acd1a42..db7a77a5feaf9 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -30,6 +30,7 @@ from sklearn.decomposition import PCA, TruncatedSVD from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler +from sklearn.impute import SimpleImputer from 
sklearn.feature_extraction.text import CountVectorizer from sklearn.utils._joblib import Memory from sklearn.utils._joblib import __version__ as joblib_version @@ -1048,3 +1049,29 @@ def test_make_pipeline_memory(): assert pipeline.memory is None shutil.rmtree(cachedir) + + +def test_input_feature_names_pandas(): + pass + + +def test_set_input_features(): + pipe = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler()), + ('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + mask = pipe.named_steps.select.get_support() + assert_array_equal(pipe.named_steps.clf.input_features_, xs[mask]) + pipe.set_feature_names(iris.feature_names) + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + + +def test_input_features_count_vectorizer(): + pass From 8026d8d9650d9529a102e16e928932046602a695 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 15:30:11 +0100 Subject: [PATCH 11/54] pep8, fixup docstring --- sklearn/feature_selection/base.py | 2 +- sklearn/pipeline.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/feature_selection/base.py b/sklearn/feature_selection/base.py index 3a92a0145a718..e03102989b6a0 100644 --- a/sklearn/feature_selection/base.py +++ b/sklearn/feature_selection/base.py @@ -119,7 +119,7 @@ def inverse_transform(self, X): Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype) Xt[:, support] = X return Xt - + def get_feature_names(self, input_features=None): mask = self.get_support() if input_features is None: diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 847760c64dc61..c77074711bcbd 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -548,18 +548,20 @@ def 
_pairwise(self): def set_feature_names(self, input_features): """Set the input feature names for all steps. - + Sets the input_features_ attribute on the pipeline and on all pipeline steps using the provided input feature names as input for the first step. - + Some estimators like `ColumnTransformer` and `CountVectorizer` might ignore the provided input feature names. - + Parameters ---------- - - + input_features : array-like of string or None + Feature names to use as input feature names for the first step + of the pipeline. + """ self.input_features_ = input_features feature_names = input_features From 6a61ed9dcd5ff004ce72394d1fa22477c8b57512 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 15:37:23 +0100 Subject: [PATCH 12/54] add test for count vectorizer --- sklearn/tests/test_pipeline.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index db7a77a5feaf9..f54416e939e33 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1072,6 +1072,19 @@ def test_set_input_features(): assert_array_equal(pipe.named_steps.clf.input_features_, np.array(iris.feature_names)[mask]) - -def test_input_features_count_vectorizer(): + +def test_input_features_passthrough(): pass + + +def test_input_features_count_vectorizer(): + pipe = Pipeline(steps=[ + ('vect', CountVectorizer()), + ('clf', LogisticRegression())]) + y = ["pizza" in x for x in JUNK_FOOD_DOCS] + pipe.fit(JUNK_FOOD_DOCS, y) + assert_array_equal(pipe.named_steps.clf.input_features_, + ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) + pipe.set_feature_names(["nonsense_is_ignored"]) + assert_array_equal(pipe.named_steps.clf.input_features_, + ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) \ No newline at end of file From e0c0a5400208f62b1e9007064d4a5cfc6344099d Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 15:39:59 +0100 
Subject: [PATCH 13/54] add test for passthrough --- sklearn/tests/test_pipeline.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index f54416e939e33..b7861743b7126 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1074,7 +1074,18 @@ def test_set_input_features(): def test_input_features_passthrough(): - pass + pipe = Pipeline(steps=[ + ('imputer', 'passthrough'), + ('scaler', StandardScaler()), + ('select', 'passthrough'), + ('clf', LogisticRegression())]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = ['x0', 'x1', 'x2', 'x3'] + assert_array_equal(pipe.named_steps.clf.input_features_, xs) + pipe.set_feature_names(iris.feature_names) + assert_array_equal(pipe.named_steps.clf.input_features_, + iris.feature_names) def test_input_features_count_vectorizer(): From 968163b3657a900571e6a46794efd7c09d86f3a1 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 15:43:52 +0100 Subject: [PATCH 14/54] add tests for pandas feature names --- sklearn/tests/test_pipeline.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index b7861743b7126..86df228f85abd 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1049,12 +1049,8 @@ def test_make_pipeline_memory(): assert pipeline.memory is None shutil.rmtree(cachedir) - -def test_input_feature_names_pandas(): - pass - def test_set_input_features(): pipe = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), @@ -1073,6 +1069,20 @@ def test_set_input_features(): np.array(iris.feature_names)[mask]) +def test_input_feature_names_pandas(): + pd = pytest.importorskip("pandas") + pipe = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler()), + ('select', SelectKBest(k=2)), + ('clf', 
LogisticRegression())]) + iris = load_iris() + df = pd.DataFrame(iris.data, names=iris.feature_names) + pipe.fit(df, iris.target) + assert_array_equal(pipe.named_steps.clf.input_features_, + iris.feature_names) + + def test_input_features_passthrough(): pipe = Pipeline(steps=[ ('imputer', 'passthrough'), @@ -1098,4 +1108,4 @@ def test_input_features_count_vectorizer(): ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) pipe.set_feature_names(["nonsense_is_ignored"]) assert_array_equal(pipe.named_steps.clf.input_features_, - ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) \ No newline at end of file + ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) From 3fd5f6dd7ab0a7da60baf42f32aa6d650bfdfbfe Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 16:02:27 +0100 Subject: [PATCH 15/54] add feature plot with feature names to pipeline anova example --- .../feature_selection/plot_feature_selection_pipeline.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index c4b61990ef6e5..5b470af376535 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -6,6 +6,7 @@ Simple usage of Pipeline that runs successively a univariate feature selection with anova and then a C-SVM of the selected features. 
""" +import matplotlib.pyplot as plt from sklearn import svm from sklearn.datasets import samples_generator from sklearn.feature_selection import SelectKBest, f_regression @@ -17,7 +18,7 @@ # import some data to play with X, y = samples_generator.make_classification( - n_features=20, n_informative=3, n_redundant=0, n_classes=4, + n_features=20, n_informative=3, n_redundant=0, n_classes=2, n_clusters_per_class=2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) @@ -32,3 +33,8 @@ anova_svm.fit(X_train, y_train) y_pred = anova_svm.predict(X_test) print(classification_report(y_test, y_pred)) + +# access and plot the coefficients of the fitted model +plt.bar((0, 1, 2), anova_svm.named_steps.svc.coef_.ravel()) +plt.xticks((0, 1, 2), anova_svm.named_steps.svc.input_features_) +plt.show() From d7c66e1dad99619a96001008a13f7d37305ddaaf Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 27 Feb 2019 16:03:49 +0100 Subject: [PATCH 16/54] Improve the titanic column transformer example --- .../plot_column_transformer_mixed_types.py | 46 +++++++++++++++++-- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 45898fe27e911..ef000c4a0077e 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -68,16 +68,52 @@ # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. 
-clf = Pipeline(steps=[('preprocessor', preprocessor), - ('classifier', LogisticRegression(solver='lbfgs'))]) +pipeline = Pipeline(steps=[('preprocessor', preprocessor), + ('classifier', LogisticRegression(solver='lbfgs'))]) X = data.drop('survived', axis=1) y = data['survived'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) -clf.fit(X_train, y_train) -print("model score: %.3f" % clf.score(X_test, y_test)) +pipeline.fit(X_train, y_train) +print("model score: %.3f" % pipeline.score(X_test, y_test)) + + +############################################################################### +# Introspecting the coefficient values of the classifier +############################################################################### +# The coefficients of the final classification step of the pipeline give an +# idea of how each feature impacts the likelihood of survival assuming that the +# usual linear model assumptions hold (uncorrelated features, linear +# separability, homoscedastic and normally distributed errors...) which we do +# not verify in this example. +# +# To get error bars we perform cross-validation and compute the mean and +# standard deviation for each coefficient across CV splits. Because we use a +# standard scaler on the numerical features, the coefficient weights give us +# an idea of how much the log odds of surviving are impacted by a change in +# this dimension contrasted to the mean. Note that the categorical features +# here are overspecified which makes it slightly harder to interpret because of +# the information redundancy. +# +# We can see that the linear model coefficients are in agreement with the +# historical reports: people in higher classes and therefore in the upper decks +# were first to access the lifeboats, and often, priority was given to women +# and children. 
+ +import matplotlib.pyplot as plt +from sklearn.model_selection import cross_validate + +cv_results = cross_validate(pipeline, X_train, y_train, cv=10, + return_estimator=True) +cv_coefs = np.concatenate([cv_pipeline.named_steps["classifier"].coef_ + for cv_pipeline in cv_results["estimator"]]) +fig, ax = plt.subplots() +ax.barh(pipeline.named_steps["classifier"].input_features_, + cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0)) +plt.tight_layout() +plt.show() ############################################################################### @@ -96,7 +132,7 @@ 'classifier__C': [0.1, 1.0, 10, 100], } -grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False) +grid_search = GridSearchCV(pipeline, param_grid, cv=10, iid=False) grid_search.fit(X_train, y_train) print(("best logistic regression from grid search: %.3f" From b3308417ce6867811a483876d373316206ad0638 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 17:27:58 +0100 Subject: [PATCH 17/54] don't error when get_feature_names is not available in pipeline --- sklearn/pipeline.py | 8 ++++---- sklearn/tests/test_pipeline.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index c77074711bcbd..e9562926b349a 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -567,15 +567,15 @@ def set_feature_names(self, input_features): feature_names = input_features for _, name, transform in self._iter(with_final=False): transform.input_features_ = feature_names - if not hasattr(transform, "get_feature_names"): - raise TypeError("Transformer {} does provide" - " get_feature_names".format(name)) try: feature_names = transform.get_feature_names( input_features=feature_names) except TypeError: feature_names = transform.get_feature_names() - self._final_estimator.input_features_ = feature_names + except AttributeError: + feature_names = None + if self._final_estimator != "passthrough": + self._final_estimator.input_features_ = 
feature_names def get_feature_names(self, input_features=None): """Get feature names for transformation. diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 86df228f85abd..c2ffcb2b0e703 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1077,10 +1077,11 @@ def test_input_feature_names_pandas(): ('select', SelectKBest(k=2)), ('clf', LogisticRegression())]) iris = load_iris() - df = pd.DataFrame(iris.data, names=iris.feature_names) + df = pd.DataFrame(iris.data, columns=iris.feature_names) pipe.fit(df, iris.target) + mask = pipe.named_steps.select.get_support() assert_array_equal(pipe.named_steps.clf.input_features_, - iris.feature_names) + np.array(iris.feature_names)[mask]) def test_input_features_passthrough(): From 8da4ebde7aa2c46a48b953c37936524eb38c3787 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 17:28:25 +0100 Subject: [PATCH 18/54] start on user guide for input_features_ --- doc/modules/compose.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 0145842b88e16..74ea996fd164c 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -115,6 +115,10 @@ ignored by setting them to ``'passthrough'``:: ... clf__C=[0.1, 10, 100]) >>> grid_search = GridSearchCV(pipe, param_grid=param_grid) +To enable model inspection, `Pipeline` sets a ``input_features_`` attribute on +all pipeline steps during fitting. This allows the user to understand how +features are transformed during a pipeline: + .. 
topic:: Examples: * :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py` From 372eb7180dc346d5ce829a869c25c2bc6fe6d47c Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 18:00:26 +0100 Subject: [PATCH 19/54] Add example for input_features_ in pipeline userguide --- doc/modules/compose.rst | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 74ea996fd164c..af84a1e95b4e6 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -115,9 +115,32 @@ ignored by setting them to ``'passthrough'``:: ... clf__C=[0.1, 10, 100]) >>> grid_search = GridSearchCV(pipe, param_grid=param_grid) + To enable model inspection, `Pipeline` sets a ``input_features_`` attribute on all pipeline steps during fitting. This allows the user to understand how -features are transformed during a pipeline: +features are transformed during a pipeline:: + + >>> from sklearn.datasets import load_iris + >>> from sklearn.feature_selection import SelectKBest + >>> iris = load_iris() + >>> pipe = Pipeline(steps=[ + ... ('select', SelectKBest(k=2)), + ... ('clf', LogisticRegression())]) + >>> pipe.fit(iris.data, iris.target) + ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + Pipeline(memory=None, + steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) + >>> pipe.named_steps.clf.input_features_ + array(['x2', 'x3'], dtype='>> pipe.set_feature_names(iris.feature_names) + >>> pipe.named_steps.select.input_features_ + ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] + >>> pipe.named_steps.clf.input_features_ + array(['petal length (cm)', 'petal width (cm)'], dtype='>> column_trans.get_feature_names() ... 
# doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - ['city_category__x0_London', 'city_category__x0_Paris', 'city_category__x0_Sallisaw', - 'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his', - 'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable', - 'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson', - 'title_bow__wrath'] + ['city_category__city_London', 'city_category__city_Paris', 'city_category__city_Sallisaw', + 'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his', + 'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable', + 'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson', + 'title_bow__wrath'] >>> column_trans.transform(X).toarray() ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS From 66eb4e6667e54cc261a366409980574160ead3d6 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 18:10:00 +0100 Subject: [PATCH 20/54] use self.input_features_ in get_feature_names if available. 
--- sklearn/pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index e9562926b349a..d6644e67e7f79 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -594,6 +594,9 @@ def get_feature_names(self, input_features=None): feature_names : array-like of string Transformed feature names """ + if input_features is None and hasattr(self, 'input_features_'): + input_features = self.input_features_ + feature_names = input_features for _, name, transform in self._iter(with_final=True): if not hasattr(transform, "get_feature_names"): From 0d8dc704801b1f6c85fab902cf12351e6b72243e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 27 Feb 2019 18:20:25 +0100 Subject: [PATCH 21/54] ignore logreg deprecations --- sklearn/tests/test_pipeline.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index c2ffcb2b0e703..302e87380c80a 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1051,6 +1051,7 @@ def test_make_pipeline_memory(): shutil.rmtree(cachedir) +@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 def test_set_input_features(): pipe = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), @@ -1069,6 +1070,7 @@ def test_set_input_features(): np.array(iris.feature_names)[mask]) +@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 def test_input_feature_names_pandas(): pd = pytest.importorskip("pandas") pipe = Pipeline(steps=[ @@ -1084,6 +1086,7 @@ def test_input_feature_names_pandas(): np.array(iris.feature_names)[mask]) +@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 def test_input_features_passthrough(): pipe = Pipeline(steps=[ ('imputer', 'passthrough'), @@ -1099,6 +1102,7 @@ def test_input_features_passthrough(): iris.feature_names) +@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 def 
test_input_features_count_vectorizer(): pipe = Pipeline(steps=[ ('vect', CountVectorizer()), From 7550aacbf2ccb4450a400d0809fc163fbb922b22 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 13:49:19 +0100 Subject: [PATCH 22/54] remove set_feature_names, reuse get_feature_names Add more test. General meta-estimators not working yet. --- doc/modules/compose.rst | 4 ++-- sklearn/pipeline.py | 27 +++++---------------- sklearn/tests/test_pipeline.py | 44 +++++++++++++++++++++++++++++++--- 3 files changed, 49 insertions(+), 26 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index af84a1e95b4e6..a7cde459aea1e 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -134,9 +134,9 @@ features are transformed during a pipeline:: array(['x2', 'x3'], dtype='>> pipe.set_feature_names(iris.feature_names) + >>> pipe.get_feature_names(iris.feature_names) >>> pipe.named_steps.select.input_features_ ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] >>> pipe.named_steps.clf.input_features_ diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index d6644e67e7f79..30688facb6f36 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -291,7 +291,7 @@ def fit(self, X, y=None, **fit_params): Xt, fit_params = self._fit(X, y, **fit_params) if self._final_estimator != 'passthrough': self._final_estimator.fit(Xt, y, **fit_params) - self.set_feature_names(_get_feature_names(X)) + self.get_feature_names(_get_feature_names(X)) return self @@ -329,7 +329,7 @@ def fit_transform(self, X, y=None, **fit_params): elif last_step != 'passthrough': Xt = last_step.fit(Xt, y, **fit_params).transform(Xt) - self.set_feature_names(_get_feature_names(X)) + self.get_feature_names(_get_feature_names(X)) return Xt @@ -546,7 +546,7 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) - def set_feature_names(self, input_features): + def 
get_feature_names(self, input_features): """Set the input feature names for all steps. Sets the input_features_ attribute on the pipeline and @@ -565,7 +565,7 @@ def set_feature_names(self, input_features): """ self.input_features_ = input_features feature_names = input_features - for _, name, transform in self._iter(with_final=False): + for _, name, transform in self._iter(with_final=True): transform.input_features_ = feature_names try: feature_names = transform.get_feature_names( @@ -574,10 +574,9 @@ def set_feature_names(self, input_features): feature_names = transform.get_feature_names() except AttributeError: feature_names = None - if self._final_estimator != "passthrough": - self._final_estimator.input_features_ = feature_names + return feature_names - def get_feature_names(self, input_features=None): + # def get_feature_names(self, input_features=None): """Get feature names for transformation. Transform input features using the pipeline. @@ -594,20 +593,6 @@ def get_feature_names(self, input_features=None): feature_names : array-like of string Transformed feature names """ - if input_features is None and hasattr(self, 'input_features_'): - input_features = self.input_features_ - - feature_names = input_features - for _, name, transform in self._iter(with_final=True): - if not hasattr(transform, "get_feature_names"): - raise TypeError("Transformer {} does provide" - " get_feature_names".format(name)) - try: - feature_names = transform.get_feature_names( - input_features=feature_names) - except TypeError: - feature_names = transform.get_feature_names() - return feature_names def _name_estimators(estimators): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 302e87380c80a..86cc07a9caf89 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -24,6 +24,7 @@ from sklearn.svm import SVC from sklearn.linear_model import LogisticRegression, Lasso from sklearn.linear_model import LinearRegression +from 
sklearn.multiclass import OneVsRestClassifier from sklearn.cluster import KMeans from sklearn.feature_selection import SelectKBest, f_classif from sklearn.dummy import DummyRegressor @@ -1064,7 +1065,9 @@ def test_set_input_features(): assert_array_equal(pipe.input_features_, xs) mask = pipe.named_steps.select.get_support() assert_array_equal(pipe.named_steps.clf.input_features_, xs[mask]) - pipe.set_feature_names(iris.feature_names) + res = pipe.get_feature_names(iris.feature_names) + # LogisticRegression doesn't have get_feature_names + assert res is None assert_array_equal(pipe.input_features_, iris.feature_names) assert_array_equal(pipe.named_steps.clf.input_features_, np.array(iris.feature_names)[mask]) @@ -1097,7 +1100,7 @@ def test_input_features_passthrough(): pipe.fit(iris.data, iris.target) xs = ['x0', 'x1', 'x2', 'x3'] assert_array_equal(pipe.named_steps.clf.input_features_, xs) - pipe.set_feature_names(iris.feature_names) + pipe.get_feature_names(iris.feature_names) assert_array_equal(pipe.named_steps.clf.input_features_, iris.feature_names) @@ -1111,6 +1114,41 @@ def test_input_features_count_vectorizer(): pipe.fit(JUNK_FOOD_DOCS, y) assert_array_equal(pipe.named_steps.clf.input_features_, ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) - pipe.set_feature_names(["nonsense_is_ignored"]) + pipe.get_feature_names(["nonsense_is_ignored"]) assert_array_equal(pipe.named_steps.clf.input_features_, ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) + +@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +def test_input_features_nested(): + pipe = Pipeline(steps=[ + ('inner_pipe', Pipeline(steps=[('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]))]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + mask = pipe.named_steps.inner_pipe.named_steps.select.get_support() + 
assert_array_equal(pipe.named_steps.inner_pipe.named_steps.clf.input_features_, xs[mask]) + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(pipe.named_steps.inner_pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + + +@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +def test_input_features_meta(): + ovr = OneVsRestClassifier(Pipeline(steps=[('select', SelectKBest(k=2)), + ('clf', LogisticRegression())])) + pipe = Pipeline(steps=[('ovr', ovr)]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + # check 0ths estimator in OVR only + inner_pipe = pipe.named_steps.ovr.estimators_[0] + mask = inner_pipe.named_steps.select.get_support() + assert_array_equal(inner_pipe.named_steps.clf.input_features_, xs[mask]) + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(inner_pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) \ No newline at end of file From 4287cb843baa7a93284a54ab6985f42438a205ca Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 14:19:07 +0100 Subject: [PATCH 23/54] slightly easier to debug get_feature_names recursion, better test --- sklearn/pipeline.py | 16 ++++++++++------ sklearn/tests/test_pipeline.py | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 30688facb6f36..48a4baa33c307 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -567,12 +567,16 @@ def get_feature_names(self, input_features): feature_names = input_features for _, name, transform in self._iter(with_final=True): transform.input_features_ = feature_names - try: - feature_names = transform.get_feature_names( - input_features=feature_names) - except TypeError: - 
feature_names = transform.get_feature_names() - except AttributeError: + if hasattr(transform, "get_feature_names"): + # doing hasattr instead of a try-except on everything + # b/c catching AttributeError makes recursive code + # impossible to debug + try: + feature_names = transform.get_feature_names( + input_features=feature_names) + except TypeError: + feature_names = transform.get_feature_names() + else: feature_names = None return feature_names diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 86cc07a9caf89..b39242fcc77e3 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1150,5 +1150,6 @@ def test_input_features_meta(): assert_array_equal(inner_pipe.named_steps.clf.input_features_, xs[mask]) pipe.get_feature_names(iris.feature_names) assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(inner_pipe.input_features_, iris.feature_names) assert_array_equal(inner_pipe.named_steps.clf.input_features_, np.array(iris.feature_names)[mask]) \ No newline at end of file From eb78eac0d5aceaa59acf8c2b97ecb627e2fe7f04 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 14:21:02 +0100 Subject: [PATCH 24/54] really ugly stuff to make the last 1% usecase work --- sklearn/base.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index 682bc75c9d5cd..e33c933e1d3f1 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -6,11 +6,15 @@ import copy import warnings from collections import defaultdict +from collections.abc import Iterable + import inspect import numpy as np from . 
import __version__ +from .exceptions import NotFittedError + from sklearn.utils import _IS_32BIT _DEFAULT_TAGS = { @@ -573,10 +577,40 @@ def get_feature_names(self, input_features=None): " input feature names for {}".format(self)) +def _get_sub_estimators(est, fitted_only=True): + attrs = [getattr(est, x, None) for x in dir(est) if not x.startswith("_")] + + def _recurse_sub_ests(candidates): + sub_ests = [] + for a in candidates: + if hasattr(a, "set_params") and hasattr(a, "fit"): + sub_ests.append(a) + elif isinstance(a, Iterable) and not isinstance(a, str): + sub_ests.extend(_recurse_sub_ests(a)) + return sub_ests + return list(set(_recurse_sub_ests(attrs))) + + class MetaEstimatorMixin: _required_parameters = ["estimator"] """Mixin class for all meta estimators in scikit-learn.""" + def get_feature_names(self, input_features=None): + sub_ests = _get_sub_estimators(self) + for est in sub_ests: + if hasattr(est, "get_feature_names"): + # doing hasattr instead of a try-except on everything + # b/c catching AttributeError makes recursive code + # impossible to debug + try: + est.get_feature_names(input_features=input_features) + except TypeError: + # do we need this? 
+ est.get_feature_names() + except NotFittedError: + pass + print("done recursing") + class MultiOutputMixin(object): """Mixin to mark estimators that support multioutput.""" From d373b87e6df01248edc028c013c2372f908a18fa Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 14:24:49 +0100 Subject: [PATCH 25/54] barh instead of bar in example --- examples/feature_selection/plot_feature_selection_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index 5b470af376535..466501b005b46 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -35,6 +35,6 @@ print(classification_report(y_test, y_pred)) # access and plot the coefficients of the fitted model -plt.bar((0, 1, 2), anova_svm.named_steps.svc.coef_.ravel()) -plt.xticks((0, 1, 2), anova_svm.named_steps.svc.input_features_) +plt.barh((0, 1, 2), anova_svm.named_steps.svc.coef_.ravel()) +plt.yticks((0, 1, 2), anova_svm.named_steps.svc.input_features_) plt.show() From 4d4e6c6cf3b1b82cdb7dc9542768efebf12bcb9c Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 15:20:25 +0100 Subject: [PATCH 26/54] test "simple" nested meta-estimator --- sklearn/base.py | 2 +- sklearn/tests/test_pipeline.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 951622a822470..f07a49efd1fa3 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -599,6 +599,7 @@ class MetaEstimatorMixin: def get_feature_names(self, input_features=None): sub_ests = _get_sub_estimators(self) for est in sub_ests: + est.input_features_ = input_features if hasattr(est, "get_feature_names"): # doing hasattr instead of a try-except on everything # b/c catching AttributeError makes recursive code @@ -610,7 +611,6 @@ 
def get_feature_names(self, input_features=None): est.get_feature_names() except NotFittedError: pass - print("done recursing") class MultiOutputMixin(object): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index b39242fcc77e3..9d3bd701c438e 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1136,7 +1136,7 @@ def test_input_features_nested(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 -def test_input_features_meta(): +def test_input_features_meta_pipe(): ovr = OneVsRestClassifier(Pipeline(steps=[('select', SelectKBest(k=2)), ('clf', LogisticRegression())])) pipe = Pipeline(steps=[('ovr', ovr)]) @@ -1152,4 +1152,22 @@ def test_input_features_meta(): assert_array_equal(pipe.input_features_, iris.feature_names) assert_array_equal(inner_pipe.input_features_, iris.feature_names) assert_array_equal(inner_pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + + +@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +def test_input_features_meta(): + ovr = OneVsRestClassifier(LogisticRegression()) + pipe = Pipeline(steps=[('select', SelectKBest(k=2)), ('ovr', ovr)]) + iris = load_iris() + pipe.fit(iris.data, iris.target) + xs = np.array(['x0', 'x1', 'x2', 'x3']) + assert_array_equal(pipe.input_features_, xs) + # check 0ths estimator in OVR only + one_logreg = pipe.named_steps.ovr.estimators_[0] + mask = pipe.named_steps.select.get_support() + assert_array_equal(one_logreg.input_features_, xs[mask]) + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(one_logreg.input_features_, np.array(iris.feature_names)[mask]) \ No newline at end of file From 003fcf3b6d89f4bc94d9517459a944c81de4ea2f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 15:25:02 +0100 Subject: [PATCH 27/54] allow None in pipelines get_feature_names, don't overwrite --- 
sklearn/pipeline.py | 9 ++++++--- sklearn/tests/test_pipeline.py | 7 ++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 48a4baa33c307..0a9d239604d1f 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -546,7 +546,7 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) - def get_feature_names(self, input_features): + def get_feature_names(self, input_features=None): """Set the input feature names for all steps. Sets the input_features_ attribute on the pipeline and @@ -563,8 +563,11 @@ def get_feature_names(self, input_features): of the pipeline. """ - self.input_features_ = input_features - feature_names = input_features + if input_features is not None: + self.input_features_ = input_features + if self.input_features_ is None: + raise ValueError("No feature names provided and none stored.") + feature_names = self.input_features_ for _, name, transform in self._iter(with_final=True): transform.input_features_ = feature_names if hasattr(transform, "get_feature_names"): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 9d3bd701c438e..4691fda7a7c4a 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1071,7 +1071,12 @@ def test_set_input_features(): assert_array_equal(pipe.input_features_, iris.feature_names) assert_array_equal(pipe.named_steps.clf.input_features_, np.array(iris.feature_names)[mask]) - + # check that empty get_feature_names() doesn't overwrite + res = pipe.get_feature_names() + assert res is None + assert_array_equal(pipe.input_features_, iris.feature_names) + assert_array_equal(pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 def test_input_feature_names_pandas(): From f185af3c49c2c90bc8a3a0f5efcd37283fe7e792 Mon Sep 17 00:00:00 2001 
From: Andreas Mueller Date: Thu, 28 Feb 2019 15:29:05 +0100 Subject: [PATCH 28/54] nicer error on not fitted pipeline --- sklearn/pipeline.py | 5 +++-- sklearn/tests/test_pipeline.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 0a9d239604d1f..dbff87f5bd5e2 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -16,6 +16,7 @@ from scipy import sparse from .base import clone, TransformerMixin +from .exceptions import NotFittedError from .utils._joblib import Parallel, delayed from .utils.metaestimators import if_delegate_has_method from .utils import Bunch @@ -565,8 +566,8 @@ def get_feature_names(self, input_features=None): """ if input_features is not None: self.input_features_ = input_features - if self.input_features_ is None: - raise ValueError("No feature names provided and none stored.") + if getattr(self, 'input_features_', None) is None: + raise NotFittedError("Estimator Pipeline not fitted.") feature_names = self.input_features_ for _, name, transform in self._iter(with_final=True): transform.input_features_ = feature_names diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 4691fda7a7c4a..eecf1ea49beae 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -20,6 +20,7 @@ from sklearn.utils.testing import assert_no_warnings from sklearn.base import clone, BaseEstimator +from sklearn.exceptions import NotFittedError from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union from sklearn.svm import SVC from sklearn.linear_model import LogisticRegression, Lasso @@ -1059,6 +1060,7 @@ def test_set_input_features(): ('scaler', StandardScaler()), ('select', SelectKBest(k=2)), ('clf', LogisticRegression())]) + assert_raises(NotFittedError, pipe.get_feature_names) iris = load_iris() pipe.fit(iris.data, iris.target) xs = np.array(['x0', 'x1', 'x2', 'x3']) From acc4c76490fad08c672760b1718d5d3d37c768cc Mon Sep 17 
00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 15:31:37 +0100 Subject: [PATCH 29/54] flake8 --- sklearn/tests/test_pipeline.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index eecf1ea49beae..7f007b4640a5a 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1080,6 +1080,7 @@ def test_set_input_features(): assert_array_equal(pipe.named_steps.clf.input_features_, np.array(iris.feature_names)[mask]) + @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 def test_input_feature_names_pandas(): pd = pytest.importorskip("pandas") @@ -1125,6 +1126,7 @@ def test_input_features_count_vectorizer(): assert_array_equal(pipe.named_steps.clf.input_features_, ['beer', 'burger', 'coke', 'copyright', 'pizza', 'the']) + @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 def test_input_features_nested(): pipe = Pipeline(steps=[ @@ -1135,13 +1137,15 @@ def test_input_features_nested(): xs = np.array(['x0', 'x1', 'x2', 'x3']) assert_array_equal(pipe.input_features_, xs) mask = pipe.named_steps.inner_pipe.named_steps.select.get_support() - assert_array_equal(pipe.named_steps.inner_pipe.named_steps.clf.input_features_, xs[mask]) + assert_array_equal( + pipe.named_steps.inner_pipe.named_steps.clf.input_features_, xs[mask]) pipe.get_feature_names(iris.feature_names) assert_array_equal(pipe.input_features_, iris.feature_names) - assert_array_equal(pipe.named_steps.inner_pipe.named_steps.clf.input_features_, - np.array(iris.feature_names)[mask]) + assert_array_equal( + pipe.named_steps.inner_pipe.named_steps.clf.input_features_, + np.array(iris.feature_names)[mask]) + - @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 def test_input_features_meta_pipe(): ovr = OneVsRestClassifier(Pipeline(steps=[('select', SelectKBest(k=2)), @@ -1160,7 +1164,7 @@ def 
test_input_features_meta_pipe(): assert_array_equal(inner_pipe.input_features_, iris.feature_names) assert_array_equal(inner_pipe.named_steps.clf.input_features_, np.array(iris.feature_names)[mask]) - + @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 def test_input_features_meta(): @@ -1177,4 +1181,4 @@ def test_input_features_meta(): pipe.get_feature_names(iris.feature_names) assert_array_equal(pipe.input_features_, iris.feature_names) assert_array_equal(one_logreg.input_features_, - np.array(iris.feature_names)[mask]) \ No newline at end of file + np.array(iris.feature_names)[mask]) From eef87b638b74b7eac83aa01dcabb7824d046c75a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 15:52:44 +0100 Subject: [PATCH 30/54] better error message, allow call to get_feature_names with None again whoops --- sklearn/impute.py | 1 + sklearn/pipeline.py | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index a201db198c384..210ec67741b2a 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -436,6 +436,7 @@ def get_feature_names(self, input_features=None): feature_names : array-like of string Transformed feature names """ + check_is_fitted(self, 'statistics_') if input_features is None: input_features = ['x%d' % i for i in range(self.statistics_.shape[0])] diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index dbff87f5bd5e2..02fc41e2b5c09 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -16,7 +16,6 @@ from scipy import sparse from .base import clone, TransformerMixin -from .exceptions import NotFittedError from .utils._joblib import Parallel, delayed from .utils.metaestimators import if_delegate_has_method from .utils import Bunch @@ -564,10 +563,9 @@ def get_feature_names(self, input_features=None): of the pipeline. 
""" - if input_features is not None: + if input_features is not None or not hasattr(self, 'input_features_'): self.input_features_ = input_features - if getattr(self, 'input_features_', None) is None: - raise NotFittedError("Estimator Pipeline not fitted.") + feature_names = self.input_features_ for _, name, transform in self._iter(with_final=True): transform.input_features_ = feature_names From 4ed56c81a20c4c182fe39ec934b0ff0d2ec93ff2 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 17:10:31 +0100 Subject: [PATCH 31/54] replace too-smart solution with explicit simple solution for meta-estimators --- sklearn/base.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index f07a49efd1fa3..9754f39b67ab4 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -6,7 +6,6 @@ import copy import warnings from collections import defaultdict -from collections.abc import Iterable import platform import inspect @@ -578,18 +577,14 @@ def get_feature_names(self, input_features=None): " input feature names for {}".format(self)) -def _get_sub_estimators(est, fitted_only=True): - attrs = [getattr(est, x, None) for x in dir(est) if not x.startswith("_")] - - def _recurse_sub_ests(candidates): - sub_ests = [] - for a in candidates: - if hasattr(a, "set_params") and hasattr(a, "fit"): - sub_ests.append(a) - elif isinstance(a, Iterable) and not isinstance(a, str): - sub_ests.extend(_recurse_sub_ests(a)) - return sub_ests - return list(set(_recurse_sub_ests(attrs))) +def _get_sub_estimators(est): + # Explicitly declare all fitted subestimators of existing meta-estimators + if hasattr(est, "estimator_"): + return [est.estimator_] + if hasattr(est, "base_estimator_"): + return [est.base_estimator_] + if hasattr(est, "estimators_"): + return est.estimators_ class MetaEstimatorMixin: From 8787e0454b19f2922130b82813e975c50b9febb1 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 
17:10:48 +0100 Subject: [PATCH 32/54] convert feature names from pandas to numpy array --- sklearn/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 02fc41e2b5c09..06c5edcfd58a0 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -28,7 +28,7 @@ def _get_feature_names(X): if hasattr(X, 'columns'): - feature_names = X.columns + feature_names = np.array(X.columns) elif getattr(X, 'ndim', 0) > 1: feature_names = getattr(X, 'columns', ['x%d' % i for i in range(X.shape[1])]) From c0575996b180b85f6400a98c9631a36516808bb8 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 17:16:23 +0100 Subject: [PATCH 33/54] Fix get_feature_name docstrings --- sklearn/base.py | 7 +++++++ sklearn/feature_selection/base.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index 9754f39b67ab4..32d88d3de80c4 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -592,6 +592,13 @@ class MetaEstimatorMixin: """Mixin class for all meta estimators in scikit-learn.""" def get_feature_names(self, input_features=None): + """Ensure feature names are set on sub-estimators + + Parameters + ---------- + input_features : list of string or None + Input features to the meta-estimator. + """ sub_ests = _get_sub_estimators(self) for est in sub_ests: est.input_features_ = input_features diff --git a/sklearn/feature_selection/base.py b/sklearn/feature_selection/base.py index e03102989b6a0..b644c2f09673a 100644 --- a/sklearn/feature_selection/base.py +++ b/sklearn/feature_selection/base.py @@ -121,6 +121,14 @@ def inverse_transform(self, X): return Xt def get_feature_names(self, input_features=None): + """Mask feature names according to selected features. + + Parameters + ---------- + input_features : list of string or None + Input features to select from. If none, they are generated as + x0, x1, ..., xn. 
+ """ mask = self.get_support() if input_features is None: input_features = ['x%d' % i From fca9ac292946b73f2463e304630e068de243adf8 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 18:45:14 +0100 Subject: [PATCH 34/54] fix pipeline get_feature_names docstring --- sklearn/pipeline.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 06c5edcfd58a0..7f5efba3aa2c9 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -547,11 +547,12 @@ def _pairwise(self): return getattr(self.steps[0][1], '_pairwise', False) def get_feature_names(self, input_features=None): - """Set the input feature names for all steps. + """Get the feature names for all steps. Sets the input_features_ attribute on the pipeline and on all pipeline steps using the provided input feature names - as input for the first step. + as input for the first step, and returns the output features + if the last step is a transformer. Some estimators like `ColumnTransformer` and `CountVectorizer` might ignore the provided input feature names. @@ -562,6 +563,12 @@ def get_feature_names(self, input_features=None): Feature names to use as input feature names for the first step of the pipeline. + Returns + ------- + feature_names : array-like of string or None + Output feature names of the last step if it is a transformer, + and None otherwise. + """ if input_features is not None or not hasattr(self, 'input_features_'): self.input_features_ = input_features @@ -582,24 +589,6 @@ def get_feature_names(self, input_features=None): feature_names = None return feature_names - # def get_feature_names(self, input_features=None): - """Get feature names for transformation. - - Transform input features using the pipeline. - If the last step is a transformer, it's included - in the transformation, otherwise it's not. - - Parameters - ---------- - input_features : array-like of string - Input feature names. 
- - Returns - ------- - feature_names : array-like of string - Transformed feature names - """ - def _name_estimators(estimators): """Generate names for estimators.""" From d660f9284aafa2e54fa7cc6fd1b15b0cb30f3b47 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 19:13:22 +0100 Subject: [PATCH 35/54] minor fix for meta-estimators with array estimators --- sklearn/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index 32d88d3de80c4..b31ab792eac16 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -600,6 +600,9 @@ def get_feature_names(self, input_features=None): Input features to the meta-estimator. """ sub_ests = _get_sub_estimators(self) + if hasattr(sub_ests, 'shape'): + # Gradient boosting has a 2d array of estimators + sub_ests = sub_ests.ravel() for est in sub_ests: est.input_features_ = input_features if hasattr(est, "get_feature_names"): From a74f4c4b1df93bc00949d994079ca6fad37a5e44 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 28 Feb 2019 19:55:58 +0100 Subject: [PATCH 36/54] ignore more deprecation warnings from logistic --- sklearn/tests/test_pipeline.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 7f007b4640a5a..59cf32b430588 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1054,6 +1054,7 @@ def test_make_pipeline_memory(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_set_input_features(): pipe = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), @@ -1082,6 +1083,7 @@ def test_set_input_features(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_input_feature_names_pandas(): pd = pytest.importorskip("pandas") pipe = 
Pipeline(steps=[ @@ -1098,6 +1100,7 @@ def test_input_feature_names_pandas(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_input_features_passthrough(): pipe = Pipeline(steps=[ ('imputer', 'passthrough'), @@ -1114,6 +1117,7 @@ def test_input_features_passthrough(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_input_features_count_vectorizer(): pipe = Pipeline(steps=[ ('vect', CountVectorizer()), @@ -1128,6 +1132,7 @@ def test_input_features_count_vectorizer(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_input_features_nested(): pipe = Pipeline(steps=[ ('inner_pipe', Pipeline(steps=[('select', SelectKBest(k=2)), @@ -1147,6 +1152,7 @@ def test_input_features_nested(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_input_features_meta_pipe(): ovr = OneVsRestClassifier(Pipeline(steps=[('select', SelectKBest(k=2)), ('clf', LogisticRegression())])) @@ -1167,6 +1173,7 @@ def test_input_features_meta_pipe(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_input_features_meta(): ovr = OneVsRestClassifier(LogisticRegression()) pipe = Pipeline(steps=[('select', SelectKBest(k=2)), ('ovr', ovr)]) From eefe54c9aa8515de56ed2827c53c21498bc5a42b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 13:40:20 +0100 Subject: [PATCH 37/54] refinement of _get_sub_estimators, add crazy test --- sklearn/base.py | 20 ++++++----- sklearn/tests/test_base.py | 71 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 
insertions(+), 8 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index b31ab792eac16..e316e364d23a9 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -579,12 +579,19 @@ def get_feature_names(self, input_features=None): def _get_sub_estimators(est): # Explicitly declare all fitted subestimators of existing meta-estimators - if hasattr(est, "estimator_"): - return [est.estimator_] - if hasattr(est, "base_estimator_"): - return [est.base_estimator_] + sub_ests = [] + # OHE is not really needed + sub_names = ['estimator_', 'base_estimator_', 'one_hot_encoder_'] + for name in sub_names: + sub_est = getattr(est, name, None) + if sub_est is not None: + sub_ests.append(est.estimator_) if hasattr(est, "estimators_"): - return est.estimators_ + if hasattr(est.estimators_, 'shape'): + sub_ests.extend(est.estimators_.ravel()) + else: + sub_ests.extend(est.estimators_) + return sub_ests class MetaEstimatorMixin: @@ -600,9 +607,6 @@ def get_feature_names(self, input_features=None): Input features to the meta-estimator. 
""" sub_ests = _get_sub_estimators(self) - if hasattr(sub_ests, 'shape'): - # Gradient boosting has a 2d array of estimators - sub_ests = sub_ests.ravel() for est in sub_ests: est.input_features_ = input_features if hasattr(est, "get_feature_names"): diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index cf1f9739d6384..1df0e0dd74621 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -486,3 +486,74 @@ def test_tag_inheritance(): diamond_tag_est = DiamondOverwriteTag() with pytest.raises(TypeError, match="Inconsistent values for tag"): diamond_tag_est._get_tags() + + +def test_sub_estimator_consistency(): + # check that _get_sub_estimators finds all fitted sub estimators + # if this breaks, you probably introduced a sub-estimator that's + # non-standard (not estimator_, base_estimator_ or estimators_) + from sklearn.utils.testing import all_estimators + from sklearn.base import (MetaEstimatorMixin, _get_sub_estimators, + ClassifierMixin, RegressorMixin) + + from sklearn.model_selection._search import BaseSearchCV + from sklearn.feature_selection.base import SelectorMixin + from sklearn.datasets import make_blobs + from sklearn.linear_model import Ridge, LogisticRegression + from sklearn.utils.estimator_checks import \ + multioutput_estimator_convert_y_2d + from collections.abc import Iterable + + def has_fitted_attr(est): + attrs = [(x, getattr(est, x, None)) + for x in dir(est) if x.endswith("_") and not x.startswith("__")] + return len(attrs) + + def get_sub_estimators_brute(est): + # recurse through all attributes to get sub-estimators + attrs = [(x, getattr(est, x, None)) + for x in dir(est) if not x.startswith("_")] + + def _recurse_sub_ests(candidates): + sub_ests = [] + for a in candidates: + if hasattr(a, "set_params") and hasattr(a, "fit"): + sub_ests.append(a) + elif isinstance(a, Iterable) and not isinstance(a, str): + sub_ests.extend(_recurse_sub_ests(a)) + return sub_ests + ests = _recurse_sub_ests(attrs) + # 
we don't consider label processors child estimators + return set([e for e in ests if has_fitted_attr(e) + and e.__module__ != "sklearn.preprocessing.label"]) + + al = all_estimators() + mets = [x for x in al if issubclass(x[1], MetaEstimatorMixin)] + + X, y = make_blobs() + others = [] + + for name, Est in mets: + # instantiate and fit + try: + est = Est() + except TypeError: + if issubclass(Est, (ClassifierMixin, SelectorMixin)): + est = Est(LogisticRegression(solver='lbfgs', multi_class='auto')) + elif issubclass(Est, RegressorMixin): + est = Est(Ridge()) + else: + others.append((name, Est)) + if est._get_tags()['_skip_test']: + continue + + y = multioutput_estimator_convert_y_2d(est, y) + est.fit(X, y) + # test recursive sub estimators are the same as result of + #_get_sub_estimators which uses a hard-coded list + assert (set(_get_sub_estimators(est)) == + get_sub_estimators_brute(est)) + + for name, Est in others: + # only things we couldn't instantiate are the search CV + assert issubclass(Est, BaseSearchCV) \ No newline at end of file From ad48edf9a848cff184a5d6d3aa8ab0153f87a68d Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 13:41:04 +0100 Subject: [PATCH 38/54] typo / make crazy test pass --- sklearn/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index e316e364d23a9..1545bca3d692c 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -585,7 +585,7 @@ def _get_sub_estimators(est): for name in sub_names: sub_est = getattr(est, name, None) if sub_est is not None: - sub_ests.append(est.estimator_) + sub_ests.append(sub_est) if hasattr(est, "estimators_"): if hasattr(est.estimators_, 'shape'): sub_ests.extend(est.estimators_.ravel()) From 4bbd8cd9ad865563cf1ff9138db697dc4b01fea6 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 14:36:57 +0100 Subject: [PATCH 39/54] add get_feature_names to TransformerMixin, overwrite in random tree embedding --- sklearn/base.py 
| 19 +++++++++++++++++++ sklearn/ensemble/forest.py | 3 +++ 2 files changed, 22 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index 1545bca3d692c..0bfe219bf2c43 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -502,6 +502,25 @@ def fit_transform(self, X, y=None, **fit_params): # fit method of arity 2 (supervised transformation) return self.fit(X, y, **fit_params).transform(X) + def get_feature_names(self, input_features=None): + # OneToOneMixin is higher in the class hierarchy + # because we put mixins on the wrong side + if hasattr(super(), 'get_feature_names'): + return super().get_feature_names(input_features) + # generate feature names from class name by default + if hasattr(self, 'n_components_'): + # n_components could be auto or None + # this is more likely to be an int + n_features = self.n_components_ + elif hasattr(self, 'n_components') and self.n_components is not None: + n_features = self.n_components + elif hasattr(self, 'components_'): + n_features = self.components_.shape[0] + else: + return None + return ["{}{}".format(str(type(self)).lower(), i) + for i in range(n_features)] + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index aae9dd8c72349..2345366feb89d 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -2025,3 +2025,6 @@ def transform(self, X): """ check_is_fitted(self, 'one_hot_encoder_') return self.one_hot_encoder_.transform(self.apply(X)) + + def get_feature_names(self, input_features=None): + return None \ No newline at end of file From eb9aa528c82dc3cfd173bbdde8f24b0deb3eb5a9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 14:53:51 +0100 Subject: [PATCH 40/54] fix docstrings --- sklearn/base.py | 12 ++++++++++++ sklearn/ensemble/forest.py | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index 0bfe219bf2c43..8bdfdcca5d3bc 100644 
--- a/sklearn/base.py +++ b/sklearn/base.py @@ -503,6 +503,18 @@ def fit_transform(self, X, y=None, **fit_params): return self.fit(X, y, **fit_params).transform(X) def get_feature_names(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : list of string or None + String names of the input features. + + Returns + ------- + output_feature_names : list of string + Feature names for transformer output. + """ # OneToOneMixin is higher in the class hierarchy # because we put mixins on the wrong side if hasattr(super(), 'get_feature_names'): diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 2345366feb89d..97fccce4913ae 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -2027,4 +2027,10 @@ def transform(self, X): return self.one_hot_encoder_.transform(self.apply(X)) def get_feature_names(self, input_features=None): + """Feature names - not implemented yet. + + Parameters + ---------- + input_features : list of strings or None + """ return None \ No newline at end of file From fe4a02070dc8e7d526bd0c3a8b0cc787e2d8df5a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 14:59:52 +0100 Subject: [PATCH 41/54] add "init_" and "best_estimator_" to list of sub estimators --- sklearn/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 8bdfdcca5d3bc..e76a4db10bb1c 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -612,7 +612,8 @@ def _get_sub_estimators(est): # Explicitly declare all fitted subestimators of existing meta-estimators sub_ests = [] # OHE is not really needed - sub_names = ['estimator_', 'base_estimator_', 'one_hot_encoder_'] + sub_names = ['estimator_', 'base_estimator_', 'one_hot_encoder_', + 'best_estimator_', 'init_'] for name in sub_names: sub_est = getattr(est, name, None) if sub_est is not None: From 750906bdc5282610d78e93621a4fe4f7eb131d16 Mon Sep 17 00:00:00 2001 From: Andreas 
Mueller Date: Fri, 1 Mar 2019 15:03:36 +0100 Subject: [PATCH 42/54] pep8 --- sklearn/base.py | 6 +++--- sklearn/tests/test_base.py | 16 +++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index e76a4db10bb1c..eec1a1d9c4218 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -504,11 +504,11 @@ def fit_transform(self, X, y=None, **fit_params): def get_feature_names(self, input_features=None): """Get output feature names. - + Parameters ---------- input_features : list of string or None - String names of the input features. + String names of the input features. Returns ------- @@ -621,7 +621,7 @@ def _get_sub_estimators(est): if hasattr(est, "estimators_"): if hasattr(est.estimators_, 'shape'): sub_ests.extend(est.estimators_.ravel()) - else: + else: sub_ests.extend(est.estimators_) return sub_ests diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 1df0e0dd74621..1ef88436113ef 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -495,7 +495,7 @@ def test_sub_estimator_consistency(): from sklearn.utils.testing import all_estimators from sklearn.base import (MetaEstimatorMixin, _get_sub_estimators, ClassifierMixin, RegressorMixin) - + from sklearn.model_selection._search import BaseSearchCV from sklearn.feature_selection.base import SelectorMixin from sklearn.datasets import make_blobs @@ -503,10 +503,11 @@ def test_sub_estimator_consistency(): from sklearn.utils.estimator_checks import \ multioutput_estimator_convert_y_2d from collections.abc import Iterable - + def has_fitted_attr(est): attrs = [(x, getattr(est, x, None)) - for x in dir(est) if x.endswith("_") and not x.startswith("__")] + for x in dir(est) if x.endswith("_") + and not x.startswith("__")] return len(attrs) def get_sub_estimators_brute(est): @@ -529,7 +530,7 @@ def _recurse_sub_ests(candidates): al = all_estimators() mets = [x for x in al if issubclass(x[1], MetaEstimatorMixin)] - + X, y = 
make_blobs() others = [] @@ -539,7 +540,8 @@ def _recurse_sub_ests(candidates): est = Est() except TypeError: if issubclass(Est, (ClassifierMixin, SelectorMixin)): - est = Est(LogisticRegression(solver='lbfgs', multi_class='auto')) + est = Est(LogisticRegression(solver='lbfgs', + multi_class='auto')) elif issubclass(Est, RegressorMixin): est = Est(Ridge()) else: @@ -550,10 +552,10 @@ def _recurse_sub_ests(candidates): y = multioutput_estimator_convert_y_2d(est, y) est.fit(X, y) # test recursive sub estimators are the same as result of - #_get_sub_estimators which uses a hard-coded list + # _get_sub_estimators which uses a hard-coded list assert (set(_get_sub_estimators(est)) == get_sub_estimators_brute(est)) for name, Est in others: # only things we couldn't instantiate are the search CV - assert issubclass(Est, BaseSearchCV) \ No newline at end of file + assert issubclass(Est, BaseSearchCV) From 0ca6e9d1dc90949e2c04aab0f4d445306aba5f65 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 15:13:40 +0100 Subject: [PATCH 43/54] fix class name formatting, add test for pca feature names in pipeline --- sklearn/base.py | 2 +- sklearn/tests/test_pipeline.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index eec1a1d9c4218..865af6a7cc1a9 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -530,7 +530,7 @@ def get_feature_names(self, input_features=None): n_features = self.components_.shape[0] else: return None - return ["{}{}".format(str(type(self)).lower(), i) + return ["{}{}".format(type(self).__name__.lower(), i) for i in range(n_features)] diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 59cf32b430588..65660332a3976 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1080,6 +1080,18 @@ def test_set_input_features(): assert_array_equal(pipe.input_features_, iris.feature_names) 
assert_array_equal(pipe.named_steps.clf.input_features_, np.array(iris.feature_names)[mask]) + pipe = Pipeline(steps=[ + ('scaler', StandardScaler()), + ('pca', PCA(n_components=3)), + ('select', SelectKBest(k=2)), + ('clf', LogisticRegression())]) + pipe.fit(iris.data, iris.target) + assert_array_equal(pipe.named_steps.clf.input_features_, ['pca0', 'pca1']) + # setting names doesn't change names after PCA + pipe.get_feature_names(iris.feature_names) + assert_array_equal(pipe.named_steps.select.input_features_, + ['pca0', 'pca1', 'pca2']) + @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 From 7cd3dd0b4153bb4f5e1b07cdfa8cfff635bd90b8 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 15:40:53 +0100 Subject: [PATCH 44/54] ignore warnings from changing init parameters --- sklearn/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 1ef88436113ef..80953df843bbb 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -487,7 +487,7 @@ def test_tag_inheritance(): with pytest.raises(TypeError, match="Inconsistent values for tag"): diamond_tag_est._get_tags() - +@ignore_warnings(category=(FutureWarning, DeprecationWarning)) def test_sub_estimator_consistency(): # check that _get_sub_estimators finds all fitted sub estimators # if this breaks, you probably introduced a sub-estimator that's From a85ab5e0af8f41a2387c67f4cac22ca37629e50c Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 15:43:22 +0100 Subject: [PATCH 45/54] common test for feature name length --- sklearn/base.py | 13 ++++++++++++- sklearn/utils/estimator_checks.py | 8 ++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 865af6a7cc1a9..47855558af565 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -520,7 +520,18 @@ def get_feature_names(self, input_features=None): if 
hasattr(super(), 'get_feature_names'): return super().get_feature_names(input_features) # generate feature names from class name by default - if hasattr(self, 'n_components_'): + # would be much less guessing if we stored the number + # of output features. + # Ideally this would be done in each class. + if hasattr(self, 'n_clusters'): + # this is before n_components_ + # because n_components_ means something else + # in agglomerative clustering + n_features = self.n_clusters + elif hasattr(self, '_max_components'): + # special case for LinearDiscriminantAnalysis + n_features = min(self._max_components, self.n_components) + elif hasattr(self, 'n_components_'): # n_components could be auto or None # this is more likely to be an int n_features = self.n_components_ diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 433fe8fabd6f9..f97ba0e9c97f0 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -991,6 +991,14 @@ def _check_transformer(name, transformer_orig, X, y): transformer_clone = clone(transformer) X_pred = transformer_clone.fit_transform(X, y=y_) + input_features = ['feature%d' % i for i in range(n_features)] + feature_names = transformer_clone.get_feature_names(input_features) + if feature_names is not None: + if isinstance(X_pred, tuple): + assert len(feature_names) == X_pred[0].shape[1] + else: + assert len(feature_names) == X_pred.shape[1] + if isinstance(X_pred, tuple): for x_pred in X_pred: assert_equal(x_pred.shape[0], n_samples) From 8001cdbabbedc569de0a607e33f8cb93c0889066 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 15:51:30 +0100 Subject: [PATCH 46/54] renamed one hot encoder for more intuitive feature names --- doc/modules/compose.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index a7cde459aea1e..152ae182e37a1 100644 --- a/doc/modules/compose.rst +++ 
b/doc/modules/compose.rst @@ -447,7 +447,7 @@ By default, the remaining rating columns are ignored (``remainder='drop'``):: >>> from sklearn.feature_extraction.text import CountVectorizer >>> from sklearn.preprocessing import OneHotEncoder >>> column_trans = ColumnTransformer( - ... [('city_category', OneHotEncoder(dtype='int'),['city']), + ... [('categories', OneHotEncoder(dtype='int'),['city']), ... ('title_bow', CountVectorizer(), 'title')], ... remainder='drop') @@ -458,11 +458,11 @@ By default, the remaining rating columns are ignored (``remainder='drop'``):: >>> column_trans.get_feature_names() ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - ['city_category__city_London', 'city_category__city_Paris', 'city_category__city_Sallisaw', - 'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his', - 'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable', - 'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson', - 'title_bow__wrath'] + ['categories__city_London', 'categories__city_Paris', + 'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast', + 'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last', + 'title_bow__learned', 'title_bow__moveable', 'title_bow__of', 'title_bow__the', + 'title_bow__trick', 'title_bow__watson', 'title_bow__wrath'] >>> column_trans.transform(X).toarray() ... 
# doctest: +NORMALIZE_WHITESPACE +ELLIPSIS From fa00af0d6f02dda6cab3f4006d8c7e37334e9f01 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 16:59:59 +0100 Subject: [PATCH 47/54] LDA Special case fixes --- sklearn/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 47855558af565..f5bf59bb8d58e 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -530,7 +530,8 @@ def get_feature_names(self, input_features=None): n_features = self.n_clusters elif hasattr(self, '_max_components'): # special case for LinearDiscriminantAnalysis - n_features = min(self._max_components, self.n_components) + n_components = self.n_components or np.inf + n_features = min(self._max_components, n_components) elif hasattr(self, 'n_components_'): # n_components could be auto or None # this is more likely to be an int From ccfc971cca047e3fb9a9543ba1997482d3a3883e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Mar 2019 17:51:41 +0100 Subject: [PATCH 48/54] only check feature names if they exist to be nice to contrib estimators --- sklearn/utils/estimator_checks.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f97ba0e9c97f0..fee446adc7420 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -992,12 +992,13 @@ def _check_transformer(name, transformer_orig, X, y): X_pred = transformer_clone.fit_transform(X, y=y_) input_features = ['feature%d' % i for i in range(n_features)] - feature_names = transformer_clone.get_feature_names(input_features) - if feature_names is not None: - if isinstance(X_pred, tuple): - assert len(feature_names) == X_pred[0].shape[1] - else: - assert len(feature_names) == X_pred.shape[1] + if hasattr(transformer_clone, 'get_feature_names'): + feature_names = transformer_clone.get_feature_names(input_features) + if feature_names is not None: + 
if isinstance(X_pred, tuple): + assert len(feature_names) == X_pred[0].shape[1] + else: + assert len(feature_names) == X_pred.shape[1] if isinstance(X_pred, tuple): for x_pred in X_pred: From 2dae33925bad9328ba5a621fe0dfff7d357b4e7e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 2 Mar 2019 00:16:30 +0100 Subject: [PATCH 49/54] hackety hack --- sklearn/compose/tests/test_column_transformer.py | 13 +++++++++++++ sklearn/pipeline.py | 8 +++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index a8a1cbea8e524..05ebf4a216a3a 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -19,6 +19,7 @@ from sklearn.exceptions import NotFittedError from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder from sklearn.feature_extraction import DictVectorizer +from sklearn.pipeline import make_pipeline class Trans(BaseEstimator): @@ -658,6 +659,18 @@ def test_column_transformer_get_feature_names(): assert_raise_message(AttributeError, "Transformer trans (type Trans) does not provide " "get_feature_names", ct.get_feature_names) + + # if some transformers support and some don't + ct = ColumnTransformer([('trans', Trans(), [0, 1]), + ('scale', StandardScaler(), [0])]) + ct.fit(X_array) + assert_raise_message(AttributeError, + "Transformer trans (type Trans) does not provide " + "get_feature_names", ct.get_feature_names) + + # inside a pipeline + make_pipeline(ct).fit(X_array) + # working example X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 7f5efba3aa2c9..04e35ebd299a7 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -576,16 +576,14 @@ def get_feature_names(self, input_features=None): feature_names = self.input_features_ for _, name, transform in self._iter(with_final=True): 
transform.input_features_ = feature_names - if hasattr(transform, "get_feature_names"): - # doing hassattr instead of a try-except on everything - # b/c catching AttributeError makes recursive code - # impossible to debug + try: try: feature_names = transform.get_feature_names( input_features=feature_names) except TypeError: feature_names = transform.get_feature_names() - else: + except AttributeError: + # this can come from inside a meta-estimator feature_names = None return feature_names From 534c4eddd65abae91cb5624276d0044100e447d8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 6 Mar 2019 17:36:24 +0100 Subject: [PATCH 50/54] Better titanic interpretation --- .../plot_column_transformer_mixed_types.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index ef000c4a0077e..405661ff73f22 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -86,8 +86,8 @@ # The coefficients of the final classification step of the pipeline gives an # idea how each feature impacts the likelihood of survival assuming that the # usual linear model assumptions hold (uncorrelated features, linear -# separability, homoschedastic and normally distributed errors...) which we do -# not verify in this example. +# separability, homoschedastic errors...) which we do not verify in this +# example. # # To get error bars we perform cross-validation and compute the mean and # standard deviation for each coefficient accross CV splits. 
Because we use a @@ -99,13 +99,21 @@ # # We can see that the linear model coefficients are in agreement with the # historical reports: people in higher classes and therefore in the upper decks -# were first to access the lifeboats, and often, priority was given to women +# were the first to reach the lifeboats, and often, priority was given to women # and children. +# +# Note that conditionned on the "pclass_x" one-hot features, the "fare" +# numerical feature does not seem to be significantly predictive. If we drop +# the "pclass" feature, then higher "fare" values would appear significantly +# correlated with a higher likelihood of survival as the "fare" and "pclass" +# features have a strong statistical dependency. import matplotlib.pyplot as plt from sklearn.model_selection import cross_validate +from sklearn.model_selection import StratifiedShuffleSplit -cv_results = cross_validate(pipeline, X_train, y_train, cv=10, +cv = StratifiedShuffleSplit(n_splits=20, test_size=0.25, random_state=42) +cv_results = cross_validate(pipeline, X_train, y_train, cv=cv, return_estimator=True) cv_coefs = np.concatenate([cv_pipeline.named_steps["classifier"].coef_ for cv_pipeline in cv_results["estimator"]]) From dc1c349cc9e21051914aa46848ad2c58e8de6101 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 7 Mar 2019 17:46:35 +0100 Subject: [PATCH 51/54] Phrasing in example --- examples/compose/plot_column_transformer_mixed_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 405661ff73f22..71551856ac35b 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -81,7 +81,7 @@ ############################################################################### -# Introspecting the coefficients values of the classifier +# Inspecting the coefficients values of the 
classifier ############################################################################### # The coefficients of the final classification step of the pipeline gives an # idea how each feature impacts the likelihood of survival assuming that the From 089c65dc4d384b4cf4425c334e485d7184a47bac Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Thu, 7 Mar 2019 13:47:19 -0500 Subject: [PATCH 52/54] Apply suggestions from code review minor doc fixes by adrin Co-Authored-By: amueller --- doc/modules/compose.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 152ae182e37a1..4d47737889330 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -116,7 +116,7 @@ ignored by setting them to ``'passthrough'``:: >>> grid_search = GridSearchCV(pipe, param_grid=param_grid) -To enable model inspection, `Pipeline` sets a ``input_features_`` attribute on +To enable model inspection, `Pipeline` sets an ``input_features_`` attribute on all pipeline steps during fitting. 
This allows the user to understand how features are transformed during a pipeline:: @@ -134,7 +134,7 @@ features are transformed during a pipeline:: array(['x2', 'x3'], dtype='>> pipe.get_feature_names(iris.feature_names) >>> pipe.named_steps.select.input_features_ From 4c17e96e33e043be248258300d5819a418e07377 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 31 May 2019 11:04:09 -0400 Subject: [PATCH 53/54] fix merge issue --- sklearn/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 072e452dea9d6..97ddde7333641 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -628,7 +628,7 @@ def get_feature_names(self, input_features=None): """ feature_names = input_features with_final = hasattr(self._final_estimator, "transform") - for name, transform in self._iter(with_final=with_final): + for i, name, transform in self._iter(with_final=with_final): if not hasattr(transform, "get_feature_names"): raise TypeError("Transformer {} does provide" " get_feature_names".format(name)) From 2733d20036dd993b07b50bad66f175278427b44d Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 31 May 2019 11:19:39 -0400 Subject: [PATCH 54/54] fix impute feature names after file was moved. 
merging fun --- sklearn/impute/_base.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 7be9da691ce11..6b63c4529c06a 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -273,7 +273,8 @@ def fit(self, X, y=None): self.indicator_.fit(X) else: self.indicator_ = None - + invalid_mask = _get_mask(self.statistics_, np.nan) + self._valid_mask = np.logical_not(invalid_mask) return self def _sparse_fit(self, X, strategy, missing_values, fill_value): @@ -433,6 +434,25 @@ def transform(self, X): def _more_tags(self): return {'allow_nan': True} + def get_feature_names(self, input_features=None): + """Get feature names for transformation. + + Parameters + ---------- + input_features : array-like of string + Input feature names. + + Returns + ------- + feature_names : array-like of string + Transformed feature names + """ + check_is_fitted(self, 'statistics_') + if input_features is None: + input_features = ['x%d' % i + for i in range(self.statistics_.shape[0])] + return np.array(input_features)[self._valid_mask] + class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values.