diff --git a/doc/data_transforms.rst b/doc/data_transforms.rst index e861762891ecc..744791396cc90 100644 --- a/doc/data_transforms.rst +++ b/doc/data_transforms.rst @@ -33,3 +33,4 @@ scikit-learn. modules/kernel_approximation modules/metrics modules/preprocessing_targets + modules/freeze diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 128f1c85f13e2..cf1fd48b2cb13 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -537,6 +537,25 @@ From text feature_selection.mutual_info_regression +:mod:`sklearn.freeze`: Estimator Freezing +========================================= + +.. automodule:: sklearn.freeze + :no-members: + :no-inherited-members: + +**User guide:** See the :ref:`freeze` section for further details. + +Classes ------- +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + freeze.FreezeWrap + + .. _gaussian_process_ref: :mod:`sklearn.gaussian_process`: Gaussian Processes diff --git a/doc/modules/freeze.rst b/doc/modules/freeze.rst new file mode 100644 index 0000000000000..4683e4b98e7de --- /dev/null +++ b/doc/modules/freeze.rst @@ -0,0 +1,50 @@ +.. _freeze: + +Frozen estimators and transfer learning +======================================= + +.. currentmodule:: sklearn + +It can be useful to pre-fit an estimator before including it in a Pipeline, +FeatureUnion or other meta-estimators. Example applications include: + +* transfer learning: incorporating a transformer trained on a large unlabelled + dataset in a prediction pipeline where the data to be modelled is much smaller +* feature selection on the basis of an already fitted predictive model + +To enable this, your estimator can be wrapped in :class:`freeze.FreezeWrap`. +For example:: + + Without transfer learning: + + >>> from sklearn.datasets import load_... + >>> from sklearn.model_selection import cross_val_score + >>> cross_val_score(make_pipeline(TfidfVectorizer(), LogisticRegression()), + ... X, y) + + With transfer learning: + >>> from sklearn.freeze import FreezeWrap + >>> tfidf = TfidfVectorizer().fit(large_X) + >>> cross_val_score(make_pipeline(FreezeWrap(tfidf), LogisticRegression()), + ... X, y) + +In particular, calling ``FreezeWrap(tfidf).fit(X, y)`` now does nothing, +while calling ``FreezeWrap(tfidf).fit_transform(X, y)`` just returns the result of +``tfidf.transform(X)``. + +.. note:: When an estimator is frozen, calling :func:`clone` on it will return + the estimator itself:: + + >>> from sklearn.base import clone + >>> frozen = FreezeWrap(tfidf) + >>> clone(frozen) is frozen + True + + This allows the model to be left untouched by cross-validation and by + meta-estimators that reset their estimators with ``clone``. + +.. warning:: Leakage: + Take care not to introduce data leakage with this approach: do not + incorporate your test set into the training of a frozen component, + unless it would be realistic to do so in the target application. 
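The doctests above elide the dataset loader and several imports. For reference, here is a self-contained sketch of the same transfer-learning pattern; it assumes the ``FreezeWrap`` API introduced in this diff, and the tiny corpora and pipeline below are illustrative stand-ins for ``large_X``, ``X`` and ``y``, not part of the patch::

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.freeze import FreezeWrap
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    from sklearn.pipeline import make_pipeline

    # A "large" unlabelled corpus (stand-in for large_X) and a small labelled set.
    large_X = ["free money now", "meeting at noon", "cheap pills online",
               "project update attached", "win a prize today", "lunch tomorrow?"]
    X = ["free prize money", "see you at the meeting",
         "cheap prize pills", "update on the project"]
    y = [1, 0, 1, 0]

    # Fit the vectorizer once on the unlabelled corpus, then freeze it:
    # clone() inside cross_val_score hands back the same fitted object, and the
    # pipeline's first step only calls tfidf.transform on each training fold.
    tfidf = TfidfVectorizer().fit(large_X)
    scores = cross_val_score(
        make_pipeline(FreezeWrap(tfidf), LogisticRegression()), X, y, cv=2)
    print(scores)

Without the wrapper, each fold would refit the vectorizer on only the handful of labelled training documents, discarding the vocabulary learned from the large unlabelled corpus.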
diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 8a25715498fcd..bdb77609da280 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -132,13 +132,13 @@ def config_context(**new_config): __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition', 'cross_validation', 'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions', 'externals', 'feature_extraction', - 'feature_selection', 'gaussian_process', 'grid_search', - 'isotonic', 'kernel_approximation', 'kernel_ridge', - 'learning_curve', 'linear_model', 'manifold', 'metrics', - 'mixture', 'model_selection', 'multiclass', 'multioutput', - 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', - 'preprocessing', 'random_projection', 'semi_supervised', - 'svm', 'tree', 'discriminant_analysis', + 'feature_selection', 'freeze', 'gaussian_process', + 'grid_search', 'isotonic', 'kernel_approximation', + 'kernel_ridge', 'learning_curve', 'linear_model', 'manifold', + 'metrics', 'mixture', 'model_selection', 'multiclass', + 'multioutput', 'naive_bayes', 'neighbors', 'neural_network', + 'pipeline', 'preprocessing', 'random_projection', + 'semi_supervised', 'svm', 'tree', 'discriminant_analysis', # Non-modules: 'clone'] diff --git a/sklearn/base.py b/sklearn/base.py index aa4f9f9ce17c1..f9559ce7ded66 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -3,7 +3,7 @@ # Author: Gael Varoquaux # License: BSD 3 clause -import copy +from copy import deepcopy import warnings import numpy as np @@ -45,11 +45,14 @@ def clone(estimator, safe=True): """ estimator_type = type(estimator) # XXX: not handling dictionaries + from .freeze import FreezeWrap + if isinstance(estimator, FreezeWrap): + return estimator if estimator_type in (list, tuple, set, frozenset): return estimator_type([clone(e, safe=safe) for e in estimator]) elif not hasattr(estimator, 'get_params'): if not safe: - return copy.deepcopy(estimator) + return deepcopy(estimator) else: raise TypeError("Cannot clone object '%s' (type %s): " "it does not seem to be a scikit-learn estimator " diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 0d2f76cd12239..d66a3e60f453e 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -45,7 +45,8 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin): base_estimator : instance BaseEstimator The classifier whose output decision function needs to be calibrated to offer more accurate predict_proba outputs. If cv=prefit, the - classifier must have been fit already on data. + classifier must have been fit already on data, and it is recommended + that the classifier be frozen (see :ref:`freeze`) in this case. method : 'sigmoid' or 'isotonic' The method to use for calibration. Can be 'sigmoid' which diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index 2502643453d79..f20af8be297a0 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -1,6 +1,8 @@ # Authors: Gilles Louppe, Mathieu Blondel, Maheshakya Wijewardena # License: BSD 3 clause +import warnings + import numpy as np from .base import SelectorMixin @@ -9,6 +11,7 @@ from ..exceptions import NotFittedError from ..utils.metaestimators import if_delegate_has_method +from ..utils.validation import check_is_fitted def _get_feature_importances(estimator, norm_order=1): @@ -86,9 +89,10 @@ class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin): ---------- estimator : object The base estimator from which the transformer is built. 
- This can be both a fitted (if ``prefit`` is set to True) - or a non-fitted estimator. The estimator must have either a - ``feature_importances_`` or ``coef_`` attribute after fitting. + The estimator must have either a ``feature_importances_`` + or ``coef_`` attribute after fitting. + + Use :class:`freeze.FreezeWrap` if your estimator is already fitted. threshold : string, float, optional default None The threshold value to use for feature selection. Features whose @@ -100,14 +104,6 @@ class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin): or implicitly (e.g, Lasso), the threshold used is 1e-5. Otherwise, "mean" is used by default. - prefit : bool, default False - Whether a prefit model is expected to be passed into the constructor - directly or not. If True, ``transform`` must be called directly - and SelectFromModel cannot be used with ``cross_val_score``, - ``GridSearchCV`` and similar utilities that clone the estimator. - Otherwise train the model using ``fit`` and then ``transform`` to do - feature selection. - norm_order : non-zero int, inf, -inf, default 1 Order of the norm used to filter the vectors of coefficients below ``threshold`` in the case where the ``coef_`` attribute of the @@ -117,28 +113,22 @@ class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin): ---------- estimator_ : an estimator The base estimator from which the transformer is built. - This is stored only when a non-fitted estimator is passed to the - ``SelectFromModel``, i.e when prefit is False. threshold_ : float The threshold value used for feature selection. """ - def __init__(self, estimator, threshold=None, prefit=False, norm_order=1): + def __init__(self, estimator, threshold=None, prefit=None, norm_order=1): self.estimator = estimator self.threshold = threshold self.prefit = prefit self.norm_order = norm_order def _get_support_mask(self): - # SelectFromModel can directly call on transform. if self.prefit: estimator = self.estimator - elif hasattr(self, 'estimator_'): - estimator = self.estimator_ else: - raise ValueError( - 'Either fit SelectFromModel before transform or set "prefit=' - 'True" and pass a fitted estimator to the constructor.') + check_is_fitted(self, 'estimator_') + estimator = self.estimator_ scores = _get_feature_importances(estimator, self.norm_order) threshold = _calculate_threshold(estimator, scores, self.threshold) return scores >= threshold @@ -162,6 +152,9 @@ def fit(self, X, y=None, **fit_params): self : object Returns self. """ + if self.prefit is not None: + warnings.warn('Parameter prefit is deprecated and will be removed ' + 'in version 0.22. Use FreezeWrap instead.') if self.prefit: raise NotFittedError( "Since 'prefit=True', call transform directly") diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index ae4d1ba4331a6..2392117fe16c4 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -183,3 +183,6 @@ def test_threshold_without_refitting(): # Set a higher threshold to filter out more features. model.threshold = "1.0 * mean" assert_greater(X_transform.shape[1], model.transform(data).shape[1]) + + +# TODO: test deprecation of prefit and that FreezeWrap behaves similarly diff --git a/sklearn/freeze.py b/sklearn/freeze.py new file mode 100644 index 0000000000000..999384e823b05 --- /dev/null +++ b/sklearn/freeze.py @@ -0,0 +1,124 @@ +""" +Utility for making estimators frozen / un-trainable. 
+""" +# Author: Joel Nothman +# License: BSD + +from .base import BaseEstimator, MetaEstimatorMixin +from .utils.metaestimators import if_delegate_has_method + +__all__ = ['FreezeWrap'] + + +class FreezeWrap(BaseEstimator, MetaEstimatorMixin): + """Disable fitting and cloning for the wrapped estimator + + Wrapping an estimator in ``FreezeWrap`` freezes it, such that: + + * ``clone(FreezeWrap(estimator))`` will return the same model without + clearing it + * ``FreezeWrap(estimator).fit(...)`` will not call ``estimator.fit()`` + * ``FreezeWrap(estimator).fit_transform(X, y)`` will just return + ``estimator.transform(X)`` + + Read more in the :ref:`User Guide <freeze>`. + + Parameters + ---------- + estimator : estimator + + Notes + ----- + Any keyword arguments passed to ``fit_transform`` will *not* + be passed on to ``transform`` (and similar for ``fit_predict``). + """ + + def __init__(self, estimator): + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + """Return self + + Parameters + ---------- + X + ignored + y : optional + ignored + kwargs : optional + ignored + """ + return self + + @if_delegate_has_method(delegate='estimator') + def fit_transform(self, X, y=None, **kwargs): + """Execute transform on estimator + + Parameters + ---------- + X + data to transform + y : optional + ignored + kwargs : ignored + ignored + """ + return self.estimator.transform(X) + + @if_delegate_has_method(delegate='estimator') + def fit_predict(self, X, y=None, **kwargs): + """Execute predict on estimator + + Parameters + ---------- + X + data to predict + y : optional + ignored + kwargs : ignored + ignored + """ + return self.estimator.predict(X) + + @if_delegate_has_method(delegate='estimator') + def transform(self, *args, **kwargs): + """Execute estimator's equivalent method + """ + return self.estimator.transform(*args, **kwargs) + + @if_delegate_has_method(delegate='estimator') + def decision_function(self, *args, **kwargs): + """Execute estimator's equivalent method + """ + return self.estimator.decision_function(*args, **kwargs) + + @if_delegate_has_method(delegate='estimator') + def predict(self, *args, **kwargs): + """Execute estimator's equivalent method + """ + return self.estimator.predict(*args, **kwargs) + + @if_delegate_has_method(delegate='estimator') + def predict_log_proba(self, *args, **kwargs): + """Execute estimator's equivalent method + """ + return self.estimator.predict_log_proba(*args, **kwargs) + + @if_delegate_has_method(delegate='estimator') + def predict_proba(self, *args, **kwargs): + """Execute estimator's equivalent method + """ + return self.estimator.predict_proba(*args, **kwargs) + + @property + def _estimator_type(self): + return self.estimator._estimator_type + + @property + def classes_(self): + return self.estimator.classes_ + + @property + def _pairwise(self): + # check whether the wrapped estimator expects pairwise input + return getattr(self.estimator, '_pairwise', False) diff --git a/sklearn/tests/test_freeze.py b/sklearn/tests/test_freeze.py new file mode 100644 index 0000000000000..a15329ffb7f65 --- /dev/null +++ b/sklearn/tests/test_freeze.py @@ -0,0 +1,35 @@ +import pickle +import numpy as np +from sklearn import datasets +from sklearn.freeze import FreezeWrap +from sklearn.feature_selection import SelectKBest +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_true +from sklearn.utils.testing import assert_false + + +def test_freeze(): + X, y = 
datasets.load_iris(return_X_y=True) + + est = SelectKBest(k=1).fit(X, y) + + frozen_est = FreezeWrap(est) + + dumped = pickle.dumps(frozen_est) + frozen_est2 = pickle.loads(dumped) + assert_false(frozen_est is frozen_est2) + + # Test fit_transform where expected + assert_true(hasattr(est, 'fit_transform')) + assert_true(hasattr(frozen_est, 'fit_transform')) + assert_false(est.fit_transform is frozen_est.fit_transform) + frozen_est.fit_transform([np.arange(X.shape[1])], [0]) + + # Test fit_transform not available when not on base + est = DecisionTreeClassifier().fit(X, y) + frozen_est = FreezeWrap(est) + assert_false(hasattr(est, 'fit_transform')) + assert_false(hasattr(frozen_est, 'fit_transform')) + + # TODO: much more diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 4a33d64d69bee..276acea654efa 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -508,7 +508,8 @@ def uninstall_mldata_mock(): META_ESTIMATORS = ["OneVsOneClassifier", "MultiOutputEstimator", "MultiOutputRegressor", "MultiOutputClassifier", "OutputCodeClassifier", "OneVsRestClassifier", - "RFE", "RFECV", "BaseEnsemble", "ClassifierChain"] + "RFE", "RFECV", "BaseEnsemble", "ClassifierChain", + "FreezeWrap"] # estimators that there is no way to default-construct sensibly OTHER = ["Pipeline", "FeatureUnion", "GridSearchCV", "RandomizedSearchCV", "SelectFromModel"]
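The ``# TODO: much more`` note in ``test_freeze.py`` above could, for instance, grow into checks of the two core guarantees documented in ``freeze.py``: that ``clone`` returns the frozen object itself and that ``fit`` is a no-op. The sketch below is illustrative only; the test name and the choice of ``LogisticRegression`` are assumptions, not part of this patch::

    import numpy as np
    from sklearn import datasets
    from sklearn.base import clone
    from sklearn.freeze import FreezeWrap
    from sklearn.linear_model import LogisticRegression
    from sklearn.utils.testing import assert_true


    def test_freeze_clone_and_no_refit():
        X, y = datasets.load_iris(return_X_y=True)
        est = LogisticRegression().fit(X, y)
        frozen = FreezeWrap(est)

        # clone() must hand back the very same frozen object, not a cleared copy.
        assert_true(clone(frozen) is frozen)

        # fit() on the wrapper must be a no-op: the wrapped coefficients stay
        # unchanged even when "refitting" on a different slice of the data.
        coef_before = est.coef_.copy()
        frozen.fit(X[:10], y[:10])
        np.testing.assert_array_equal(coef_before, est.coef_)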