10000 Some insufficient tests and docs · scikit-learn/scikit-learn@db39cc5 · GitHub
[go: up one dir, main page]

Skip to content

Commit db39cc5

Browse files
committed
Some insufficient tests and docs
1 parent eb535b6 commit db39cc5

File tree

8 files changed

+74
-31
lines changed

8 files changed

+74
-31
lines changed

doc/data_transforms.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,4 @@ scikit-learn.
3333
modules/kernel_approximation
3434
modules/metrics
3535
modules/preprocessing_targets
36+
modules/freeze

doc/modules/freeze.rst

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
.. _frozen:
2+
3+
Frozen estimators and transfer learning
4+
=======================================
5+
.. currentmodule:: sklearn
6+
7+
It can be useful to pre-fit an estimator before including it in a Pipeline,
8+
FeatureUnion or other meta-estimators. Example applications include:
9+
10+
* transfer learning: incorporating a transformer trained on a large unlabelled
11+
dataset in a prediction pipeline where the data to be modelled is much smaller
12+
* feature selection on the basis of an already fitted predictive model
13+
14+
To enable this, your estimator can be wrapped in :class:`freeze.FreezeWrap`.
15+
For example::
16+
17+
Without transfer learning:
18+
19+
>>> from sklearn.datasets import load_...
20+
>>> from sklearn.model_selection import cross_val_score
21+
>>> cross_val_score(make_pipeline(TfidfVectorizer(), LogisticRegression()),
22+
... X, y)
23+
24+
With transfer learning:
25+
>>> from sklearn.freeze import FreezeWrap
26+
>>> tfidf = TfidfVectorizer().fit(large_X)
27+
>>> cross_val_score(make_pipeline(FreezeWrap(tfidf), LogisticRegression()),
28+
... X, y)
29+
30+
In particular, calling ``FreezeWrap(tfidf).fit(X, y)`` now does nothing,
31+
while calling ``FreezeWrap(tfidf).fit_transform(X, y)`` just returns the result of
32+
``tfidf.transform(X)``.
33+
34+
.. note::
35+
When an estimator is frozen, calling :func:`clone` on it will return
36+
itself::
37+
38+
>>> from sklearn.base import clone
39+
>>> frozen = FreezeWrap(tfidf)
40+
>>> clone(frozen) is frozen
41+
True
42+
43+
This allows the model to be left untouched in cross-validation and
44+
meta-estimators which clear the estimator with ``clone``.
45+
46+
.. warning:: Leakage:
47+
Please take care to not introduce data leakage by this method: do not
48+
incorporate your test set into the training of some frozen component,
49+
unless it would be realistic to do so in the target application.

sklearn/calibration.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
4545
base_estimator : instance BaseEstimator
4646
The classifier whose output decision function needs to be calibrated
4747
to offer more accurate predict_proba outputs. If cv=prefit, the
48-
classifier must have been fit already on data.
48+
classifier must have been fit already on data, and it is recommended
49+
that the classifier be frozen (see :ref:`frozen`) in this case.
4950
5051
method : 'sigmoid' or 'isotonic'
5152
The method to use for calibration. Can be 'sigmoid' which

sklearn/ensemble/voting_classifier.py

Lines changed: 3 additions & 1 deletion
32
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
def _parallel_fit_estimator(estimator, X, y, sample_weight):
2727
"""Private function used to fit an estimator within a job."""
2828
if sample_weight is not None:
29-
estimator.fit(X, y, sample_weight)
29+
estimator.fit(X, y, sample_weight=sample_weight)
3030
else:
3131
estimator.fit(X, y)
32
return estimator
@@ -47,6 +47,8 @@ class VotingClassifier(_BaseComposition, ClassifierMixin, TransformerMixin):
4747
``self.estimators_``. An estimator can be set to `None` using
4848
``set_params``.
4949
50+
Some of these estimators may be frozen (see :ref:`frozen`).
51+
5052
voting : str, {'hard', 'soft'} (default='hard')
5153
If 'hard', uses predicted class labels for majority rule voting.
5254
Else if 'soft', predicts the class label based on the argmax of

sklearn/feature_selection/from_model.py

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# Authors: Gilles Louppe, Mathieu Blondel, Maheshakya Wijewardena
22
# License: BSD 3 clause
33

4+
import warnings
5+
46
import numpy as np
57

68
from .base import SelectorMixin
@@ -86,9 +88,10 @@ class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin):
8688
----------
8789
estimator : object
8890
The base estimator from which the transformer is built.
89-
This can be both a fitted (if ``prefit`` is set to True)
90-
or a non-fitted estimator. The estimator must have either a
91-
``feature_importances_`` or ``coef_`` attribute after fitting.
91+
The estimator must have either a ``feature_importances_``
92+
or ``coef_`` attribute after fitting.
93+
94+
Use :class:`freeze.FreezeWrap` if your estimator is already fitted.
9295
9396
threshold : string, float, optional default None
9497
The threshold value to use for feature selection. Features whose
@@ -100,14 +103,6 @@ class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin):
100103
or implicitly (e.g, Lasso), the threshold used is 1e-5.
101104
Otherwise, "mean" is used by default.
102105
103-
prefit : bool, default False
104-
Whether a prefit model is expected to be passed into the constructor
105-
directly or not. If True, ``transform`` must be called directly
106-
and SelectFromModel cannot be used with ``cross_val_score``,
107-
``GridSearchCV`` and similar utilities that clone the estimator.
108-
Otherwise train the model using ``fit`` and then ``transform`` to do
109-
feature selection.
110-
111106
norm_order : non-zero int, inf, -inf, default 1
112107
Order of the norm used to filter the vectors of coefficients below
113108
``threshold`` in the case where the ``coef_`` attribute of the
@@ -117,28 +112,22 @@ class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin):
117112
----------
118113
estimator_ : an estimator
119114
The base estimator from which the transformer is built.
120-
This is stored only when a non-fitted estimator is passed to the
121-
``SelectFromModel``, i.e when prefit is False.
122115
123116
threshold_ : float
124117
The threshold value used for feature selection.
125118
"""
126-
def __init__(self, estimator, threshold=None, prefit=False, norm_order=1):
119+
def __init__(self, estimator, threshold=None, prefit=None, norm_order=1):
127120
self.estimator = estimator
128121
self.threshold = threshold
129122
self.prefit = prefit
130123
self.norm_order = norm_order
131124

132125
def _get_support_mask(self):
133-
# SelectFromModel can directly call on transform.
134126
if self.prefit:
135127
estimator = self.estimator
136-
elif hasattr(self, 'estimator_'):
137-
estimator = self.estimator_
138128
else:
139-
raise ValueError(
140-
'Either fit SelectFromModel before transform or set "prefit='
141-
'True" and pass a fitted estimator to the constructor.')
129+
from ..utils.validation import check_is_fitted
130+
check_is_fitted(self, 'estimator_')
142131
scores = _get_feature_importances(estimator, self.norm_order)
143132
threshold = _calculate_threshold(estimator, scores, self.threshold)
144133
return scores >= threshold
@@ -162,6 +151,9 @@ def fit(self, X, y=None, **fit_params):
162151
self : object
163152
Returns self.
164153
"""
154+
if self.prefit is not None:
155+
warnings.warn('Parameter prefit is deprecated and will be removed '
156+
'in version 0.22. Use FreezeWrap instead.')
165157
if self.prefit:
166158
raise NotFittedError(
167159
"Since 'prefit=True', call transform directly")

sklearn/feature_selection/tests/test_from_model.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,3 +183,6 @@ def test_threshold_without_refitting():
183183
# Set a higher threshold to filter out more features.
184184
model.threshold = "1.0 * mean"
185185
assert_greater(X_transform.shape[1], model.transform(data).shape[1])
186+
187+
188+
# TODO: test deprecation of prefit and that FreezeWrap behaves similarly

sklearn/freeze.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ class FreezeWrap(BaseEstimator):
1515
"""
1616

1717
def __init__(self, estimator):
18-
self.estimator
18+
self.estimator = estimator
1919

2020
def fit(self, X, y=None, **kwargs):
2121
"""Return self

sklearn/tests/test_freeze.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,17 @@ def test_freeze():
1919
dumped = pickle.dumps(frozen_est)
2020
frozen_est2 = pickle.loads(dumped)
2121
assert_false(frozen_est is frozen_est2)
22-
assert_array_equal(est.scores_, frozen_est2.scores_)
23-
24-
# scores should be unaffected by new fit
25-
assert_true(frozen_est2.fit() is frozen_est2)
26-
assert_array_equal(est.scores_, frozen_est2.scores_)
2722

2823
# Test fit_transform where expected
2924
assert_true(hasattr(est, 'fit_transform'))
3025
assert_true(hasattr(frozen_est, 'fit_transform'))
3126
assert_false(est.fit_transform is frozen_est.fit_transform)
3227
frozen_est.fit_transform([np.arange(X.shape[1])], [0])
33-
# scores should be unaffected by new fit_transform
34-
assert_array_equal(est.scores_, frozen_est.scores_)
3528

36-
# Test fit_transform not set when not needed
29+
# Test fit_transform not available when not on base
3730
est = DecisionTreeClassifier().fit(X, y)
3831
frozen_est = FreezeWrap(est)
3932
assert_false(hasattr(est, 'fit_transform'))
4033
assert_false(hasattr(frozen_est, 'fit_transform'))
34+
35+
# TODO: much more

0 commit comments

Comments
 (0)
0