@@ -39,13 +39,10 @@ is an estimator object::
>>> from sklearn.decomposition import PCA
>>> estimators = [('reduce_dim', PCA()), ('clf', SVC())]
>>> pipe = Pipeline(estimators)
- >>> pipe # doctest: +NORMALIZE_WHITESPACE
- Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto',
- n_components=None, random_state=None, svd_solver='auto', tol=0.0,
- whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None,
- coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto',
- kernel='rbf', max_iter=-1, probability=False, random_state=None,
- shrinking=True, tol=0.001, verbose=False))])
+ >>> pipe # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ Pipeline(memory=None,
+ steps=[('reduce_dim', PCA(copy=True,...)),
+ ('clf', SVC(C=1.0,...))])

The utility function :func:`make_pipeline` is a shorthand
for constructing pipelines;
@@ -56,7 +53,8 @@ filling in the names automatically::
>>> from sklearn.naive_bayes import MultinomialNB
>>> from sklearn.preprocessing import Binarizer
>>> make_pipeline(Binarizer(), MultinomialNB()) # doctest: +NORMALIZE_WHITESPACE
- Pipeline(steps=[('binarizer', Binarizer(copy=True, threshold=0.0)),
+ Pipeline(memory=None,
+ steps=[('binarizer', Binarizer(copy=True, threshold=0.0)),
('multinomialnb', MultinomialNB(alpha=1.0,
class_prior=None,
fit_prior=True))])
@@ -76,30 +74,26 @@ and as a ``dict`` in ``named_steps``::
Parameters of the estimators in the pipeline can be accessed using the
``<estimator>__<parameter>`` syntax::

- >>> pipe.set_params(clf__C=10) # doctest: +NORMALIZE_WHITESPACE
- Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto',
- n_components=None, random_state=None, svd_solver='auto', tol=0.0,
- whiten=False)), ('clf', SVC(C=10, cache_size=200, class_weight=None,
- coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto',
- kernel='rbf', max_iter=-1, probability=False, random_state=None,
- shrinking=True, tol=0.001, verbose=False))])
-
+ >>> pipe.set_params(clf__C=10) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ Pipeline(memory=None,
+ steps=[('reduce_dim', PCA(copy=True, iterated_power='auto',...)),
+ ('clf', SVC(C=10, cache_size=200, class_weight=None,...))])

This is particularly important for doing grid searches::
>>> from sklearn.model_selection import GridSearchCV
- >>> params = dict(reduce_dim__n_components=[2, 5, 10],
- ... clf__C=[0.1, 10, 100])
- >>> grid_search = GridSearchCV(pipe, param_grid=params)
+ >>> param_grid = dict(reduce_dim__n_components=[2, 5, 10],
+ ... clf__C=[0.1, 10, 100])
+ >>> grid_search = GridSearchCV(pipe, param_grid=param_grid)

Individual steps may also be replaced as parameters, and non-final steps may be
ignored by setting them to ``None``::

>>> from sklearn.linear_model import LogisticRegression
- >>> params = dict(reduce_dim=[None, PCA(5), PCA(10)],
- ... clf=[SVC(), LogisticRegression()],
- ... clf__C=[0.1, 10, 100])
- >>> grid_search = GridSearchCV(pipe, param_grid=params)
+ >>> param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
+ ... clf=[SVC(), LogisticRegression()],
+ ... clf__C=[0.1, 10, 100])
+ >>> grid_search = GridSearchCV(pipe, param_grid=param_grid)
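+
+ Once built, the grid search is fitted like any other estimator; a minimal
+ usage sketch (the digits dataset is an assumption here, not part of the
+ original snippet)::
+
+ >>> from sklearn.datasets import load_digits
+ >>> digits = load_digits()
+ >>> grid_search.fit(digits.data, digits.target) # doctest: +SKIP
+ >>> grid_search.best_params_ # doctest: +SKIP
+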
.. topic:: Examples:
@@ -108,6 +102,7 @@ ignored by setting them to ``None``::
* :ref:`sphx_glr_auto_examples_plot_digits_pipe.py`
* :ref:`sphx_glr_auto_examples_plot_kernel_approximation.py`
* :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py`
+ * :ref:`sphx_glr_auto_examples_plot_compare_reduction.py`

.. topic:: See also:
@@ -124,6 +119,84 @@ i.e. if the last estimator is a classifier, the :class:`Pipeline` can be used
as a classifier. If the last estimator is a transformer, again, so is the
pipeline.
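
+ For instance (a minimal sketch, not part of the original text, assuming the
+ digits dataset), the pipeline built above ends with a classifier, so the
+ pipeline itself exposes ``fit`` and ``predict``::
+
+ >>> from sklearn.datasets import load_digits
+ >>> digits = load_digits()
+ >>> pipe.fit(digits.data, digits.target) # doctest: +SKIP
+ >>> pipe.predict(digits.data[:2]) # doctest: +SKIP
+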
+ Caching transformers: avoid repeated computation
+ -------------------------------------------------
+
+ .. currentmodule:: sklearn.pipeline
+
+ Fitting transformers may be computationally expensive. With its
+ ``memory`` parameter set, :class:`Pipeline` will cache each transformer
+ after calling ``fit``.
+ This feature is used to avoid refitting the transformers within a pipeline
+ if the parameters and input data are identical. A typical example is the case of
+ a grid search in which the transformers can be fitted only once and reused for
+ each configuration.
+
+ The parameter ``memory`` is needed to cache the transformers.
+ ``memory`` can be either a string containing the directory in which to cache the
+ transformers or a `joblib.Memory <https://pythonhosted.org/joblib/memory.html>`_
+ object::
+
+ >>> from tempfile import mkdtemp
+ >>> from shutil import rmtree
+ >>> from sklearn.decomposition import PCA
+ >>> from sklearn.svm import SVC
+ >>> from sklearn.pipeline import Pipeline
+ >>> estimators = [('reduce_dim', PCA()), ('clf', SVC())]
+ >>> cachedir = mkdtemp()
+ >>> pipe = Pipeline(estimators, memory=cachedir)
+ >>> pipe # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ Pipeline(...,
+ steps=[('reduce_dim', PCA(copy=True,...)),
+ ('clf', SVC(C=1.0,...))])
+ >>> # Clear the cache directory when you don't need it anymore
+ >>> rmtree(cachedir)
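+
+ The payoff comes in a grid search, where the transformers can be fitted
+ once and reused across configurations; a sketch (not in the original),
+ reusing the ``param_grid`` defined earlier::
+
+ >>> from sklearn.datasets import load_digits
+ >>> digits = load_digits()
+ >>> cachedir = mkdtemp()
+ >>> cached_pipe = Pipeline(estimators, memory=cachedir)
+ >>> grid_search = GridSearchCV(cached_pipe, param_grid=param_grid)
+ >>> grid_search.fit(digits.data, digits.target) # doctest: +SKIP
+ >>> rmtree(cachedir)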
+
+ .. warning:: **Side effect of caching transformers**
+
+ When a :class:`Pipeline` is used without caching, the original
+ transformer instance can be inspected directly::
+
+ >>> from sklearn.datasets import load_digits
+ >>> digits = load_digits()
+ >>> pca1 = PCA()
+ >>> svm1 = SVC()
+ >>> pipe = Pipeline([('reduce_dim', pca1), ('clf', svm1)])
+ >>> pipe.fit(digits.data, digits.target)
+ ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ Pipeline(memory=None,
+ steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))])
+ >>> # The pca instance can be inspected directly
+ >>> print(pca1.components_) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ [[ -1.77484909e-19 ... 4.07058917e-18]]
+
+ Enabling caching triggers a clone of the transformers before fitting.
+ Therefore, the transformer instance given to the pipeline cannot be
+ inspected directly.
+ In the following example, accessing the :class:`PCA` instance ``pca2``
+ will raise an ``AttributeError`` since ``pca2`` will be an unfitted
+ transformer.
+ Instead, use the attribute ``named_steps`` to inspect estimators within
+ the pipeline::
+
+ >>> cachedir = mkdtemp()
+ >>> pca2 = PCA()
+ >>> svm2 = SVC()
+ >>> cached_pipe = Pipeline([('reduce_dim', pca2), ('clf', svm2)],
+ ... memory=cachedir)
+ >>> cached_pipe.fit(digits.data, digits.target)
+ ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ Pipeline(memory=...,
+ steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))])
+ >>> print(cached_pipe.named_steps['reduce_dim'].components_)
+ ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ [[ -1.77484909e-19 ... 4.07058917e-18]]
+ >>> # Remove the cache directory
+ >>> rmtree(cachedir)
+
+ .. topic:: Examples:
+
+ * :ref:`sphx_glr_auto_examples_plot_compare_reduction.py`

.. _feature_union:
@@ -164,15 +237,11 @@ and ``value`` is an estimator object::
>>> from sklearn.decomposition import KernelPCA
>>> estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]
>>> combined = FeatureUnion(estimators)
- >>> combined # doctest: +NORMALIZE_WHITESPACE
- FeatureUnion(n_jobs=1, transformer_list=[('linear_pca', PCA(copy=True,
- iterated_power='auto', n_components=None, random_state=None,
- svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca',
- KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3,
- eigen_solver='auto', fit_inverse_transform=False, gamma=None,
- kernel='linear', kernel_params=None, max_iter=None, n_components=None,
- n_jobs=1, random_state=None, remove_zero_eig=False, tol=0))],
- transformer_weights=None)
+ >>> combined # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ FeatureUnion(n_jobs=1,
+ transformer_list=[('linear_pca', PCA(copy=True,...)),
+ ('kernel_pca', KernelPCA(alpha=1.0,...))],
+ transformer_weights=None)

Like pipelines, feature unions have a shorthand constructor called
@@ -182,11 +251,12 @@ Like pipelines, feature unions have a shorthand constructor called
Like ``Pipeline``, individual steps may be replaced using ``set_params``,
and ignored by setting to ``None``::

- >>> combined.set_params(kernel_pca=None) # doctest: +NORMALIZE_WHITESPACE
- FeatureUnion(n_jobs=1, transformer_list=[('linear_pca', PCA(copy=True,
- iterated_power='auto', n_components=None, random_state=None,
- svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', None)],
- transformer_weights=None)
+ >>> combined.set_params(kernel_pca=None)
+ ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ FeatureUnion(n_jobs=1,
+ transformer_list=[('linear_pca', PCA(copy=True,...)),
+ ('kernel_pca', None)],
+ transformer_weights=None)
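+
+ A hedged sketch of the shorthand itself (output abridged): :func:`make_union`
+ fills in the step names automatically, mirroring :func:`make_pipeline`::
+
+ >>> from sklearn.pipeline import make_union
+ >>> make_union(PCA(), KernelPCA()) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ FeatureUnion(n_jobs=1,
+ transformer_list=[('pca', PCA(copy=True,...)),
+ ('kernelpca', KernelPCA(alpha=1.0,...))],
+ transformer_weights=None)
+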
.. topic:: Examples: