|
1 |
| -#!/usr/bin/python |
| 1 | +#!/usr/bin/env python |
2 | 2 | # -*- coding: utf-8 -*-
|
3 | 3 | """
|
4 |
| -================================================================= |
5 |
| -Selecting dimensionality reduction with Pipeline and GridSearchCV |
6 |
| -================================================================= |
| 4 | +================================================================================== |
| 5 | +Selecting dimensionality reduction with Pipeline, CachedPipeline, and \ |
| 6 | +GridSearchCV |
| 7 | +================================================================================== |
7 | 8 |
|
8 | 9 | This example constructs a pipeline that does dimensionality
|
9 | 10 | reduction followed by prediction with a support vector
|
10 |
| -classifier. It demonstrates the use of GridSearchCV and |
11 |
| -Pipeline to optimize over different classes of estimators in a |
12 |
| -single CV run -- unsupervised PCA and NMF dimensionality |
| 11 | +classifier. It demonstrates the use of ``GridSearchCV`` and |
| 12 | +``Pipeline`` to optimize over different classes of estimators in a |
| 13 | +single CV run -- unsupervised ``PCA`` and ``NMF`` dimensionality |
13 | 14 | reductions are compared to univariate feature selection during
|
14 | 15 | the grid search.
|
| 16 | +
|
| 17 | +Additionally, ``Pipeline`` can be exchanged with ``CachedPipeline`` |
| 18 | +to memoize the transformers within the pipeline, so that the same |
| 19 | +transformers are not refit over and over. |
| 20 | +
|
| 21 | +Note that ``CachedPipeline`` only pays off when fitting a |
| 22 | +transformer is costly. |
15 | 23 | """
|
16 | 24 | # Authors: Robert McGibbon, Joel Nothman
|
17 | 25 |
|
| 26 | +############################################################################### |
| 27 | +# Illustration of ``Pipeline`` and ``GridSearchCV`` |
| 28 | +############################################################################### |
| 29 | +# This section illustrates the use of a ``Pipeline`` with |
| 30 | +# ``GridSearchCV`` |
| 31 | + |
18 | 32 | from __future__ import print_function, division
|
19 | 33 |
|
| 34 | +from tempfile import mkdtemp |
20 | 35 | import numpy as np
|
21 | 36 | import matplotlib.pyplot as plt
|
22 | 37 | from sklearn.datasets import load_digits
|
23 | 38 | from sklearn.model_selection import GridSearchCV
|
24 |
| -from sklearn.pipeline import Pipeline |
| 39 | +from sklearn.pipeline import Pipeline, CachedPipeline |
25 | 40 | from sklearn.svm import LinearSVC
|
26 | 41 | from sklearn.decomposition import PCA, NMF
|
27 | 42 | from sklearn.feature_selection import SelectKBest, chi2
|
28 |
| - |
29 |
| -print(__doc__) |
| 43 | +from sklearn.externals.joblib import Memory |
30 | 44 |
|
31 | 45 | pipe = Pipeline([
|
32 | 46 | ('reduce_dim', PCA()),
|
|
73 | 87 | plt.ylim((0, 1))
|
74 | 88 | plt.legend(loc='upper left')
|
75 | 89 | plt.show()
|
| 90 | + |
| 91 | +############################################################################### |
| 92 | +# Illustration of ``CachedPipeline`` instead of ``Pipeline`` |
| 93 | +############################################################################### |
| 94 | +# It is sometimes interesting to store the state of a specific transformer |
| 95 | +# since it could be used again. Using a pipeline in ``GridSearchCV`` triggers |
| 96 | +# such situations. Therefore, we replace ``Pipeline`` with ``CachedPipeline`` |
| 97 | +# to memoize the transformers within the pipeline. |
| 98 | +# |
| 99 | +# .. warning:: |
| 100 | +# Note that this example is, however, only an illustration since for this |
| 101 | +# specific case fitting PCA is not necessarily slower than loading the |
| 102 | +# cache. Hence, use ``CachedPipeline`` when the fitting of a transformer |
| 103 | +# is costly. |
| 104 | + |
| 105 | +# Create a temporary folder to store the transformers of the pipeline |
| 106 | +cachedir = mkdtemp() |
| 107 | +memory = Memory(cachedir=cachedir, verbose=10) |
| 108 | +cached_pipe = CachedPipeline([('reduce_dim', PCA()), |
| 109 | + ('classify', LinearSVC())], |
| 110 | + memory=memory) |
| 111 | + |
| 112 | +# This time, a cached pipeline will be used within the grid search |
| 113 | +grid = GridSearchCV(cached_pipe, cv=3, n_jobs=2, param_grid=param_grid) |
| 114 | +digits = load_digits() |
| 115 | +grid.fit(digits.data, digits.target) |
0 commit comments