WIP Grid search convenience class. by amueller · Pull Request #1034 · scikit-learn/scikit-learn · GitHub

WIP Grid search convenience class. #1034


Closed
wants to merge 10 commits
46 changes: 46 additions & 0 deletions examples/plot_grid_search.py
@@ -0,0 +1,46 @@
"""
=====================================================
Visualizing results of high dimensional grid searches
=====================================================

Often one is faced with combining feature extraction, feature selection
and classification into a complex pipeline.
Each individual step usually has many tunable parameters. Finding the
important parameters for a given task and picking robust settings is often
hard.

This example shows how to visualize the results of a grid search with
many interacting parameters.
The ``DecisionTreeClassifier`` is a good model for a complex pipeline as there
are many parameters to tweak, but often only a few have significant influence.
"""
print __doc__

import pylab as pl

from sklearn.datasets import make_classification
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, n_features=10, random_state=0)

param_grid = {'max_depth': range(1, 8), 'min_samples_split': [1, 2, 3, 4],
              'max_features': [1, 3, 5, 8, 10]}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid,
                           cv=5)
grid_search.fit(X, y)

cv_scores = grid_search.scores_

fig, axes = pl.subplots(1, 3)
axes = axes.ravel()
for ax, param in zip(axes, cv_scores.params):
    means, errors = cv_scores.accumulate(param, 'max')
    ax.errorbar(cv_scores.values[param], means, yerr=errors)
    ax.set_xlabel(param)
    ax.set_ylabel("accuracy")
    ax.set_ylim(0.6, 0.95)
fig.set_size_inches((12, 4), forward=True)
pl.subplots_adjust(left=0.07, right=0.95, bottom=0.15, wspace=0.26)
pl.show()
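The loop above relies on the alphabetical parameter ordering documented in ``ResultGrid``. As a rough sketch of the shapes involved for this particular grid (7 ``max_depth`` values, 5 ``max_features`` values, 4 ``min_samples_split`` values, 5 folds), one could check:

assert cv_scores.params == ['max_depth', 'max_features', 'min_samples_split']
assert cv_scores.scores.shape == (7, 5, 4, 5)   # last axis: the 5 CV folds
means, errors = cv_scores.accumulate('max_depth', 'max')
assert means.shape == (7,)   # one best-score entry per max_depth value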
19 changes: 11 additions & 8 deletions examples/svm/plot_rbf_parameters.py
@@ -105,21 +105,24 @@
pl.axis('tight')

# plot the scores of the grid
# grid_scores_ contains parameter settings and scores
score_dict = grid.grid_scores_

# We extract just the scores
scores = [x[1] for x in score_dict]
scores = np.array(scores).reshape(len(C_range), len(gamma_range))
cv_scores = grid.scores_

# draw heatmap of accuracy as a function of gamma and C
pl.figure(figsize=(8, 6))
pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
pl.imshow(cv_scores.mean(), interpolation='nearest', cmap=pl.cm.spectral)
Review comment from a project member: On this plot, I'd also set a title to the color bar.

pl.xlabel('gamma')
pl.ylabel('C')
pl.colorbar()
cb = pl.colorbar()
cb.set_label("Accuracy")
pl.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
pl.yticks(np.arange(len(C_range)), C_range)

fig, axes = pl.subplots(2, 1)
for ax, param in zip(axes, cv_scores.params):
    maxs, errors = cv_scores.accumulate(param, 'max')
    ax.errorbar(np.arange(len(cv_scores.values[param])), maxs,
                yerr=errors)
    ax.set_title(param)

pl.show()
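A small sanity check of the orientation assumed by the ``imshow`` call above (a sketch, assuming the parameter grid uses the keys ``C`` and ``gamma`` as elsewhere in this example; ``'C'`` sorts before ``'gamma'``, so ``C`` indexes the rows):

assert cv_scores.params == ['C', 'gamma']
assert cv_scores.mean().shape == (len(C_range), len(gamma_range))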
124 changes: 118 additions & 6 deletions sklearn/grid_search.py
@@ -19,6 +19,86 @@
from .utils import check_arrays, safe_mask


class ResultGrid(object):
    """Provides easy access to grid search results.

    This object is constructed by GridSearchCV and
    provides an easy interface to evaluate the grid search
    results.

    Attributes
    ----------
    params: list of string
        Lists the parameters adjusted during the grid search.
        This is an alphabetical sorting of the keys
        of the ``param_grid`` used in the GridSearchCV.
    values: dict
        This contains the values of the parameters
        that were used during the grid search.
    scores: ndarray
        Contains all the scores of all runs.
        Each axis corresponds to the setting of one
        parameter, in the order given in ``params``.
        The last axis corresponds to the folds.
    """

    def __init__(self, params, values, scores):
        self.scores = scores
        self.params = params
        self.values = values

    def mean(self):
        """Returns mean scores over folds for the whole parameter grid."""
        return np.mean(self.scores, axis=-1)

    def std(self):
        """Returns standard deviation of scores over folds for the whole
        parameter grid."""
        return np.std(self.scores, axis=-1)

    def accumulate(self, param, kind="max"):
        """Accumulates scores over all but one parameter.

        Useful for grid searches over many parameters, where
        the whole grid cannot easily be visualized.

        Parameters
        ----------
        param: string
            Name of the parameter not to accumulate over.
        kind: string, 'mean' or 'max'
            Operation that is used to accumulate over all parameters
            except ``param``.

        Returns
        -------
        scores: ndarray
            1d array of scores corresponding to the different settings
            of ``param``.
        errors: ndarray
            1d array of standard deviations of scores.
        """
        index = self.params.index(param)
        # make the interesting axis the first one
        n_values = len(self.values[param])
        accumulated_mean = np.rollaxis(self.mean(), index, 0)
        accumulated_mean = accumulated_mean.reshape(n_values, -1)
        accumulated_std = np.rollaxis(self.std(), index, 0)
        accumulated_std = accumulated_std.reshape(n_values, -1)
        if kind == "mean":
            accumulated_mean = np.mean(accumulated_mean, axis=-1)
            accumulated_std = np.mean(accumulated_std, axis=-1)
        elif kind == "max":
            max_inds = np.argmax(accumulated_mean, axis=-1)
            inds = np.indices(max_inds.shape)
            accumulated_mean = accumulated_mean[inds, max_inds].ravel()
            accumulated_std = accumulated_std[inds, max_inds].ravel()
        else:
            raise ValueError("kind must be 'mean' or 'max', got %s." %
                             str(kind))
        return accumulated_mean, accumulated_std
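For reference, a minimal sketch of how these accessors compose, building a ``ResultGrid`` by hand with made-up values (two hypothetical parameters, three folds):

import numpy as np

params = ['C', 'gamma']
values = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1]}
scores = np.random.rand(3, 2, 3)        # axes: (C, gamma, folds)

result = ResultGrid(params, values, scores)
result.mean().shape                     # (3, 2): mean over folds
result.std().shape                      # (3, 2): std over folds
means, errors = result.accumulate('C', 'max')
# means[i] is the best mean score over all gamma settings for the
# i-th value of C; errors[i] is the matching standard deviation.
assert means.shape == (3,) and errors.shape == (3,)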


class IterGrid(object):
"""Gen 8000 erators on the combination of the various parameter lists given

@@ -97,7 +177,6 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func,
    else:
        X_train = X[safe_mask(X, train)]
        X_test = X[safe_mask(X, test)]

    if y is not None:
        y_test = y[safe_mask(y, test)]
        y_train = y[safe_mask(y, train)]
@@ -150,8 +229,8 @@ def _check_param_grid(param_grid):
raise ValueError("Parameter values should be a list.")

if len(v) == 0:
raise ValueError("Parameter values should be a non-empty "
"list.")
raise ValueError("Parameter values should be "
"a non-empty list.")


def _has_one_grid_point(param_grid):
@@ -268,6 +347,10 @@ class GridSearchCV(BaseEstimator, MetaEstimatorMixin):
    `best_params_` : dict
        Parameter setting that gave the best results on the hold out data.

    `scores_` : ResultGrid or list of ResultGrid
        For each dict in ``param_grid`` this holds a ``ResultGrid`` that
        provides easy analysis of the grid search scores. If ``param_grid``
        is a single dict, ``scores_`` is that single ``ResultGrid``.

    Notes
    ------
    The parameters selected are those that maximize the score of the left out
@@ -435,9 +518,38 @@ def _fit(self, X, y):
        self._best_estimator_ = best_estimator
        self._set_methods()

        # Store the computed scores
        # XXX: the name is too specific, it shouldn't have
        # 'grid' in it. Also, we should be retrieving/storing variance
        # param_grid can be a list of dicts;
        # convert a single dict to a list for unified treatment
        if hasattr(self.param_grid, 'items'):
            # wrap dictionary in a singleton list
            param_grid = [self.param_grid]
        else:
            param_grid = self.param_grid
        # for each entry in the param_grid list, we build
        # an array of scores.
        # we don't know how long the parts are so we have
        # to keep track of everything :-/
        start = 0
        self.scores_ = []
        for one_grid in param_grid:
            sorted_params = sorted(one_grid.keys())
            # get the number of values for each parameter
            grid_shape = [len(one_grid[k]) for k in sorted_params]
            n_entries = np.prod(grid_shape)
            grid_shape.append(n_folds)
            # get scores
            score_array = np.array(cv_scores[start:start + n_entries])
            # reshape to fit the sequence of values
            score_array = score_array.reshape(grid_shape)
            self.scores_.append(ResultGrid(sorted_params, one_grid,
                                           score_array))
            start += n_entries

        # often the list is just one grid. Make access easier
        if len(self.scores_) == 1:
            self.scores_ = self.scores_[0]

        # old interface
        self.grid_scores_ = [
            (clf_params, score, all_scores)
            for clf_params, (score, _), all_scores
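The reshape in ``_fit`` above assumes the flat list of per-setting fold scores follows the ordering produced by ``IterGrid``: sorted parameter names, with the last name varying fastest. A toy illustration of that bookkeeping, with invented parameter values and fold scores:

import numpy as np

# suppose one_grid = {'max_depth': [1, 2, 3], 'max_features': [1, 2]}
# and 3 CV folds; the 6 settings arrive with 'max_features' (the last
# sorted key) varying fastest:
flat_scores = [[.70, .72, .71],   # max_depth=1, max_features=1
               [.68, .69, .70],   # max_depth=1, max_features=2
               [.75, .74, .76],   # max_depth=2, max_features=1
               [.73, .72, .74],   # max_depth=2, max_features=2
               [.74, .73, .75],   # max_depth=3, max_features=1
               [.71, .70, .72]]   # max_depth=3, max_features=2

grid_shape = [3, 2, 3]            # (n max_depth, n max_features, n_folds)
score_array = np.array(flat_scores).reshape(grid_shape)
# score_array[i, j, k] is the k-th fold score for the i-th max_depth
# and j-th max_features value -- the layout ResultGrid expects.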
33 changes: 32 additions & 1 deletion sklearn/tests/test_grid_search.py
@@ -12,6 +12,7 @@
from sklearn.grid_search import GridSearchCV
from sklearn.datasets.samples_generator import make_classification
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score
from sklearn.cross_validation import KFold

@@ -49,7 +50,7 @@ def test_grid_search():
    assert_equal(grid_search.best_estimator_.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert_true(grid_search.grid_scores_[i][0] == {'foo_param': foo_i})
        assert_equal(grid_search.grid_scores_[i][0], {'foo_param': foo_i})
    # Smoke test the score:
    grid_search.score(X, y)

@@ -225,3 +226,33 @@ def test_X_as_list():
    cv = KFold(n=len(X), k=3)
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
    grid_search.fit(X.tolist(), y).score(X, y)


def test_result_grid():
    # make a small grid search and test ResultGrid on it
    clf = DecisionTreeClassifier()
    X, y = make_classification()
    param_grid = {'max_depth': np.arange(1, 5),
                  'max_features': np.arange(1, 3)}
    grid_search = GridSearchCV(clf, param_grid=param_grid)
    grid_search.fit(X, y)
    result = grid_search.scores_
    assert_equal(result.mean().shape, (4, 2))
    assert_equal(result.std().shape, (4, 2))
    assert_equal(result.scores.shape, (4, 2, 3))
    means, errs = result.accumulate('max_depth')
    assert_equal(len(means), 4)
    assert_equal(len(errs), 4)
    assert_equal(len(result.values['max_depth']), 4)


def test_list():
    # test that grid search can handle a list of dicts as param_grid
    # smoke test!
    clf = DecisionTreeClassifier()
    X, y = make_classification()
    param_grid = [{'max_depth': np.arange(1, 5)},
                  {'max_features': np.arange(1, 3)}]
    grid_search = GridSearchCV(clf, param_grid=param_grid)
    grid_search.fit(X, y)
    assert_equal(len(grid_search.scores_), 2)
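When ``param_grid`` is a list of dicts, as in the smoke test above, ``scores_`` keeps one ``ResultGrid`` per dict, in the same order. A brief sketch of how they could be inspected (shapes assume the default 3-fold CV, as in ``test_result_grid``):

result_depth, result_features = grid_search.scores_
assert result_depth.scores.shape == (4, 3)      # 4 max_depth values, 3 folds
assert result_features.scores.shape == (2, 3)   # 2 max_features values, 3 folds
means, errors = result_depth.accumulate('max_depth')
assert means.shape == (4,)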