diff --git a/examples/plot_grid_search.py b/examples/plot_grid_search.py
new file mode 100644
index 0000000000000..dff81b743c6d8
--- /dev/null
+++ b/examples/plot_grid_search.py
@@ -0,0 +1,46 @@
+"""
+=====================================================
+Visualizing results of high dimensional grid searches
+=====================================================
+
+Often one is faced with combining feature extraction, feature selection
+and classification into a complex pipeline.
+Each individual step usually has many tunable parameters. Finding the
+important parameters for a given task and picking robust settings is often
+hard.
+
+This example shows how to visualize the results of a grid search with
+many interacting parameters.
+The ``DecisionTreeClassifier`` is a good model for a complex pipeline as there
+are many parameters to tweak, but often only a few have significant influence.
+"""
+print __doc__
+
+import pylab as pl
+
+from sklearn.datasets import make_classification
+from sklearn.grid_search import GridSearchCV
+from sklearn.tree import DecisionTreeClassifier
+
+X, y = make_classification(n_samples=100, n_features=10, random_state=0)
+
+param_grid = {'max_depth': range(1, 8), 'min_samples_split': [1, 2, 3, 4],
+              'max_features': [1, 3, 5, 8, 10]}
+
+grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid,
+                           cv=5)
+grid_search.fit(X, y)
+
+cv_scores = grid_search.scores_
+
+fig, axes = pl.subplots(1, 3)
+axes = axes.ravel()
+for ax, param in zip(axes, cv_scores.params):
+    means, errors = cv_scores.accumulate(param, 'max')
+    ax.errorbar(cv_scores.values[param], means, yerr=errors)
+    ax.set_xlabel(param)
+    ax.set_ylabel("accuracy")
+    ax.set_ylim(0.6, 0.95)
+fig.set_size_inches((12, 4), forward=True)
+pl.subplots_adjust(left=0.07, right=0.95, bottom=0.15, wspace=0.26)
+pl.show()
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index 465098199be2c..5246f17017b0b 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -105,21 +105,24 @@
 pl.axis('tight')
 
 # plot the scores of the grid
-# grid_scores_ contains parameter settings and scores
-score_dict = grid.grid_scores_
-
-# We extract just the scores
-scores = [x[1] for x in score_dict]
-scores = np.array(scores).reshape(len(C_range), len(gamma_range))
+cv_scores = grid.scores_
 
 # draw heatmap of accuracy as a function of gamma and C
 pl.figure(figsize=(8, 6))
 pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
-pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
+pl.imshow(cv_scores.mean(), interpolation='nearest', cmap=pl.cm.spectral)
 pl.xlabel('gamma')
 pl.ylabel('C')
-pl.colorbar()
+cb = pl.colorbar()
+cb.set_label("Accuracy")
 pl.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
 pl.yticks(np.arange(len(C_range)), C_range)
 
+fig, axes = pl.subplots(2, 1)
+for ax, param in zip(axes, cv_scores.params):
+    maxs, errors = cv_scores.accumulate(param, 'max')
+    ax.errorbar(np.arange(len(cv_scores.values[param])), maxs,
+                yerr=errors)
+    ax.set_title(param)
+
 pl.show()
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index eccf0ac471e2e..48e5c10c0f2af 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -19,6 +19,86 @@
 from .utils import check_arrays, safe_mask
 
 
+class ResultGrid(object):
+    """Provides easy access to grid search results.
+
+    This object is constructed by GridSearchCV and
+    provides an easy interface to evaluate the grid search
+    results.
+
+    Attributes
+    ----------
+    params: list of string
+        Lists the parameters adjusted during the grid search.
+        This is an alphabetical sorting of the keys
+        of the ``param_grid`` used in the GridSearchCV.
+    values: dict
+        This contains the values of the parameters
+        that were used during the grid search.
+    scores: ndarray
+        Contains all the scores of all runs.
+        Each axis corresponds to the setting of one
+        parameter, in the order given in ``params``.
+        The last axis corresponds to the folds.
+    """
+
+    def __init__(self, params, values, scores):
+        self.scores = scores
+        self.params = params
+        self.values = values
+
+    def mean(self):
+        """Returns mean scores over folds for the whole parameter grid."""
+        return np.mean(self.scores, axis=-1)
+
+    def std(self):
+        """Returns standard deviation of scores over folds for the whole
+        parameter grid."""
+        return np.std(self.scores, axis=-1)
+
+    def accumulate(self, param, kind="max"):
+        """Accumulates scores over all but one parameter.
+
+        Useful for grid searches over many parameters, where
+        the whole grid cannot easily be visualized.
+
+        Parameters
+        ----------
+        param: string
+            Name of the parameter not to accumulate over.
+        kind: string, 'mean' or 'max'
+            Operation that is used to accumulate over all parameters
+            except ``param``.
+
+        Returns
+        -------
+        scores: ndarray
+            1d array of scores corresponding to the different settings
+            of ``param``.
+        errors: ndarray
+            1d array of standard deviations of scores.
+        """
+        index = self.params.index(param)
+        # move the axis belonging to ``param`` to the front
+        n_values = len(self.values[param])
+        accumulated_mean = np.rollaxis(self.mean(), index, 0)
+        accumulated_mean = accumulated_mean.reshape(n_values, -1)
+        accumulated_std = np.rollaxis(self.std(), index, 0)
+        accumulated_std = accumulated_std.reshape(n_values, -1)
+        if kind == "mean":
+            accumulated_mean = np.mean(accumulated_mean, axis=-1)
+            accumulated_std = np.mean(accumulated_std, axis=-1)
+        elif kind == "max":
+            max_inds = np.argmax(accumulated_mean, axis=-1)
+            inds = np.indices(max_inds.shape)
+            accumulated_mean = accumulated_mean[inds, max_inds].ravel()
+            accumulated_std = accumulated_std[inds, max_inds].ravel()
+        else:
+            raise ValueError("kind must be 'mean' or 'max', got %s." %
+                             str(kind))
+        return accumulated_mean, accumulated_std
+
+
 class IterGrid(object):
     """Generators on the combination of the various parameter lists given
 
@@ -97,7 +177,6 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func,
     else:
         X_train = X[safe_mask(X, train)]
         X_test = X[safe_mask(X, test)]
-
     if y is not None:
         y_test = y[safe_mask(y, test)]
        y_train = y[safe_mask(y, train)]
@@ -150,8 +229,8 @@ def _check_param_grid(param_grid):
             raise ValueError("Parameter values should be a list.")
 
         if len(v) == 0:
-            raise ValueError("Parameter values should be a non-empty "
-                             "list.")
+            raise ValueError("Parameter values should be "
+                             "a non-empty list.")
 
 
 def _has_one_grid_point(param_grid):
@@ -268,6 +347,10 @@ class GridSearchCV(BaseEstimator, MetaEstimatorMixin):
     `best_params_` : dict
         Parameter setting that gave the best results on the hold out data.
 
+    `scores_` : list of ResultGrid
+        For each dict in ``param_grid`` this holds a ``ResultGrid`` for easy
+        analysis of the scores (a single dict yields a single ``ResultGrid``).
+
     Notes
     ------
     The parameters selected are those that maximize the score of the left out
@@ -435,9 +518,38 @@ def _fit(self, X, y):
         self._best_estimator_ = best_estimator
         self._set_methods()
 
-        # Store the computed scores
-        # XXX: the name is too specific, it shouldn't have
-        # 'grid' in it. Also, we should be retrieving/storing variance
+        # param_grid can be a dict or a list of dicts;
+        # wrap a single dict in a list for unified treatment
+        if hasattr(self.param_grid, 'items'):
+            # wrap dictionary in a singleton list
+            param_grid = [self.param_grid]
+        else:
+            param_grid = self.param_grid
+        # for each entry in the param_grid list, we build
+        # an array of scores. The entries can contribute different
+        # numbers of grid points, so we keep track of the offset
+        # into cv_scores by hand.
+        start = 0
+        self.scores_ = []
+        for one_grid in param_grid:
+            sorted_params = sorted(one_grid.keys())
+            # get the number of values for each parameter
+            grid_shape = [len(one_grid[k]) for k in sorted_params]
+            n_entries = np.prod(grid_shape)
+            grid_shape.append(n_folds)
+            # get scores
+            score_array = np.array(cv_scores[start:start + n_entries])
+            # reshape to fit the sequence of values
+            score_array = score_array.reshape(grid_shape)
+            self.scores_.append(ResultGrid(sorted_params, one_grid,
+                                           score_array))
+            start += n_entries
+
+        # often the list is just one grid. Make access easier
+        if len(self.scores_) == 1:
+            self.scores_ = self.scores_[0]
+
+        # old interface
         self.grid_scores_ = [
             (clf_params, score, all_scores)
             for clf_params, (score, _), all_scores
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py
index 37a45142bc54d..d6f8812b3bc61 100644
--- a/sklearn/tests/test_grid_search.py
+++ b/sklearn/tests/test_grid_search.py
@@ -12,6 +12,7 @@
 from sklearn.grid_search import GridSearchCV
 from sklearn.datasets.samples_generator import make_classification
 from sklearn.svm import LinearSVC, SVC
+from sklearn.tree import DecisionTreeClassifier
 from sklearn.metrics import f1_score, precision_score
 from sklearn.cross_validation import KFold
 
@@ -49,7 +50,7 @@ def test_grid_search():
     assert_equal(grid_search.best_estimator_.foo_param, 2)
 
     for i, foo_i in enumerate([1, 2, 3]):
-        assert_true(grid_search.grid_scores_[i][0] == {'foo_param': foo_i})
+        assert_equal(grid_search.grid_scores_[i][0], {'foo_param': foo_i})
     # Smoke test the score:
     grid_search.score(X, y)
 
@@ -225,3 +226,33 @@ def test_X_as_list():
     cv = KFold(n=len(X), k=3)
     grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
     grid_search.fit(X.tolist(), y).score(X, y)
+
+
+def test_result_grid():
+    # make a small grid search and test the ResultGrid on it
+    clf = DecisionTreeClassifier()
+    X, y = make_classification()
+    param_grid = {'max_depth': np.arange(1, 5),
+                  'max_features': np.arange(1, 3)}
+    grid_search = GridSearchCV(clf, param_grid=param_grid)
+    grid_search.fit(X, y)
+    result = grid_search.scores_
+    assert_equal(result.mean().shape, (4, 2))
+    assert_equal(result.std().shape, (4, 2))
+    assert_equal(result.scores.shape, (4, 2, 3))
+    means, errs = result.accumulate('max_depth')
+    assert_equal(len(means), 4)
+    assert_equal(len(errs), 4)
+    assert_equal(len(result.values['max_depth']), 4)
+
+
+def test_list():
+    # test that grid search can handle a list of dicts as param_grid
+    # smoke test!
+    clf = DecisionTreeClassifier()
+    X, y = make_classification()
+    param_grid = [{'max_depth': np.arange(1, 5)},
+                  {'max_features': np.arange(1, 3)}]
+    grid_search = GridSearchCV(clf, param_grid=param_grid)
+    grid_search.fit(X, y)
+    assert_equal(len(grid_search.scores_), 2)
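
For reviewers, a minimal usage sketch of the ``scores_`` attribute introduced by this patch. It is illustrative only: the estimator, dataset and grid values below are arbitrary choices, not part of the change; only the ``ResultGrid`` API (``params``, ``values``, ``scores``, ``mean()``, ``std()``, ``accumulate()``) comes from the patch.

    from sklearn.datasets import make_classification
    from sklearn.grid_search import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_samples=200, random_state=0)
    param_grid = {'max_depth': [1, 2, 3, 4], 'max_features': [1, 3, 5]}
    grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid,
                               cv=5)
    grid_search.fit(X, y)

    # param_grid is a single dict, so scores_ is a single ResultGrid
    cv_scores = grid_search.scores_
    print cv_scores.params        # ['max_depth', 'max_features'] (sorted keys)
    print cv_scores.scores.shape  # (4, 3, 5): one axis per parameter, last axis folds
    print cv_scores.mean().shape  # (4, 3): mean score over folds per grid point

    # best score over all settings of the other parameters, for each value
    # of max_depth, together with the matching standard deviations over folds
    means, errors = cv_scores.accumulate('max_depth', kind='max')
    print means.shape             # (4,)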