From e2ab19b2c74264b0c4b0a866756416b68e9f4eac Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 17 Aug 2012 21:03:19 +0100 Subject: [PATCH 01/10] ENH added ResultGrid class, new interface to GridSearchCV results --- sklearn/grid_search.py | 72 +++++++++++++++++++++++++++++-- sklearn/tests/test_grid_search.py | 31 ++++++++++++- 2 files changed, 99 insertions(+), 4 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index eccf0ac471e2e..9d564f357dbca 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -19,6 +19,41 @@ from .utils import check_arrays, safe_mask +class ResultGrid(object): + def __init__(self, params, values, scores): + self.scores = scores + self.params = params + self.values = values + + def mean(self): + return np.mean(self.scores, axis=-1) + + def std(self): + return np.std(self.scores, axis=-1) + + def accumulated_mean(self, param, kind="mean"): + return self._accumulate(self.mean(), param, kind) + + def accumulated_std(self, param, kind="mean"): + return self._accumulate(self.std(), param, kind) + + def _accumulate(self, X, param, kind): + if kind == "mean": + acc_func = np.mean + elif kind == "max": + acc_func = np.max + else: + raise ValueError("kind must be 'mean' or 'max', got %s." % + str(kind)) + index = self.params.index(param) + accumulated = self.scores + for i in xrange(index + 1, self.scores.ndim): + accumulated = acc_func(accumulated, axis=-1) + for i in xrange(0, index): + accumulated = acc_func(accumulated, axis=0) + return accumulated + + class IterGrid(object): """Generators on the combination of the various parameter lists given @@ -435,9 +470,40 @@ def _fit(self, X, y): self._best_estimator_ = best_estimator self._set_methods() - # Store the computed scores - # XXX: the name is too specific, it shouldn't have - # 'grid' in it. Also, we should be retrieving/storing variance + # param grid can be a list + # make singleton to list for unified treatment + if hasattr(self.param_grid, 'items'): + # wrap dictionary in a singleton list + param_grid = [self.param_grid] + else: + param_grid = self.param_grid + # for each entry in the param_grid list, we build + # an array of scores. + # we don't know how long the parts are so we have + # to keep track of everything :-/ + start = 0 + self.scores_ = [] + for one_grid in param_grid: + sorted_params = sorted(one_grid.keys()) + # get the number of values for each parameter + grid_shape = [len(one_grid[k]) for k in sorted_params] + n_entries = np.prod(grid_shape) + grid_shape.append(n_folds) + # get scores + score_array = np.array(cv_scores[start:start + n_entries]) + # reshape to fit the sequence of values + score_array = score_array.reshape(grid_shape) + self.scores_.append(ResultGrid(sorted_params, one_grid, + score_array)) + start += n_entries + #from IPython.core.debugger import Tracer + #Tracer()() + + # often the list is just one grid. Make access easier + if len(self.scores_) is 1: + self.scores_ = self.scores_[0] + + # old interface self.grid_scores_ = [ (clf_params, score, all_scores) for clf_params, (score, _), all_scores diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index 37a45142bc54d..b3a30e3c4ec80 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -12,6 +12,7 @@ from sklearn.grid_search import GridSearchCV from sklearn.datasets.samples_generator import make_classification from sklearn.svm import LinearSVC, SVC +from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import f1_score, precision_score from sklearn.cross_validation import KFold @@ -49,7 +50,7 @@ def test_grid_search(): assert_equal(grid_search.best_estimator_.foo_param, 2) for i, foo_i in enumerate([1, 2, 3]): - assert_true(grid_search.grid_scores_[i][0] == {'foo_param': foo_i}) + assert_equal(grid_search.grid_scores_[i][0], {'foo_param': foo_i}) # Smoke test the score: grid_search.score(X, y) @@ -225,3 +226,31 @@ def test_X_as_list(): cv = KFold(n=len(X), k=3) grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv) grid_search.fit(X.tolist(), y).score(X, y) + + +def test_result_grid(): + # make small grid search and test ResultGrid on it + clf = DecisionTreeClassifier() + X, y = make_classification() + param_grid = {'max_depth': np.arange(1, 5), + 'max_features': np.arange(1, 3)} + grid_search = GridSearchCV(clf, param_grid=param_grid) + grid_search.fit(X, y) + result = grid_search.scores_ + assert_equal(result.mean().shape, (4, 2)) + assert_equal(result.std().shape, (4, 2)) + assert_equal(result.scores.shape, (4, 2, 3)) + assert_equal(len(result.accumulated_mean('max_depth')), 4) + assert_equal(len(result.values['max_depth']), 4) + + +def test_list(): + # test that grid search can handle list of dics as param_grid + # smoke test! + clf = DecisionTreeClassifier() + X, y = make_classification() + param_grid = [{'max_depth': np.arange(1, 5)}, + {'max_features': np.arange(1, 3)}] + grid_search = GridSearchCV(clf, param_grid=param_grid) + grid_search.fit(X, y) + assert_equal(len(grid_search.scores_), 2) From a2eae0a8acb20fa9fc13cc494c9b3cbb15ef4734 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 17 Aug 2012 21:18:11 +0100 Subject: [PATCH 02/10] DOC some docstrings --- sklearn/grid_search.py | 65 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 9d564f357dbca..d6d8ec3899cef 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -20,21 +20,86 @@ class ResultGrid(object): + """Provides easy access to grid search results. + + This object is constructed by GridSearchCV and + provides an easy interface to evaluate the grid search + results. + + Attributes + ---------- + params: list of string + Lists parameters adjusted during grid-search + This is an alphabetical sorting of the keys + of the ``param_grid`` used in the GridSearchCV. + values: dict + This contains the values of the parameters + that were used during grid search. + scores: ndarray + Contains all the scores of all runs. + Each axis corresponds to the setting of one + parameter, in the order given in params. + The last axis corresponds to the folds. + """ + def __init__(self, params, values, scores): self.scores = scores self.params = params self.values = values def mean(self): + """Returns mean scores over folds for the whole parameter grid.""" return np.mean(self.scores, axis=-1) def std(self): + """Returns standard deviation of scores over folds for the whole + parameter grid.""" return np.std(self.scores, axis=-1) def accumulated_mean(self, param, kind="mean"): + """Accumulates scores over all but one parameter. + + Useful for grid searches in many parameters, where + the whole grid can not easily be visualized. + + Parameters + ---------- + param: string + Name of the parameter not to accumulate over. + kind: string, 'mean' or 'max' + Operation that is used to accumulate over all parameters + except ``param``. + + Returns + ------- + scores: ndarray + 1d array of scores corresponding to the different settings + of ``param``. + """ + return self._accumulate(self.mean(), param, kind) def accumulated_std(self, param, kind="mean"): + """Accumulates standard deviations of scores over all but one + parameter. + + Useful for grid searches in many parameters, where + the whole grid can not easily be visualized. + + Parameters + ---------- + param: string + Name of the parameter not to accumulate over. + kind: string, 'mean' or 'max' + Operation that is used to accumulate over all parameters + except ``param``. + + Returns + ------- + scores: ndarray + 1d array of scores corresponding to the different settings + of ``param``. + """ return self._accumulate(self.std(), param, kind) def _accumulate(self, X, param, kind): From 65135935590c9dc35a3a6b50d3b60feca1a99dbe Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 17 Aug 2012 22:00:22 +0100 Subject: [PATCH 03/10] DOC started on example. --- examples/plot_grid_search.py | 46 +++++++++++++++++++++++++++++ examples/svm/plot_rbf_parameters.py | 7 +---- sklearn/grid_search.py | 5 +++- 3 files changed, 51 insertions(+), 7 deletions(-) create mode 100644 examples/plot_grid_search.py diff --git a/examples/plot_grid_search.py b/examples/plot_grid_search.py new file mode 100644 index 0000000000000..623b847b22162 --- /dev/null +++ b/examples/plot_grid_search.py @@ -0,0 +1,46 @@ +""" +===================================================== +Visualizing results of high dimensional grid searches +===================================================== + +Often one is faced with combining feature extraction, feature selection +and classification into a complex pipeline. +Each individual step usually has many tunable parameters. Finding the +important parameters for a given task and picking robust settings is often +hard. + +This example show how to visualize results of a grid search with +many interacting parameters. +The ``DecisionTreeClassifier`` is a good model for a complex pipeline as there +are many parameters to tweak, but only few have significant influence. +""" +print __doc__ + +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.datasets import load_digits +from sklearn.grid_search import GridSearchCV +from sklearn.tree import DecisionTreeClassifier + +iris = load_digits() +X, y = iris.data, iris.target + +param_grid = {'max_depth': np.arange(1, 10, 2), 'min_samples_leaf': [1, 5, 10], + 'min_samples_split': [1, 5, 10], + 'max_features': [1, 10, 30, 40, 64]} + +grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, + cv=3) +grid_search.fit(X, y) + +results = grid_search.scores_ + +fig, axes = plt.subplots(2, 2) +axes = axes.ravel() + +for ax, param in zip(axes, results.params): + ax.errorbar(results.values[param], results.accumulated_mean(param, 'max'), + yerr=results.accumulated_std(param, 'max')) + ax.set_title(param) +plt.show() diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py index 465098199be2c..030241c0abc02 100644 --- a/examples/svm/plot_rbf_parameters.py +++ b/examples/svm/plot_rbf_parameters.py @@ -105,12 +105,7 @@ pl.axis('tight') # plot the scores of the grid -# grid_scores_ contains parameter settings and scores -score_dict = grid.grid_scores_ - -# We extract just the scores -scores = [x[1] for x in score_dict] -scores = np.array(scores).reshape(len(C_range), len(gamma_range)) +scores = grid.scores_.mean() # draw heatmap of accuracy as a function of gamma and C pl.figure(figsize=(8, 6)) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index d6d8ec3899cef..66c0eace2d0c8 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -76,7 +76,6 @@ def accumulated_mean(self, param, kind="mean"): 1d array of scores corresponding to the different settings of ``param``. """ - return self._accumulate(self.mean(), param, kind) def accumulated_std(self, param, kind="mean"): @@ -368,6 +367,10 @@ class GridSearchCV(BaseEstimator, MetaEstimatorMixin): `best_params_` : dict Parameter setting that gave the best results on the hold out data. + `scores_`: list of ResultGrid + For each dict in ``param_grid`` this holds a ``ResultGrid`` that + provides easy analysis of the grid search scores. + Notes ------ The parameters selected are those that maximize the score of the left out From b35bca57bba5ea0741b4d7c18d67352304d8d680 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 18 Aug 2012 16:08:37 +0100 Subject: [PATCH 04/10] Slight change in interface, Updated examples --- examples/plot_grid_search.py | 19 +++++------- examples/svm/plot_rbf_parameters.py | 10 ++++++- sklearn/grid_search.py | 45 +++++++---------------------- sklearn/tests/test_grid_search.py | 4 ++- 4 files changed, 31 insertions(+), 47 deletions(-) diff --git a/examples/plot_grid_search.py b/examples/plot_grid_search.py index 623b847b22162..bc8ae450ecb7a 100644 --- a/examples/plot_grid_search.py +++ b/examples/plot_grid_search.py @@ -16,31 +16,28 @@ """ print __doc__ -import numpy as np import matplotlib.pyplot as plt -from sklearn.datasets import load_digits +from sklearn.datasets import make_classification from sklearn.grid_search import GridSearchCV from sklearn.tree import DecisionTreeClassifier -iris = load_digits() -X, y = iris.data, iris.target +X, y = make_classification(n_samples=100, n_features=10) -param_grid = {'max_depth': np.arange(1, 10, 2), 'min_samples_leaf': [1, 5, 10], - 'min_samples_split': [1, 5, 10], - 'max_features': [1, 10, 30, 40, 64]} +param_grid = {'max_depth': range(1, 8), 'min_samples_leaf': [1, 2, 3, 4, 5], + 'max_features': [1, 3, 5, 8, 10]} grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, - cv=3) + cv=5) grid_search.fit(X, y) results = grid_search.scores_ -fig, axes = plt.subplots(2, 2) +fig, axes = plt.subplots(1, 3) axes = axes.ravel() for ax, param in zip(axes, results.params): - ax.errorbar(results.values[param], results.accumulated_mean(param, 'max'), - yerr=results.accumulated_std(param, 'max')) + means, errors = results.accumulated(param, 'mean') + ax.errorbar(results.values[param], means, yerr=errors) ax.set_title(param) plt.show() diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py index 030241c0abc02..0fe33461df983 100644 --- a/examples/svm/plot_rbf_parameters.py +++ b/examples/svm/plot_rbf_parameters.py @@ -105,7 +105,8 @@ pl.axis('tight') # plot the scores of the grid -scores = grid.scores_.mean() +results = grid.scores_ +scores = results.mean() # draw heatmap of accuracy as a function of gamma and C pl.figure(figsize=(8, 6)) @@ -117,4 +118,11 @@ pl.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45) pl.yticks(np.arange(len(C_range)), C_range) +fig, axes = pl.subplots(2, 1) +for ax, param in zip(axes, results.params): + means, errors = results.accumulated(param, 'mean') + ax.errorbar(np.arange(len(results.values[param])), means, + yerr=errors) + ax.set_title(param) + pl.show() diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 66c0eace2d0c8..f507c1ec1a08e 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -56,7 +56,7 @@ def std(self): parameter grid.""" return np.std(self.scores, axis=-1) - def accumulated_mean(self, param, kind="mean"): + def accumulated(self, param, kind="mean"): """Accumulates scores over all but one parameter. Useful for grid searches in many parameters, where @@ -76,46 +76,23 @@ def accumulated_mean(self, param, kind="mean"): 1d array of scores corresponding to the different settings of ``param``. """ - return self._accumulate(self.mean(), param, kind) - - def accumulated_std(self, param, kind="mean"): - """Accumulates standard deviations of scores over all but one - parameter. - - Useful for grid searches in many parameters, where - the whole grid can not easily be visualized. - - Parameters - ---------- - param: string - Name of the parameter not to accumulate over. - kind: string, 'mean' or 'max' - Operation that is used to accumulate over all parameters - except ``param``. - - Returns - ------- - scores: ndarray - 1d array of scores corresponding to the different settings - of ``param``. - """ - return self._accumulate(self.std(), param, kind) - - def _accumulate(self, X, param, kind): if kind == "mean": - acc_func = np.mean + pass elif kind == "max": - acc_func = np.max + raise NotImplementedError() else: raise ValueError("kind must be 'mean' or 'max', got %s." % str(kind)) index = self.params.index(param) - accumulated = self.scores - for i in xrange(index + 1, self.scores.ndim): - accumulated = acc_func(accumulated, axis=-1) + accumulated_mean = self.mean() + accumulated_std = self.std() + for i in xrange(index + 1, self.scores.ndim - 1): + accumulated_mean = np.mean(accumulated_mean, axis=-1) + accumulated_std = np.mean(accumulated_std, axis=-1) for i in xrange(0, index): - accumulated = acc_func(accumulated, axis=0) - return accumulated + accumulated_mean = np.mean(accumulated_mean, axis=0) + accumulated_std = np.mean(accumulated_std, axis=0) + return accumulated_mean, accumulated_std class IterGrid(object): diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index b3a30e3c4ec80..d6f8812b3bc61 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -240,7 +240,9 @@ def test_result_grid(): assert_equal(result.mean().shape, (4, 2)) assert_equal(result.std().shape, (4, 2)) assert_equal(result.scores.shape, (4, 2, 3)) - assert_equal(len(result.accumulated_mean('max_depth')), 4) + means, errs = result.accumulated('max_depth') + assert_equal(len(means), 4) + assert_equal(len(errs), 4) assert_equal(len(result.values['max_depth']), 4) From 0d8ad7c0fe36153531115dda111b9b9e42177562 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 18 Aug 2012 17:36:09 +0100 Subject: [PATCH 05/10] trying to get "max" to work, but it is not so easy --- examples/plot_grid_search.py | 6 +++--- sklearn/grid_search.py | 27 +++++++++++++++------------ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/examples/plot_grid_search.py b/examples/plot_grid_search.py index bc8ae450ecb7a..79011dd4c3d2d 100644 --- a/examples/plot_grid_search.py +++ b/examples/plot_grid_search.py @@ -12,7 +12,7 @@ This example show how to visualize results of a grid search with many interacting parameters. The ``DecisionTreeClassifier`` is a good model for a complex pipeline as there -are many parameters to tweak, but only few have significant influence. +are many parameters to tweak, but often only few have significant influence. """ print __doc__ @@ -24,7 +24,7 @@ X, y = make_classification(n_samples=100, n_features=10) -param_grid = {'max_depth': range(1, 8), 'min_samples_leaf': [1, 2, 3, 4, 5], +param_grid = {'max_depth': range(1, 8), 'min_samples_leaf': [1, 2, 3, 4], 'max_features': [1, 3, 5, 8, 10]} grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, @@ -37,7 +37,7 @@ axes = axes.ravel() for ax, param in zip(axes, results.params): - means, errors = results.accumulated(param, 'mean') + means, errors = results.accumulated(param, 'max') ax.errorbar(results.values[param], means, yerr=errors) ax.set_title(param) plt.show() diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index f507c1ec1a08e..634ceabb416ba 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -76,23 +76,26 @@ def accumulated(self, param, kind="mean"): 1d array of scores corresponding to the different settings of ``param``. """ + index = self.params.index(param) + # make interesting axis the first + accumulated_mean = np.rollaxis(self.mean(), index, 0) + accumulated_std = np.rollaxis(self.std(), index, 0) if kind == "mean": - pass + for i in xrange(1, self.scores.ndim - 1): + accumulated_mean = np.mean(accumulated_mean, axis=-1) + accumulated_std = np.mean(accumulated_std, axis=-1) elif kind == "max": - raise NotImplementedError() + for i in xrange(1, self.scores.ndim - 1): + max_inds = np.argmax(accumulated_mean, axis=-1) + inds = np.indices(max_inds.shape) + inds = np.vstack([inds, max_inds[np.newaxis]]) + from IPython.core.debugger import Tracer + Tracer()() + accumulated_mean = accumulated_mean[gx, gy, max_inds] + accumulated_std = accumulated_std[gx, gy, max_inds] else: raise ValueError("kind must be 'mean' or 'max', got %s." % str(kind)) - index = self.params.index(param) - accumulated_mean = self.mean() - accumulated_std = self.std() - for i in xrange(index + 1, self.scores.ndim - 1): - accumulated_mean = np.mean(accumulated_mean, axis=-1) - accumulated_std = np.mean(accumulated_std, axis=-1) - for i in xrange(0, index): - accumulated_mean = np.mean(accumulated_mean, axis=0) - accumulated_std = np.mean(accumulated_std, axis=0) - return accumulated_mean, accumulated_std class IterGrid(object): From 45b296904a2daf9a246d070468fd57061bd700eb Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 18 Aug 2012 18:55:25 +0100 Subject: [PATCH 06/10] FIX for max in grid scoring, simplified --- examples/plot_grid_search.py | 5 ++--- examples/svm/plot_rbf_parameters.py | 4 ++-- sklearn/grid_search.py | 25 ++++++++++++------------- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/examples/plot_grid_search.py b/examples/plot_grid_search.py index 79011dd4c3d2d..013d4e5352b6c 100644 --- a/examples/plot_grid_search.py +++ b/examples/plot_grid_search.py @@ -24,18 +24,17 @@ X, y = make_classification(n_samples=100, n_features=10) -param_grid = {'max_depth': range(1, 8), 'min_samples_leaf': [1, 2, 3, 4], +param_grid = {'max_depth': range(1, 8), 'min_samples_split': [1, 2, 3, 4], 'max_features': [1, 3, 5, 8, 10]} grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, - cv=5) + cv=5) grid_search.fit(X, y) results = grid_search.scores_ fig, axes = plt.subplots(1, 3) axes = axes.ravel() - for ax, param in zip(axes, results.params): means, errors = results.accumulated(param, 'max') ax.errorbar(results.values[param], means, yerr=errors) diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py index 0fe33461df983..bd6f66ca67d11 100644 --- a/examples/svm/plot_rbf_parameters.py +++ b/examples/svm/plot_rbf_parameters.py @@ -120,8 +120,8 @@ fig, axes = pl.subplots(2, 1) for ax, param in zip(axes, results.params): - means, errors = results.accumulated(param, 'mean') - ax.errorbar(np.arange(len(results.values[param])), means, + maxs, errors = results.accumulated(param, 'max') + ax.errorbar(np.arange(len(results.values[param])), maxs, yerr=errors) ax.set_title(param) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 634ceabb416ba..448a9531b205b 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -56,7 +56,7 @@ def std(self): parameter grid.""" return np.std(self.scores, axis=-1) - def accumulated(self, param, kind="mean"): + def accumulated(self, param, kind="max"): """Accumulates scores over all but one parameter. Useful for grid searches in many parameters, where @@ -78,24 +78,23 @@ def accumulated(self, param, kind="mean"): """ index = self.params.index(param) # make interesting axis the first + n_values = len(self.values[param]) accumulated_mean = np.rollaxis(self.mean(), index, 0) + accumulated_mean = accumulated_mean.reshape(n_values, -1) accumulated_std = np.rollaxis(self.std(), index, 0) + accumulated_std = accumulated_std.reshape(n_values, -1) if kind == "mean": - for i in xrange(1, self.scores.ndim - 1): - accumulated_mean = np.mean(accumulated_mean, axis=-1) - accumulated_std = np.mean(accumulated_std, axis=-1) + accumulated_mean = np.mean(accumulated_mean, axis=-1) + accumulated_std = np.mean(accumulated_std, axis=-1) elif kind == "max": - for i in xrange(1, self.scores.ndim - 1): - max_inds = np.argmax(accumulated_mean, axis=-1) - inds = np.indices(max_inds.shape) - inds = np.vstack([inds, max_inds[np.newaxis]]) - from IPython.core.debugger import Tracer - Tracer()() - accumulated_mean = accumulated_mean[gx, gy, max_inds] - accumulated_std = accumulated_std[gx, gy, max_inds] + max_inds = np.argmax(accumulated_mean, axis=-1) + inds = np.indices(max_inds.shape) + accumulated_mean = accumulated_mean[inds, max_inds].ravel() + accumulated_std = accumulated_std[inds, max_inds].ravel() else: - raise ValueError("kind must be 'mean' or 'max', got %s." % + raise ValueError("kind must be 'mean' or 'all', got %s." % str(kind)) + return accumulated_mean, accumulated_std class IterGrid(object): From 3e84b08f5a81fb112d4e590cfa4a7fb248001ab7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 18 Aug 2012 19:05:36 +0100 Subject: [PATCH 07/10] DOC fixed docstring --- sklearn/grid_search.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 448a9531b205b..de61e5ec6e6cc 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -75,6 +75,8 @@ def accumulated(self, param, kind="max"): scores: ndarray 1d array of scores corresponding to the different settings of ``param``. + errors: ndarray + 1d array of standard deviations of scores. """ index = self.params.index(param) # make interesting axis the first From 43bed9e089cc35673b4098dd72cd3702042fe4c5 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 18 Aug 2012 19:06:30 +0100 Subject: [PATCH 08/10] FIX removed debugging code --- sklearn/grid_search.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index de61e5ec6e6cc..4edf383c554c1 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -545,8 +545,6 @@ def _fit(self, X, y): self.scores_.append(ResultGrid(sorted_params, one_grid, score_array)) start += n_entries - #from IPython.core.debugger import Tracer - #Tracer()() # often the list is just one grid. Make access easier if len(self.scores_) is 1: From aa9116614eee44b592d00b315ac918b409eca2a4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 1 Sep 2012 00:45:19 +0200 Subject: [PATCH 09/10] COSMIT addressed @agramfort's comments. --- examples/plot_grid_search.py | 16 +++++++++------- examples/svm/plot_rbf_parameters.py | 11 +++++------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/plot_grid_search.py b/examples/plot_grid_search.py index 013d4e5352b6c..981d38a6fadc0 100644 --- a/examples/plot_grid_search.py +++ b/examples/plot_grid_search.py @@ -16,7 +16,7 @@ """ print __doc__ -import matplotlib.pyplot as plt +import pylab as pl from sklearn.datasets import make_classification from sklearn.grid_search import GridSearchCV @@ -31,12 +31,14 @@ cv=5) grid_search.fit(X, y) -results = grid_search.scores_ +cv_scores = grid_search.scores_ -fig, axes = plt.subplots(1, 3) +fig, axes = pl.subplots(1, 3) axes = axes.ravel() -for ax, param in zip(axes, results.params): - means, errors = results.accumulated(param, 'max') - ax.errorbar(results.values[param], means, yerr=errors) +for ax, param in zip(axes, cv_scores.params): + means, errors = cv_scores.accumulated(param, 'max') + ax.errorbar(cv_scores.values[param], means, yerr=errors) ax.set_title(param) -plt.show() +fig.set_size_inches((12, 4), forward=True) +pl.subplots_adjust(left=0.05, right=0.95) +pl.show() diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py index bd6f66ca67d11..dc13a4dbda979 100644 --- a/examples/svm/plot_rbf_parameters.py +++ b/examples/svm/plot_rbf_parameters.py @@ -105,13 +105,12 @@ pl.axis('tight') # plot the scores of the grid -results = grid.scores_ -scores = results.mean() +cv_scores = grid.scores_ # draw heatmap of accuracy as a function of gamma and C pl.figure(figsize=(8, 6)) pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95) -pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral) +pl.imshow(cv_scores.mean(), interpolation='nearest', cmap=pl.cm.spectral) pl.xlabel('gamma') pl.ylabel('C') pl.colorbar() @@ -119,9 +118,9 @@ pl.yticks(np.arange(len(C_range)), C_range) fig, axes = pl.subplots(2, 1) -for ax, param in zip(axes, results.params): - maxs, errors = results.accumulated(param, 'max') - ax.errorbar(np.arange(len(results.values[param])), maxs, +for ax, param in zip(axes, cv_scores.params): + maxs, errors = cv_scores.accumulated(param, 'max') + ax.errorbar(np.arange(len(cv_scores.values[param])), maxs, yerr=errors) ax.set_title(param) From 3b14a67c6a306a04d9daa70d8fd8387ce09bcd06 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 2 Sep 2012 12:40:01 +0200 Subject: [PATCH 10/10] COSMIT rename accumulated to accumulate, some adjustments in figures. --- examples/plot_grid_search.py | 12 +++++++----- examples/svm/plot_rbf_parameters.py | 5 +++-- sklearn/grid_search.py | 9 ++++----- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/examples/plot_grid_search.py b/examples/plot_grid_search.py index 981d38a6fadc0..dff81b743c6d8 100644 --- a/examples/plot_grid_search.py +++ b/examples/plot_grid_search.py @@ -22,7 +22,7 @@ from sklearn.grid_search import GridSearchCV from sklearn.tree import DecisionTreeClassifier -X, y = make_classification(n_samples=100, n_features=10) +X, y = make_classification(n_samples=100, n_features=10, random_state=0) param_grid = {'max_depth': range(1, 8), 'min_samples_split': [1, 2, 3, 4], 'max_features': [1, 3, 5, 8, 10]} @@ -36,9 +36,11 @@ fig, axes = pl.subplots(1, 3) axes = axes.ravel() for ax, param in zip(axes, cv_scores.params): - means, errors = cv_scores.accumulated(param, 'max') - ax.errorbar(cv_scores.values[param], means, yerr=errors) - ax.set_title(param) + means, errors = cv_scores.accumulate(param, 'max') + ax.boxplot(cv_scores.values[param], means, yerr=errors) + ax.set_xlabel(param) + ax.set_ylabel("accuracy") + ax.set_ylim(0.6, 0.95) fig.set_size_inches((12, 4), forward=True) -pl.subplots_adjust(left=0.05, right=0.95) +pl.subplots_adjust(left=0.07, right=0.95, bottom=0.15, wspace=0.26) pl.show() diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py index dc13a4dbda979..5246f17017b0b 100644 --- a/examples/svm/plot_rbf_parameters.py +++ b/examples/svm/plot_rbf_parameters.py @@ -113,13 +113,14 @@ pl.imshow(cv_scores.mean(), interpolation='nearest', cmap=pl.cm.spectral) pl.xlabel('gamma') pl.ylabel('C') -pl.colorbar() +cb = pl.colorbar() +cb.set_label("Accuracy") pl.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45) pl.yticks(np.arange(len(C_range)), C_range) fig, axes = pl.subplots(2, 1) for ax, param in zip(axes, cv_scores.params): - maxs, errors = cv_scores.accumulated(param, 'max') + maxs, errors = cv_scores.accumulate(param, 'max') ax.errorbar(np.arange(len(cv_scores.values[param])), maxs, yerr=errors) ax.set_title(param) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 4edf383c554c1..48e5c10c0f2af 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -56,7 +56,7 @@ def std(self): parameter grid.""" return np.std(self.scores, axis=-1) - def accumulated(self, param, kind="max"): + def accumulate(self, param, kind="max"): """Accumulates scores over all but one parameter. Useful for grid searches in many parameters, where @@ -177,7 +177,6 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func, else: X_train = X[safe_mask(X, train)] X_test = X[safe_mask(X, test)] - if y is not None: y_test = y[safe_mask(y, test)] y_train = y[safe_mask(y, train)] @@ -230,8 +229,8 @@ def _check_param_grid(param_grid): raise ValueError("Parameter values should be a list.") if len(v) == 0: - raise ValueError("Parameter values should be a non-empty " - "list.") + raise ValueError("Parameter values should be " + "a non-empty list.") def _has_one_grid_point(param_grid): @@ -547,7 +546,7 @@ def _fit(self, X, y): start += n_entries # often the list is just one grid. Make access easier - if len(self.scores_) is 1: + if len(self.scores_) == 1: self.scores_ = self.scores_[0] # old interface