From d884180243eef3e64fde1fa0b19190c77f88c501 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Mon, 11 Mar 2013 21:45:33 +0100
Subject: [PATCH 01/12] ENH add training score to GridSearchCV.cv_scores_

add docstrings for GridSearchCV, RandomizedSearchCV and fit_grid_point. In
"fit_grid_point" I used test_score rather than validation_score, as the split
is given to the function.

rbf svm grid search example now also shows training scores - which illustrates
overfitting for high C - and training/prediction times, which basically serve
to illustrate that this is possible. Maybe random forests would be better to
evaluate training times?
---
 .../statistical_inference/model_selection.rst |   2 +-
 examples/svm/plot_rbf_parameters.py           |  67 +++++--
 sklearn/grid_search.py                        | 165 ++++++++++++------
 sklearn/tests/test_grid_search.py             |  20 ++-
 4 files changed, 183 insertions(+), 71 deletions(-)

diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
index 65ae1c431466e..ec651f3f1dd09 100644
--- a/doc/tutorial/statistical_inference/model_selection.rst
+++ b/doc/tutorial/statistical_inference/model_selection.rst
@@ -144,7 +144,7 @@ estimator during the construction and exposes an estimator API::
     >>> clf = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas),
     ...                    n_jobs=-1)
     >>> clf.fit(X_digits[:1000], y_digits[:1000])        # doctest: +ELLIPSIS
-    GridSearchCV(cv=None,...
+    GridSearchCV(compute_training_score=False,...
     >>> clf.best_score_
     0.98899999999999999
     >>> clf.best_estimator_.gamma
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index f298ebf01205c..664c13bb5f6fc 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -14,10 +14,30 @@
 the decision surface smooth, while a high C aims at classifying all training
 examples correctly.
 
-Two plots are generated. The first is a visualization of the
-decision function for a variety of parameter values, and the second
-is a heatmap of the classifier's cross-validation accuracy as
-a function of `C` and `gamma`.
+Two plots are generated. The first is a visualization of the decision function
+for a variety of parameter values, and the second is a heatmap of the
+classifier's cross-validation accuracy and training time as a function of `C`
+and `gamma`.
+
+An interesting observation on overfitting can be made when comparing validation
+and training error: a higher C always results in a lower training error, as it
+increases the complexity of the classifier.
+
+For the validation set, on the other hand, there is a tradeoff between
+goodness of fit and generalization.
+
+We can observe that the lower right half of the parameters (below the diagonal
+with high C and gamma values) is characteristic of parameters that yield an
+overfitting model: the training score is very high but there is a wide gap
+between training and validation scores. The top and left parts of the
+parameter plots show underfitting models: the C and gamma values can
+individually or in conjunction constrain the model too much, leading to low
+training scores (hence low validation scores too, as validation scores are on
+average upper bounded by training scores).
+
+We can also see that the training time is quite sensitive to the parameter
+setting, while the prediction time is not impacted very much. This is probably
+a consequence of the small size of the data set.
''' print(__doc__) @@ -65,7 +85,8 @@ gamma_range = 10.0 ** np.arange(-5, 4) param_grid = dict(gamma=gamma_range, C=C_range) cv = StratifiedKFold(y=Y, n_folds=3) -grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv) +grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv, + compute_training_score=True) grid.fit(X, Y) print("The best classifier is: ", grid.best_estimator_) @@ -108,18 +129,28 @@ # cv_scores_ contains parameter settings and scores score_dict = grid.cv_scores_ -# We extract just the scores -scores = [x[1] for x in score_dict] -scores = np.array(scores).reshape(len(C_range), len(gamma_range)) - -# draw heatmap of accuracy as a function of gamma and C -pl.figure(figsize=(8, 6)) -pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95) -pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral) -pl.xlabel('gamma') -pl.ylabel('C') -pl.colorbar() -pl.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45) -pl.yticks(np.arange(len(C_range)), C_range) +# We extract validation and training scores, as well as training and prediction +# times +_, val_scores, _, train_scores, train_time, pred_time = zip(*score_dict) + +arrays = [val_scores, train_scores, train_time, pred_time] +titles = ["Validation Score", "Training Score", "Training Time", + "Prediction Time"] + +# for each value draw heatmap as a function of gamma and C +pl.figure(figsize=(12, 8)) +for i, (arr, title) in enumerate(zip(arrays, titles)): + pl.subplot(2, 2, i + 1) + arr = np.array(arr).reshape(len(C_range), len(gamma_range)) + pl.title(title) + pl.imshow(arr, interpolation='nearest', cmap=pl.cm.spectral) + pl.xlabel('gamma') + pl.ylabel('C') + pl.colorbar() + pl.xticks(np.arange(len(gamma_range)), ["%.e" % g for g in gamma_range], + rotation=45) + pl.yticks(np.arange(len(C_range)), ["%.e" % C for C in C_range]) + +pl.subplots_adjust(top=.95, hspace=.35, left=.0, right=.8, wspace=.05) pl.show() diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 171f63b53caf1..5b6e62c71184d 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -8,9 +8,9 @@ # Gael Varoquaux # License: BSD Style. -import time import warnings import numbers +from time import time from itertools import product from collections import namedtuple from abc import ABCMeta, abstractmethod @@ -170,8 +170,8 @@ def __iter__(self): yield params -def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, - verbose, loss_func=None, **fit_params): +def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, verbose, + loss_func=None, compute_training_score=False, **fit_params): """Run fit on one set of parameters. Parameters @@ -198,6 +198,9 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, If provided must be a scoring object / function with signature ``scorer(estimator, X, y)``. + compute_training_score : bool, default=False + Whether to compute the training loss. If False, None is returned. + verbose : int Verbosity level. @@ -207,8 +210,18 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, Returns ------- - score : float - Score of this parameter setting on given training / test split. + test_score : float + Test score of this parameter setting on given training / test split. + + training_score : float or None + Training score of this parameter setting or None if + ``compute_training_score=False`` (default). + + training_time : float + Training time for this parameter setting in seconds. 
+ + prediction_time : float + Prediction time for the given test set in seconds. estimator : estimator object Estimator object of type base_clf that was fitted using clf_params @@ -218,7 +231,7 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, Number of test samples in this split. """ if verbose > 1: - start_time = time.time() + start_time = time() msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in clf_params.items())) print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')) @@ -249,34 +262,49 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, X_train = X[safe_mask(X, train)] X_test = X[safe_mask(X, test)] + score_func = (clf.score if scorer is None + else lambda X_, y_: scorer(clf, X_, y_)) + if y is not None: y_test = y[safe_mask(y, test)] y_train = y[safe_mask(y, train)] + start = time() + # do actual fitting clf.fit(X_train, y_train, **fit_params) - - if scorer is not None: - this_score = scorer(clf, X_test, y_test) - else: - this_score = clf.score(X_test, y_test) + training_time = time() - start + start = time() + test_score = score_func(X_test, y_test) + predict_time = time() - start else: + start = time() + # do actual fitting clf.fit(X_train, **fit_params) - if scorer is not None: - this_score = scorer(clf, X_test) + training_time = time() - start + start = time() + test_score = score_func(X_test) + predict_time = time() - start + + if compute_training_score: + if y is not None: + training_score = score_func(X_train, y_train) else: - this_score = clf.score(X_test) + training_score = score_func(X_train) + else: + training_score = None - if not isinstance(this_score, numbers.Number): + if not isinstance(test_score, numbers.Number): raise ValueError("scoring must return a number, got %s (%s)" - " instead." % (str(this_score), type(this_score))) + " instead." 
% (str(test_score), type(test_score))) if verbose > 2: - msg += ", score=%f" % this_score + msg += ", score=%f" % test_score if verbose > 1: end_msg = "%s -%s" % (msg, - logger.short_format_time(time.time() - + logger.short_format_time(time() - start_time)) print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - return this_score, clf_params, _num_samples(X_test) + return (test_score, training_score, training_time, predict_time, + clf_params, _num_samples(X_test)) def _check_param_grid(param_grid): @@ -317,8 +345,10 @@ class BaseSearchCV(BaseEstimator, MetaEstimatorMixin): @abstractmethod def __init__(self, estimator, scoring=None, loss_func=None, score_func=None, fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): + refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', + compute_training_score=False): + self.compute_training_score = compute_training_score self.scoring = scoring self.estimator = estimator self.loss_func = loss_func @@ -425,33 +455,60 @@ def _fit(self, X, y, parameter_iterator, **params): pre_dispatch=pre_dispatch)( delayed(fit_grid_point)( X, y, base_clf, clf_params, train, test, scorer, - self.verbose, **self.fit_params) for clf_params in + self.verbose, + compute_training_score=self.compute_training_score, + **self.fit_params) for clf_params in parameter_iterator for train, test in cv) - + # type and list for storing results + CVScoreTuple = namedtuple('CVScoreTuple', + ('parameters', 'mean_validation_score', + 'cv_validation_scores', + 'mean_training_score', 'training_time', + 'prediction_time')) + cv_scores = [] # Out is a list of triplet: score, estimator, n_test_samples n_param_points = len(list(parameter_iterator)) n_fits = len(out) n_folds = n_fits // n_param_points - scores = list() - cv_scores = list() for start in range(0, n_fits, n_folds): n_test_samples = 0 - mean_validation_score = 0 - these_points = list() - for this_score, clf_params, this_n_test_samples in \ - out[start:start + n_folds]: - these_points.append(this_score) + mean_validation_score, mean_training_score = 0, 0 + # lists for accumulating statistics over fold + test_points, training_times, prediction_times = [], [], [] + for (test_score, training_score, training_time, prediction_time, + clf_params, this_n_test_samples) in out[start:start + + n_folds]: + test_points.append(test_score) + training_times.append(training_time) + prediction_times.append(prediction_time) if self.iid: - this_score *= this_n_test_samples - mean_validation_score += this_score + test_score *= this_n_test_samples + # assumes n_train + n_test = len(X) + mean_validation_score += test_score + + if self.compute_training_score: + if self.iid: + training_score *= n_samples - this_n_test_samples + mean_training_score += training_score + n_test_samples += this_n_test_samples + if self.iid: mean_validation_score /= float(n_test_samples) - scores.append((mean_validation_score, clf_params)) - cv_scores.append(these_points) - cv_scores = np.asarray(cv_scores) + if self.compute_training_score: + if self.iid: + # again, we assume n_train + n_test = len(X) + mean_training_score /= (n_folds * n_samples + - float(n_test_samples)) + else: + mean_training_score = None + + cv_scores.append(CVScoreTuple( + clf_params, mean_validation_score, + test_points, mean_training_score, + np.mean(training_times), np.mean(prediction_times))) # Note: we do not use max(out) to make ties deterministic even if # comparison on estimator instances is not deterministic @@ -465,14 +522,17 @@ def 
_fit(self, X, y, parameter_iterator, **params): else: best_score = np.inf - for score, params in scores: + for point in cv_scores: + score = point.mean_validation_score if ((score > best_score and greater_is_better) - or (score < best_score and not greater_is_better)): + or (score < best_score + and not greater_is_better)): best_score = score - best_params = params + best_params = point.parameters self.best_params_ = best_params self.best_score_ = best_score + self.cv_scores_ = cv_scores if self.refit: # fit the best estimator using the entire dataset @@ -485,14 +545,6 @@ def _fit(self, X, y, parameter_iterator, **params): self.best_estimator_ = best_estimator self._set_methods() - # Store the computed scores - CVScoreTuple = namedtuple('CVScoreTuple', ('parameters', - 'mean_validation_score', - 'cv_validation_scores')) - self.cv_scores_ = [ - CVScoreTuple(clf_params, score, all_scores) - for clf_params, (score, _), all_scores - in zip(parameter_iterator, scores, cv_scores)] return self @@ -572,7 +624,7 @@ class GridSearchCV(BaseSearchCV): >>> clf = grid_search.GridSearchCV(svr, parameters) >>> clf.fit(iris.data, iris.target) ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - GridSearchCV(cv=None, + GridSearchCV(compute_training_score=False, cv=None, estimator=SVC(C=1.0, cache_size=..., coef0=..., degree=..., gamma=..., kernel='rbf', max_iter=-1, probability=False, shrinking=True, tol=...), @@ -591,6 +643,12 @@ class GridSearchCV(BaseSearchCV): * ``mean_validation_score``, the mean score over the cross-validation folds * ``cv_validation_scores``, the list of scores for each fold + * ``mean_training_score``, the mean of the training score + over cross-validation folds. Only available if + ``compute_training_score=True``. + * ``training_time``, the mean training time in seconds. + * ``prediction_time``, the mean prediction time over the test set + in seconds. `best_estimator_` : estimator Estimator that was choosen by grid search, i.e. estimator @@ -630,10 +688,11 @@ class GridSearchCV(BaseSearchCV): def __init__(self, estimator, param_grid, scoring=None, loss_func=None, score_func=None, fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): + refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', + compute_training_score=False): super(GridSearchCV, self).__init__( estimator, scoring, loss_func, score_func, fit_params, n_jobs, iid, - refit, cv, verbose, pre_dispatch) + refit, cv, verbose, pre_dispatch, compute_training_score) self.param_grid = param_grid _check_param_grid(param_grid) @@ -766,6 +825,12 @@ class RandomizedSearchCV(BaseSearchCV): * ``mean_validation_score``, the mean score over the cross-validation folds * ``cv_validation_scores``, the list of scores for each fold + * ``mean_training_score``, the mean of the training score + over cross-validation folds. Only available if + ``compute_training_score=True``. + * ``training_time``, the mean training time in seconds. + * ``prediction_time``, the mean prediction time over the test set + in seconds. `best_estimator_` : estimator Estimator that was choosen by search, i.e. 
estimator @@ -807,13 +872,13 @@ class RandomizedSearchCV(BaseSearchCV): def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, loss_func=None, score_func=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, - pre_dispatch='2*n_jobs'): + pre_dispatch='2*n_jobs', compute_training_score=False): self.param_distributions = param_distributions self.n_iter = n_iter super(RandomizedSearchCV, self).__init__( estimator, scoring, loss_func, score_func, fit_params, n_jobs, iid, - refit, cv, verbose, pre_dispatch) + refit, cv, verbose, pre_dispatch, compute_training_score) def fit(self, X, y=None, **params): """Run fit on the estimator with randomly drawn parameters. diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index ce3252022558b..a0951027bfa60 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -12,6 +12,7 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_almost_equal @@ -281,6 +282,21 @@ def test_grid_search_precomputed_kernel_error_kernel_function(): assert_raises(ValueError, cv.fit, X_, y_) +def test_grid_search_training_score(): + # test that the training score contains sensible numbers + X, y = make_classification(n_samples=200, n_features=100, random_state=0) + clf = LinearSVC(random_state=0) + cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, compute_training_score=True) + cv.fit(X, y) + for grid_point in cv.cv_scores_: + assert_greater(grid_point.mean_training_score, + grid_point.mean_validation_score) + # hacky greater-equal + assert_greater(1 + 1e-10, grid_point.mean_training_score) + assert_greater(grid_point.training_time, 0) + assert_greater(grid_point.prediction_time, 0) + + class BrokenClassifier(BaseEstimator): """Broken classifier that cannot be fit twice""" @@ -378,9 +394,9 @@ def test_grid_search_score_consistency(): grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score) grid_search.fit(X, y) cv = StratifiedKFold(n_folds=3, y=y) - for C, scores in zip(Cs, grid_search.cv_scores_): + for C, result in zip(Cs, grid_search.cv_scores_): clf.set_params(C=C) - scores = scores[2] # get the separate runs from grid scores + scores = result[2] # get the separate runs from grid scores i = 0 for train, test in cv: clf.fit(X[train], y[train]) From c9d45a3444e6474901dca13eaeaef83b708bd969 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 12 Mar 2013 17:00:24 +1100 Subject: [PATCH 02/12] ENH Enhanced results from cross-validation via Scorer.store Currently, no tests have been added, and backwards compatibility is eschewed --- sklearn/grid_search.py | 73 ++++++++++++++++--------------- sklearn/metrics/scorer.py | 61 +++++++++++++++++++++++++- sklearn/tests/test_grid_search.py | 5 +-- 3 files changed, 98 insertions(+), 41 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index aa51e267ac32b..8a570bbb640e5 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -249,22 +249,32 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, X_train = X[safe_mask(X, train)] X_test = X[safe_mask(X, test)] + results = {'n_test_samples': _num_samples(X_test)} + if scorer is not None: + if hasattr(scorer, 'store'): + def store_score(*args): + scorer.store(results, clf, *args) + else: + def store_score(*args): + 
results[Scorer.SCORE_KEY] = scorer(*args) + else: + def store_score(*args): + results[Scorer.SCORE_KEY] = clf.score(*args) + if y is not None: y_test = y[safe_mask(y, test)] y_train = y[safe_mask(y, train)] clf.fit(X_train, y_train, **fit_params) - if scorer is not None: - this_score = scorer(clf, X_test, y_test) - else: - this_score = clf.score(X_test, y_test) + store_score(X_test, y_test) else: clf.fit(X_train, **fit_params) - if scorer is not None: - this_score = scorer(clf, X_test) - else: - this_score = clf.score(X_test) + store_score(X_test) + if Scorer.SCORE_KEY not in results: + raise ValueError("Scorer.store must set the key '%s' in results." + " Got %s instead." % (Scorer.SCORE_KEY, results)) + this_score = results[Scorer.SCORE_KEY] if not isinstance(this_score, numbers.Number): raise ValueError("scoring must return a number, got %s (%s)" " instead." % (str(this_score), type(this_score))) @@ -276,7 +286,7 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, logger.short_format_time(time.time() - start_time)) print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - return this_score, clf_params, _num_samples(X_test) + return clf_params, results def _check_param_grid(param_grid): @@ -433,25 +443,20 @@ def _fit(self, X, y, parameter_iterator, **params): n_fits = len(out) n_folds = n_fits // n_param_points - scores = list() - cv_scores = list() - for start in range(0, n_fits, n_folds): - n_test_samples = 0 - mean_validation_score = 0 - these_points = list() - for this_score, clf_params, this_n_test_samples in \ - out[start:start + n_folds]: - these_points.append(this_score) - if self.iid: - this_score *= this_n_test_samples - mean_validation_score += this_score - n_test_samples += this_n_test_samples - if self.iid: - mean_validation_score /= float(n_test_samples) - scores.append((mean_validation_score, clf_params)) - cv_scores.append(these_points) - - cv_scores = np.asarray(cv_scores) + results = [ + [fold_results for clf_params, fold_results in out[start:start + n_folds]] + for start in range(0, n_fits, n_folds) + ] + result_keys = list(results[0][0].iterkeys()) # assume keys are same throughout + merged_results = {key: np.array([[fold_results[key] for fold_results in point] for point in results]) + for key in result_keys} + + scores = merged_results[Scorer.SCORE_KEY] + if self.iid: + scores = scores * merged_results['n_test_samples'] + scores = scores.sum(axis=1) / merged_results['n_test_samples'].sum(axis=1) + else: + scores = scores.sum(axis=1) # Note: we do not use max(out) to make ties deterministic even if # comparison on estimator instances is not deterministic @@ -465,7 +470,7 @@ def _fit(self, X, y, parameter_iterator, **params): else: best_score = np.inf - for score, params in scores: + for score, params in zip(scores, parameter_iterator): if ((score > best_score and greater_is_better) or (score < best_score and not greater_is_better)): best_score = score @@ -486,13 +491,9 @@ def _fit(self, X, y, parameter_iterator, **params): self._set_methods() # Store the computed scores - CVScoreTuple = namedtuple('CVScoreTuple', ('parameters', - 'mean_validation_score', - 'cv_validation_scores')) - self.cv_scores_ = [ - CVScoreTuple(clf_params, score, all_scores) - for clf_params, (score, _), all_scores - in zip(parameter_iterator, scores, cv_scores)] + self.cv_params_ = list(parameter_iterator) + self.cv_scores_ = scores + self.cv_folds_ = merged_results return self diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 
0f01a0b8439d0..8e487495ebb71 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -20,7 +20,7 @@ from . import (r2_score, mean_squared_error, accuracy_score, f1_score, auc_score, average_precision_score, precision_score, - recall_score) + recall_score, precision_recall_fscore_support) from .cluster import adjusted_rand_score @@ -68,6 +68,8 @@ def __init__(self, score_func, greater_is_better=True, self.needs_threshold = needs_threshold self.kwargs = kwargs + SCORE_KEY = 'score' + def __repr__(self): kwargs_string = "".join([", %s=%s" % (str(k), str(v)) for k, v in self.kwargs.items()]) @@ -75,6 +77,33 @@ def __repr__(self): "%s%s)" % (self.score_func.__name__, self.greater_is_better, self.needs_threshold, kwargs_string)) + def store(self, result, estimator, X, y, prefix=''): + """Score X and y using the provided estimator and store it under the + key ``prefix`` + 'score' in result. + + Parameters + ---------- + result: dict-like + Where the result should be stored under key ``prefix`` + ``Scorer.SCORE_KEY``. + A custom ``Scorer`` may store other information under this prefix. + + estimator : object + Trained estimator to use for scoring. + If ``needs_threshold`` is True, estimator needs + to provide ``decision_function`` or ``predict_proba``. + Otherwise, estimator needs to provide ``predict``. + + X : array-like or sparse matrix + Test data that will be scored by the estimator. + + y : array-like + True prediction for X. + + prefix : string + The prefix of any keys to be stored in ``result``. + """ + result[prefix + self.SCORE_KEY] = self(estimator, X, y) + def __call__(self, estimator, X, y): """Score X and y using the provided estimator. @@ -111,6 +140,34 @@ def __call__(self, estimator, X, y): return self.score_func(y, y_pred, **self.kwargs) +class PRFScorer(Scorer): + """Scorer to optimise F score while also storing precision and recall. 
+ """ + + def __init__(self, **kwargs): + if 'average' not in kwargs: + kwargs['average'] = 'weighted' + super(PRFScorer, self).__init__(precision_recall_fscore_support, **kwargs) + + PRECISION_KEY = 'precision' + RECALL_KEY = 'recall' + + def __repr__(self): + kwargs_string = "".join([", %s=%s" % (str(k), str(v)) + for k, v in self.kwargs.items()]) + return 'PRFScorer(%s)' % kwargs_string + + def store(self, result, estimator, X, y, prefix=''): + p, r, f, support = super(PRFScorer, self).__call__(estimator, X, y) + result[prefix + self.SCORE_KEY] = f + result[prefix + self.PRECISION_KEY] = p + result[prefix + self.RECALL_KEY] = r + + def __call__(self, estimator, X, y): + p, r, f, support = super(PRFScorer, self).__call__(estimator, X, y) + return f + + # Standard regression scores r2_scorer = Scorer(r2_score) mse_scorer = Scorer(mean_squared_error, greater_is_better=False) @@ -130,7 +187,7 @@ def __call__(self, estimator, X, y): ari_scorer = Scorer(adjusted_rand_score) SCORERS = dict(r2=r2_scorer, mse=mse_scorer, accuracy=accuracy_scorer, - f1=f1_scorer, roc_auc=auc_scorer, + f1=f1_scorer, prf1=PRFScorer(), roc_auc=auc_scorer, average_precision=average_precision_scorer, precision=precision_scorer, recall=recall_scorer, ari=ari_scorer) diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index ce3252022558b..19a4fe85c2b61 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -89,7 +89,7 @@ def test_grid_search(): assert_equal(grid_search.best_estimator_.foo_param, 2) for i, foo_i in enumerate([1, 2, 3]): - assert_true(grid_search.cv_scores_[i][0] + assert_true(grid_search.cv_params_[i] == {'foo_param': foo_i}) # Smoke test the score: grid_search.score(X, y) @@ -378,9 +378,8 @@ def test_grid_search_score_consistency(): grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score) grid_search.fit(X, y) cv = StratifiedKFold(n_folds=3, y=y) - for C, scores in zip(Cs, grid_search.cv_scores_): + for C, scores in zip(Cs, grid_search.cv_folds_['score']): clf.set_params(C=C) - scores = scores[2] # get the separate runs from grid scores i = 0 for train, test in cv: clf.fit(X[train], y[train]) From 9faeb891a2d63e0e16b32495bd3cc876e62160e8 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Mar 2013 00:01:12 +1100 Subject: [PATCH 03/12] ENH Use structured array for BaseSearchCV results --- sklearn/grid_search.py | 34 ++++++++++++++++--------------- sklearn/tests/test_grid_search.py | 12 +++++------ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index de49b98fc3e94..10bfa4ff11396 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -427,9 +427,13 @@ def _merge_result_dicts(self, result_dicts): -> {'score': np.array([[1, 2], [3, 4]])}""" # assume keys are same throughout result_keys = list(result_dicts[0][0].iterkeys()) - return {key: np.asarray([[fold_results[key] for fold_results in point] + res = {key: np.asarray([[fold_results[key] for fold_results in point] for point in result_dicts]) for key in result_keys} + np_res = np.zeros((len(result_dicts), len(result_dicts[0])), dtype=[(key, res[key].dtype) for key in result_keys]) + for key, val in res.iteritems(): + np_res[key] = val + return np_res def _fit(self, X, y, parameter_iterator, **params): """Actual fitting, performing the search over parameters.""" @@ -489,7 +493,11 @@ def _fit(self, X, y, parameter_iterator, **params): for start in range(0, n_fits, n_folds) ]) - grid_results = {'parameters': 
list(parameter_iterator)} + field_defs = [('parameters', 'object'), ('test_score', cv_results['test_score'].dtype)] + if self.compute_training_score: + field_defs.append(('train_score', cv_results['train_score'].dtype)) + grid_results = np.zeros(n_param_points, dtype=field_defs) + grid_results['parameters'] = list(parameter_iterator) grid_results['test_score'] = self._aggregate_scores( cv_results['test_score'], cv_results['test_n_samples']) if self.compute_training_score: @@ -622,9 +630,8 @@ class GridSearchCV(BaseSearchCV): Attributes ---------- - `grid_results_` : dict of string -> array or list - Each value is an array or list with elements for each parameter - combination in ``param_grid``. Elements for the following keys are: + `grid_results_` : structured array of shape [# param combinations] + For each parameter combination in ``param_grid`` includes these fields: * ``parameters``, dict of parameter settings * ``test_score``, the mean score over the @@ -632,10 +639,8 @@ class GridSearchCV(BaseSearchCV): * ``train_score``, the mean training score over the cross-validation folds, if ``compute_training_score`` - `fold_results_` : dict of string -> array - Each value is an array whose first two dimensions correspond to - parameter combinations and cross-validation folds, respectively. - Elements for the following keys are: + `fold_results_` : structured array of shape [# param combinations, # folds] + For each cross-validation fold includes these fields: * ``test_time``, the elapsed prediction and scoring time * ``train_time``, the elapsed training time @@ -815,9 +820,8 @@ class RandomizedSearchCV(BaseSearchCV): Attributes ---------- - `grid_results_` : dict of string -> array or list - Each value is an array or list with elements for each parameter - combination in ``param_grid``. Elements for the following keys are: + `grid_results_` : structured array of shape [# param combinations] + For each parameter combination in ``param_grid`` includes these fields: * ``parameters``, dict of parameter settings * ``test_score``, the mean score over the @@ -825,10 +829,8 @@ class RandomizedSearchCV(BaseSearchCV): * ``train_score``, the mean training score over the cross-validation folds, if ``compute_training_score`` - `fold_results_` : dict of string -> array - Each value is an array whose first two dimensions correspond to - parameter combinations and cross-validation folds, respectively. 
- Elements for the following keys are: + `fold_results_` : structured array of shape [# param combinations, # folds] + For each cross-validation fold includes these fields: * ``test_time``, the elapsed prediction and scoring time * ``train_time``, the elapsed training time diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index 6cc385685f204..8191f33aa4844 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -288,14 +288,12 @@ def test_grid_search_training_score(): clf = LinearSVC(random_state=0) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, compute_training_score=True) cv.fit(X, y) - scores = zip(cv.grid_results_['train_score'], - cv.grid_results_['test_score']) - for i, (train_score, test_score) in enumerate(scores): - assert_greater(train_score, test_score) + for i, (grid_data, fold_data) in enumerate(zip(cv.grid_results_, cv.fold_results_)): + assert_greater(grid_data['train_score'], grid_data['test_score']) # hacky greater-equal - assert_greater(1 + 1e-10, train_score) - assert_greater(cv.fold_results_['train_time'][i, :].mean(), 0) - assert_greater(cv.fold_results_['test_time'][i, :].mean(), 0) + assert_greater(1 + 1e-10, grid_data['train_score']) + assert_greater(fold_data['train_time'].mean(), 0) + assert_greater(fold_data['test_time'].mean(), 0) class BrokenClassifier(BaseEstimator): From f5f3b90c58a6ac7bba837f3fb73e800e0141d052 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Mar 2013 00:29:42 +1100 Subject: [PATCH 04/12] STYLE line length and a TODO comment --- sklearn/grid_search.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 10bfa4ff11396..588f39352833f 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -411,7 +411,8 @@ def _set_methods(self): self.predict_proba = self.best_estimator_.predict_proba def _aggregate_scores(self, scores, n_samples): - """Take 2d arrays of scores and samples and calculate weighted means/sums of each row""" + """Take 2d arrays of scores and samples and calculate weighted + means/sums of each row""" if self.iid: scores = scores * n_samples scores = scores.sum(axis=1) / n_samples.sum(axis=1) @@ -430,7 +431,9 @@ def _merge_result_dicts(self, result_dicts): res = {key: np.asarray([[fold_results[key] for fold_results in point] for point in result_dicts]) for key in result_keys} - np_res = np.zeros((len(result_dicts), len(result_dicts[0])), dtype=[(key, res[key].dtype) for key in result_keys]) + # TODO: it would be nice if we need not duplicate this structure + np_res = np.zeros((len(result_dicts), len(result_dicts[0])), + dtype=[(key, res[key].dtype) for key in result_keys]) for key, val in res.iteritems(): np_res[key] = val return np_res From b728d66f7fc7554f26d98229ec3da226f302b43b Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Mar 2013 00:32:49 +1100 Subject: [PATCH 05/12] FIX use PRFScorer as 'f1' scorer --- sklearn/metrics/scorer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 3edce92965ff3..ff96bc8844f81 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -204,7 +204,7 @@ def __call__(self, estimator, X, y=None): # Standard Classification Scores accuracy_scorer = Scorer(accuracy_score) -f1_scorer = Scorer(f1_score) +f1_scorer = PRFScorer() # Score functions that need decision values auc_scorer = Scorer(auc_score, greater_is_better=True, needs_threshold=True) @@ 
-217,7 +217,7 @@ def __call__(self, estimator, X, y=None): ari_scorer = Scorer(adjusted_rand_score) SCORERS = dict(r2=r2_scorer, mse=mse_scorer, accuracy=accuracy_scorer, - f1=f1_scorer, prf1=PRFScorer(), roc_auc=auc_scorer, + f1=f1_scorer, roc_auc=auc_scorer, average_precision=average_precision_scorer, precision=precision_scorer, recall=recall_scorer, ari=ari_scorer) From b549457ab3c6dc795459bd8c09df4b34cc8f0e53 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Mar 2013 10:26:13 +1100 Subject: [PATCH 06/12] FIX return backwards compatibility to GridSearchCV.grid_scores_ --- sklearn/grid_search.py | 35 +++++++++++++++++++------------ sklearn/tests/test_grid_search.py | 24 +++++++++++++++++++++ 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 588f39352833f..9a74e218d4fce 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -386,6 +386,14 @@ def score(self, X, y=None): y_predicted = self.predict(X) return self.scorer(y, y_predicted) + @property + def grid_scores_(self): + warnings.warn("grid_scores_ is deprecated and will be removed in 0.15." + " Use grid_results_ and fold_results_ instead.", DeprecationWarning) + return zip(self.grid_results_['parameters'], + self.grid_results_['test_score'], + self.fold_results_['test_score']) + def _check_estimator(self): """Check that estimator can be fitted and score can be computed.""" if (not hasattr(self.estimator, 'fit') or @@ -658,16 +666,20 @@ class GridSearchCV(BaseSearchCV): which gave highest score (or smallest loss if specified) on the left out data. Available only if refit=True. - `best_score_` : float - score of best_estimator on the left out data. - `best_index_` : int The index of the best parameter setting into ``grid_results_`` and ``fold_results_`` data. + `best_score_` : float + score of best_estimator on the left out data. + `best_params_` : dict Parameter setting that gave the best results on the hold out data. + `grid_scores_` : list of tuples (deprecated) + Contains scores for all parameter combinations in ``param_grid``: + each tuple is (parameters, mean score, fold scores). + Notes ------ The parameters selected are those that maximize the score of the left out @@ -703,12 +715,6 @@ def __init__(self, estimator, param_grid, scoring=None, loss_func=None, self.param_grid = param_grid _check_param_grid(param_grid) - @property - def grid_scores_(self): - warnings.warn("grid_scores_ is deprecated and will be removed in 0.15." - " Use grid_results_ and fold_results_ instead.", DeprecationWarning) - return self.grid_results_['test_score'] - def fit(self, X, y=None, **params): """Run fit with all sets of parameters. @@ -820,7 +826,6 @@ class RandomizedSearchCV(BaseSearchCV): verbose : integer Controls the verbosity: the higher, the more messages. - Attributes ---------- `grid_results_` : structured array of shape [# param combinations] @@ -848,16 +853,20 @@ class RandomizedSearchCV(BaseSearchCV): which gave highest score (or smallest loss if specified) on the left out data. Available only if refit=True. - `best_score_` : float - score of best_estimator on the left out data. - `best_index_` : int The index of the best parameter setting into ``grid_results_`` and ``fold_results_`` data. + `best_score_` : float + score of best_estimator on the left out data. + `best_params_` : dict Parameter setting that gave the best results on the hold out data. 
+ `grid_scores_` : list of tuples (deprecated) + Contains scores for all parameter combinations in ``param_grid``: + each tuple is (parameters, mean score, fold scores). + Notes ----- The parameters selected are those that maximize the score of the left out diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index 8191f33aa4844..fac4de19ba803 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -96,6 +96,30 @@ def test_grid_search(): grid_search.score(X, y) +def test_grid_scores(): + """Test that GridSearchCV.grid_scores_ is filled in the correct format""" + clf = MockClassifier() + grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3) + # make sure it selects the smallest parameter in case of ties + old_stdout = sys.stdout + sys.stdout = StringIO() + grid_search.fit(X, y) + sys.stdout = old_stdout + assert_equal(grid_search.best_estimator_.foo_param, 2) + + n_folds = 3 + with warnings.catch_warnings(record=True): + for i, foo_i in enumerate([1, 2, 3]): + assert_true(grid_search.grid_scores_[i][0] + == {'foo_param': foo_i}) + # mean score + assert_almost_equal(grid_search.grid_scores_[i][1], + (1. if foo_i > 1 else 0.)) + # all fold scores + assert_array_equal(grid_search.grid_scores_[i][2], + [1. if foo_i > 1 else 0.] * n_folds) + + def test_no_refit(): """Test that grid search can be used for model selection only""" clf = MockClassifier() From f9506a3995fcc310fb5260ba2d770e8ce54c20d1 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Mar 2013 10:37:26 +1100 Subject: [PATCH 07/12] ENH Reimplement best_params_ and best_score_ as properties Thus the attributes stored by BaseSearchCV._fit() are no longer redundant. Also: test for these attributes --- sklearn/grid_search.py | 14 ++++++++++++-- sklearn/tests/test_grid_search.py | 2 ++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 9a74e218d4fce..f8d1d6194b513 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -394,6 +394,18 @@ def grid_scores_(self): self.grid_results_['test_score'], self.fold_results_['test_score']) + @property + def best_score_(self): + if not hasattr(self, 'best_index_'): + raise AttributeError('Call fit() to calculate best_score_') + return self.grid_results_['test_score'][self.best_index_] + + @property + def best_params_(self): + if not hasattr(self, 'best_index_'): + raise AttributeError('Call fit() to calculate best_params_') + return self.grid_results_['parameters'][self.best_index_] + def _check_estimator(self): """Check that estimator can be fitted and score can be computed.""" if (not hasattr(self.estimator, 'fit') or @@ -536,8 +548,6 @@ def _fit(self, X, y, parameter_iterator, **params): best_index = i self.best_index_ = best_index - self.best_params_ = grid_results['parameters'][best_index] - self.best_score_ = best_score self.fold_results_ = cv_results self.grid_results_ = grid_results diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index fac4de19ba803..bee5bdf4e7c33 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -88,6 +88,8 @@ def test_grid_search(): grid_search.fit(X, y) sys.stdout = old_stdout assert_equal(grid_search.best_estimator_.foo_param, 2) + assert_equal(grid_search.best_params_, {'foo_param': 2}) + assert_equal(grid_search.best_score_, 1.) 
for i, foo_i in enumerate([1, 2, 3]): assert_true(grid_search.grid_results_['parameters'][i] From 19ea7eac0dcf73d47e7b1df23cf3855acad0111a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Mar 2013 10:46:41 +1100 Subject: [PATCH 08/12] TEST add test for composite score output from GridSearchCV --- sklearn/tests/test_grid_search.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index bee5bdf4e7c33..13f0b50980437 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -25,7 +25,7 @@ ParameterSampler) from sklearn.svm import LinearSVC, SVC from sklearn.cluster import KMeans, MeanShift -from sklearn.metrics import f1_score +from sklearn.metrics import f1_score, precision_score, recall_score from sklearn.metrics import Scorer from sklearn.cross_validation import KFold, StratifiedKFold @@ -431,3 +431,24 @@ def test_grid_search_score_consistency(): clf.decision_function(X[test])) assert_almost_equal(correct_score, scores[i]) i += 1 + +def test_composite_scores(): + """Test that precision and recall are output when using f1""" + clf = LinearSVC(random_state=0) + X, y = make_blobs(random_state=0, centers=2) + Cs = [.1, 1, 10] + grid_search = GridSearchCV(clf, {'C': Cs}, scoring='f1', compute_training_score=True) + grid_search.fit(X, y) + cv = StratifiedKFold(n_folds=3, y=y) + for C, scores in zip(Cs, grid_search.fold_results_): + clf.set_params(C=C) + for fold, (train, test) in enumerate(cv): + clf.fit(X[train], y[train]) + for prefix, mask in [('test_', test), ('train_', train)]: + fold_scores = scores[fold] + correct_score = f1_score(y[mask], clf.predict(X[mask])) + correct_precision = precision_score(y[mask], clf.predict(X[mask])) + correct_recall = recall_score(y[mask], clf.predict(X[mask])) + assert_almost_equal(correct_score, fold_scores[prefix + 'score']) + assert_almost_equal(correct_precision, fold_scores[prefix + 'precision']) + assert_almost_equal(correct_recall, fold_scores[prefix + 'recall']) From a32d9366574a35796d56302dbb8aa05aa208f40e Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Mar 2013 14:53:24 +1100 Subject: [PATCH 09/12] ENH Export PRFScore from metrics --- sklearn/metrics/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index fc5f3c1ce9e01..6abbb38c600e0 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -29,7 +29,7 @@ from .metrics import zero_one from .metrics import zero_one_score -from .scorer import Scorer, EstimatorScorer, WrapScorer, SCORERS +from .scorer import Scorer, PRFScorer, EstimatorScorer, WrapScorer, SCORERS from . 
import cluster from .cluster import (adjusted_rand_score, @@ -83,5 +83,6 @@ 'silhouette_samples', 'v_measure_score', 'zero_one_loss', + 'PRFScorer', 'Scorer', 'SCORERS'] From f65f8603f555d7d4385d32e412048d60a7aa0a4e Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Mar 2013 18:22:24 +1100 Subject: [PATCH 10/12] FIX Use six's iteritems --- sklearn/grid_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index f8d1d6194b513..d9fed1407267d 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -20,7 +20,7 @@ from .base import MetaEstimatorMixin from .cross_validation import check_cv from .externals.joblib import Parallel, delayed, logger -from .externals.six import string_types +from .externals.six import string_types, iteritems from .utils import safe_mask, check_random_state from .utils.validation import _num_samples, check_arrays from .metrics import SCORERS, Scorer, EstimatorScorer, WrapScorer @@ -454,7 +454,7 @@ def _merge_result_dicts(self, result_dicts): # TODO: it would be nice if we need not duplicate this structure np_res = np.zeros((len(result_dicts), len(result_dicts[0])), dtype=[(key, res[key].dtype) for key in result_keys]) - for key, val in res.iteritems(): + for key, val in iteritems(res): np_res[key] = val return np_res From 9d6d4c2ee017e5b112ac223b0780cf4b88a82599 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Mar 2013 21:08:47 +1100 Subject: [PATCH 11/12] ENH Simplify the merging of results dicts into a structured array --- sklearn/grid_search.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index d9fed1407267d..9e3f70a2fdb2d 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -20,7 +20,7 @@ from .base import MetaEstimatorMixin from .cross_validation import check_cv from .externals.joblib import Parallel, delayed, logger -from .externals.six import string_types, iteritems +from .externals.six import string_types, iterkeys from .utils import safe_mask, check_random_state from .utils.validation import _num_samples, check_arrays from .metrics import SCORERS, Scorer, EstimatorScorer, WrapScorer @@ -447,16 +447,11 @@ def _merge_result_dicts(self, result_dicts): For example [[{'score': 1}, {'score': 2}], [{'score': 3}, {'score': 4}]] -> {'score': np.array([[1, 2], [3, 4]])}""" # assume keys are same throughout - result_keys = list(result_dicts[0][0].iterkeys()) - res = {key: np.asarray([[fold_results[key] for fold_results in point] - for point in result_dicts]) - for key in result_keys} - # TODO: it would be nice if we need not duplicate this structure - np_res = np.zeros((len(result_dicts), len(result_dicts[0])), - dtype=[(key, res[key].dtype) for key in result_keys]) - for key, val in iteritems(res): - np_res[key] = val - return np_res + result_keys = list(iterkeys(result_dicts[0][0])) + arrays = ([[fold_results[key] for fold_results in point] + for point in result_dicts] + for key in result_keys) + return np.rec.fromarrays(arrays, names=result_keys) def _fit(self, X, y, parameter_iterator, **params): """Actual fitting, performing the search over parameters.""" From d27adeb7a55a1a47bf5e7b8b9c27c775c748525f Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 14 Mar 2013 01:23:06 +1100 Subject: [PATCH 12/12] ENH/FIX/TST Use Scorer.calc_scores instead of store This replaces Scorer.store() Also: tests for new Scorer functionality and descendants, and fixes broken 
WrapScorer --- sklearn/grid_search.py | 20 ++-- sklearn/metrics/scorer.py | 112 +++++++++++++++----- sklearn/metrics/tests/test_score_objects.py | 64 ++++++++++- 3 files changed, 162 insertions(+), 34 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 9e3f70a2fdb2d..ab1fcb2e31e8f 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -264,7 +264,7 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, verbose, results = {'test_n_samples': _num_samples(X_test)} if scorer is None: scorer = EstimatorScorer(clf.score) - elif not hasattr(scorer, 'store'): + elif not hasattr(scorer, 'calc_scores'): scorer = WrapScorer(scorer) if y is not None: @@ -281,17 +281,19 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer, verbose, clf.fit(*fit_args, **fit_params) results['train_time'] = time() - start start = time() - scorer.store(results, clf, *score_args, prefix='test_') + results.update(('test_' + name, score) + for name, score in scorer.calc_scores(clf, *score_args)) results['test_time'] = time() - start if compute_training_score: - scorer.store(results, clf, *fit_args, prefix='train_') + results.update(('train_' + name, score) + for name, score in scorer.calc_scores(clf, *fit_args)) try: test_score = results['test_score'] except KeyError: - raise ValueError("Scorer.store must set the key '%s' in results." - " Got %s instead." % (Scorer.SCORE_KEY, results)) + raise ValueError("Scorer.calc_scores must return a score named 'score'." + " Got %s instead." % (results)) if not isinstance(test_score, numbers.Number): raise ValueError("scoring must return a number, got %s (%s)" " instead." % (str(test_score), type(test_score))) @@ -663,8 +665,8 @@ class GridSearchCV(BaseSearchCV): * ``test_score``, the score for this fold * ``train_score``, the training score for this fold * ``test_n_samples``, the number of samples in testing - * ``test_*``, other score information stored by the Scorer - * ``train_*``, other training score information stored by the Scorer + * ``test_*``, other scores from `scorer.calc_scores()` + * ``train_*``, other training scores from `scorer.calc_scores()` `best_estimator_` : estimator Estimator that was chosen by grid search, i.e. estimator @@ -850,8 +852,8 @@ class RandomizedSearchCV(BaseSearchCV): * ``test_score``, the score for this fold * ``train_score``, the training score for this fold * ``test_n_samples``, the number of samples in testing - * ``test_*``, other score information stored by the Scorer - * ``train_*``, other training score information stored by the Scorer + * ``test_*``, other scores from `scorer.calc_scores()` + * ``train_*``, other training scores from `scorer.calc_scores()` `best_estimator_` : estimator Estimator that was chosen by grid search, i.e. estimator diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index ff96bc8844f81..476bf779247aa 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -16,30 +16,29 @@ # Authors: Andreas Mueller # Liscence: Simplified BSD +from abc import ABCMeta, abstractmethod + import numpy as np -from . import (r2_score, mean_squared_error, accuracy_score, f1_score, +from . 
import (r2_score, mean_squared_error, accuracy_score, auc_score, average_precision_score, precision_score, recall_score, precision_recall_fscore_support) from .cluster import adjusted_rand_score class BaseScorer(object): - SCORE_KEY = 'score' + __metaclass__ = ABCMeta def __init__(self, greater_is_better=True): self.greater_is_better = greater_is_better - def store(self, result, estimator, X, y=None, prefix=''): - """Score X and y using the provided estimator and store it under the - key ``prefix`` + 'score' in result. + def calc_scores(self, estimator, X, y=None): + """Calculate one or more scores for X against y using the provided + estimator. While __call__ calculates a single score, this may return + multiple. Parameters ---------- - result: dict-like - Where the result should be stored under key ``prefix`` + ``Scorer.SCORE_KEY``. - A custom ``Scorer`` may store other information under this prefix. - estimator : object Trained estimator to use for scoring. If ``needs_threshold`` is True, estimator needs @@ -52,13 +51,39 @@ def store(self, result, estimator, X, y=None, prefix=''): y : array-like True prediction for X. - prefix : string - The prefix of any keys to be stored in ``result``. + Returns + ------- + scores : iterable of (name, score) pairs + Scores of the estimator's predictions of X with respect to y. + Names must be distinct, and exactly one name must be 'score', whose + score corresponds to the result of `__call__`. """ - result[prefix + self.SCORE_KEY] = self(estimator, X, y) + yield ('score', self(estimator, X, y)) + @abstractmethod def __call__(self, estimator, X, y=None): - raise NotImplementedError() + """Score X and y using the provided estimator. + + Parameters + ---------- + estimator : object + Trained estimator to use for scoring. + If ``needs_threshold`` is True, estimator needs + to provide ``decision_function`` or ``predict_proba``. + Otherwise, estimator needs to provide ``predict``. + + X : array-like or sparse matrix + Test data that will be scored by the estimator. + + y : array-like + True prediction for X. + + Returns + ------- + score : float + The score of estimator's prediction of X. + """ + pass class Scorer(BaseScorer): @@ -150,7 +175,13 @@ def __call__(self, estimator, X, y=None): class PRFScorer(Scorer): - """Scorer to optimise F score while also storing precision and recall. + """Scorer to optimise F score while also providing precision and recall. + + Parameters + ---------- + **kwargs : additional arguments + Additional parameters to be passed to + `metrics.precision_recall_fscore_support`. """ def __init__(self, **kwargs): @@ -158,19 +189,40 @@ def __init__(self, **kwargs): kwargs['average'] = 'weighted' super(PRFScorer, self).__init__(precision_recall_fscore_support, **kwargs) - PRECISION_KEY = 'precision' - RECALL_KEY = 'recall' - def __repr__(self): kwargs_string = "".join([", %s=%s" % (str(k), str(v)) for k, v in self.kwargs.items()]) return 'PRFScorer(%s)' % kwargs_string - def store(self, result, estimator, X, y, prefix=''): + def calc_scores(self, estimator, X, y): + """ + Calculates F score, precision and recall + + Parameters + ---------- + estimator : object + Trained estimator to use for scoring. + If ``needs_threshold`` is True, estimator needs + to provide ``decision_function`` or ``predict_proba``. + Otherwise, estimator needs to provide ``predict``. + + X : array-like or sparse matrix + Test data that will be scored by the estimator. + + y : array-like + True prediction for X. 
+ + Returns + ------- + scores : list of (name, score) pairs + providing names 'score', 'precision' and 'recall' + """ p, r, f, support = super(PRFScorer, self).__call__(estimator, X, y) - result[prefix + self.SCORE_KEY] = f - result[prefix + self.PRECISION_KEY] = p - result[prefix + self.RECALL_KEY] = r + return [ + ('score', f), + ('precision', p), + ('recall', r), + ] def __call__(self, estimator, X, y): p, r, f, support = super(PRFScorer, self).__call__(estimator, X, y) @@ -178,10 +230,22 @@ def __call__(self, estimator, X, y): class WrapScorer(BaseScorer): - """Scores by passing the estimator and data to a given function""" + """Scores by passing the estimator and data to a given function + + Parameters + ---------- + score_fn : function with signature of `Scorer.__call__` + A function which returns a score given an estimator, instances and + ground truth if available. + + greater_is_better : boolean, default=True + Whether score_func is a score function (default), meaning high is good, + or a loss function, meaning low is good. + """ def __init__(self, score_fn, greater_is_better=True): - super(EstimatorScorer, self).__init__(greater_is_better) + super(WrapScorer, self).__init__(greater_is_better) + self.score_fn = score_fn def __call__(self, estimator, X, y=None): if y is None: @@ -217,7 +281,7 @@ def __call__(self, estimator, X, y=None): ari_scorer = Scorer(adjusted_rand_score) SCORERS = dict(r2=r2_scorer, mse=mse_scorer, accuracy=accuracy_scorer, - f1=f1_scorer, roc_auc=auc_scorer, + f1=PRFScorer(), roc_auc=auc_scorer, average_precision=average_precision_scorer, precision=precision_scorer, recall=recall_scorer, ari=ari_scorer) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 7777f15de1a7e..58c7edb6bd509 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -1,11 +1,13 @@ import pickle from sklearn.utils.testing import assert_almost_equal +from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_true from sklearn.metrics import f1_score, r2_score, auc_score, fbeta_score from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.metrics import SCORERS, Scorer +from sklearn.metrics import SCORERS, Scorer, PRFScorer, WrapScorer, EstimatorScorer from sklearn.svm import LinearSVC from sklearn.cluster import KMeans from sklearn.linear_model import Ridge, LogisticRegression @@ -14,6 +16,8 @@ from sklearn.cross_validation import train_test_split, cross_val_score from sklearn.grid_search import GridSearchCV +# TODO: test scorers without ground truth + def test_classification_scores(): X, y = make_blobs(random_state=0) @@ -97,3 +101,61 @@ def test_raises_on_score_list(): grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average, param_grid={'max_depth': [1, 2]}) assert_raises(ValueError, grid_search.fit, X, y) + + +def test_calc_scores(): + """Test that the score returned by __call__ is named 'score' by calc_scores""" + scorer = SCORERS['roc_auc'] + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + score = scorer(clf, X_test, y_test) + scores = dict(scorer.calc_scores(clf, X_test, y_test)) + assert_true('score' in scores) + assert_equal(score, scores['score']) + + +def test_prf_scorer(): + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, 
y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + + f1_scorer = PRFScorer() + f1_score = f1_scorer(clf, X_test, y_test) + f1_scores = dict(f1_scorer.calc_scores(clf, X_test, y_test)) + + f2_scorer = PRFScorer(beta=2.) + f2_score = f2_scorer(clf, X_test, y_test) + f2_scores = dict(f2_scorer.calc_scores(clf, X_test, y_test)) + + def F(p, r, beta): + return (1 + beta * beta) * p * r / (beta * beta * p + r) + + assert_equal(f1_score, f1_scores['score']) + assert_equal(f2_score, f2_scores['score']) + assert_almost_equal(f1_score, F(f1_scores['precision'], f1_scores['recall'], 1.)) + assert_almost_equal(f2_score, F(f2_scores['precision'], f2_scores['recall'], 2.)) + + +def test_estimator_scorer(): + scorer = EstimatorScorer() + X, y = make_blobs(random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + score = scorer(clf, X_test, y_test) + assert_equal(clf.score(X_test, y_test), score) + assert_equal(score, dict(scorer.calc_scores(clf, X_test, y_test))['score']) + + +def test_wrap_scorer(): + scorer = WrapScorer(lambda clf, X, y: clf.score(X, y) * 100) + X, y = make_blobs(random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + score = scorer(clf, X_test, y_test) + assert_equal(clf.score(X_test, y_test) * 100, score) + assert_equal(score, dict(scorer.calc_scores(clf, X_test, y_test))['score'])
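
For reference, the following is a minimal usage sketch (not part of the patch
series itself) of the API introduced above, assuming the diffs apply as
written: the ``compute_training_score`` option, the structured
``grid_results_`` / ``fold_results_`` arrays, and the per-fold precision and
recall recorded once ``scoring='f1'`` resolves to ``PRFScorer``. The dataset
and parameter grid are illustrative only.

from sklearn.datasets import make_classification
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)

# 'f1' now maps to PRFScorer, so per-fold precision and recall are stored
# alongside the F score that drives model selection.
search = GridSearchCV(LinearSVC(random_state=0), {'C': [0.1, 1.0, 10.0]},
                      scoring='f1', compute_training_score=True)
search.fit(X, y)

# grid_results_ holds one record per parameter combination.
for record in search.grid_results_:
    print("%r: test=%.3f train=%.3f"
          % (record['parameters'], record['test_score'],
             record['train_score']))

# fold_results_ has shape [n_param_points, n_folds] with per-fold scores,
# sample counts and timings.
best = search.best_index_
print("Mean training time: %.4fs"
      % search.fold_results_['train_time'][best].mean())
print("Per-fold test precision: %r"
      % (search.fold_results_['test_precision'][best],))
print("Best parameters: %r (score %.3f)"
      % (search.best_params_, search.best_score_))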