From 73c96ceac53fc66c5e15245d1a8b80fff60850cf Mon Sep 17 00:00:00 2001
From: Mathieu Blondel
Date: Mon, 29 Sep 2014 23:32:27 +0900
Subject: [PATCH] OOB-aware grid search.

---
 sklearn/grid_search.py            | 38 ++++++++++++++++++++++++++++++++++++++
 sklearn/metrics/scorer.py         |  2 +-
 sklearn/tests/test_grid_search.py | 30 ++++++++++++++++++++++++++----
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index 5bc3fec1e318e..9f999d43ccb31 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -758,3 +758,41 @@ def fit(self, X, y=None):
                                           self.n_iter,
                                           random_state=self.random_state)
         return self._fit(X, y, sampled_params)
+
+
+class GridSearchOOB(BaseEstimator):
+
+    def __init__(self, estimator, param_grid, scoring):
+        self.estimator = estimator
+        self.param_grid = param_grid
+        self.scoring = scoring
+
+    def _score(self, y, y_pred):
+        s = self.scorer_
+        # Need this hack because the current scorer API recomputes the
+        # predictions: scoring(estimator, X, y)...
+        return s._score_func(y, y_pred, **s._kwargs) * s._sign
+
+    def fit(self, X, y):
+        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
+
+        best_score = None
+        best_estimator = None
+
+        for params in ParameterGrid(self.param_grid):
+            estimator = clone(self.estimator)
+            estimator.set_params(**params)
+            estimator.fit(X, y)
+
+            score = self._score(y, estimator.oob_prediction_)
+
+            if best_score is None or score > best_score:
+                best_estimator = estimator
+                best_score = score
+
+        self.best_estimator_ = best_estimator
+
+        return self
+
+    def predict(self, X):
+        return self.best_estimator_.predict(X)
diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py
index 83e0918051992..d07733375fd2b 100644
--- a/sklearn/metrics/scorer.py
+++ b/sklearn/metrics/scorer.py
@@ -86,7 +86,7 @@ def __call__(self, estimator, X, y_true, sample_weight=None):
         else:
             return self._sign * self._score_func(y_true, y_pred,
                                                  **self._kwargs)
- 
+
 
 class _ProbaScorer(_BaseScorer):
     def __call__(self, clf, X, y, sample_weight=None):
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py
index 7791993a44901..a566460cf350a 100644
--- a/sklearn/tests/test_grid_search.py
+++ b/sklearn/tests/test_grid_search.py
@@ -34,10 +34,16 @@
 from sklearn.datasets import make_classification
 from sklearn.datasets import make_blobs
 from sklearn.datasets import make_multilabel_classification
-from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV,
-                                 ParameterGrid, ParameterSampler,
-                                 ChangedBehaviorWarning)
-from sklearn.svm import LinearSVC, SVC
+from sklearn.datasets import load_diabetes
+
+from sklearn.grid_search import GridSearchCV
+from sklearn.grid_search import GridSearchOOB
+from sklearn.grid_search import RandomizedSearchCV
+from sklearn.grid_search import ParameterGrid
+from sklearn.grid_search import ParameterSampler
+from sklearn.grid_search import ChangedBehaviorWarning
+
+from sklearn.svm import 
LinearSVC, SVC, SVR
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.cluster import KMeans, SpectralClustering
@@ -47,6 +54,7 @@
 from sklearn.cross_validation import KFold, StratifiedKFold
 from sklearn.preprocessing import Imputer
 from sklearn.pipeline import Pipeline
+from sklearn.ensemble import BaggingRegressor
 
 
 # Neither of the following two estimators inherit from BaseEstimator,
@@ -674,3 +682,17 @@ def test_grid_search_allows_nans():
         ('classifier', MockClassifier()),
     ])
     GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
+
+
+def test_grid_search_oob():
+    data = load_diabetes()
+    X, y = data.data, data.target
+    param_grid = {"base_estimator__C": [0.1, 1, 10],
+                  "base_estimator__gamma": [0.1, 1, 10]}
+    reg = BaggingRegressor(SVR(kernel="rbf"), n_estimators=50, oob_score=True,
+                           random_state=0)
+
+    gs = GridSearchOOB(reg, param_grid, scoring="r2")
+    gs.fit(X, y)
+    assert_equal(gs.best_estimator_.estimators_[0].C, 10)
+    assert_equal(gs.best_estimator_.estimators_[0].gamma, 10)