From 73c96ceac53fc66c5e15245d1a8b80fff60850cf Mon Sep 17 00:00:00 2001
From: Mathieu Blondel
Date: Mon, 29 Sep 2014 23:32:27 +0900
Subject: [PATCH] OOB-aware grid search.

---
 sklearn/grid_search.py            | 38 ++++++++++++++++++++++++++++++++++++++
 sklearn/metrics/scorer.py         |  2 +-
 sklearn/tests/test_grid_search.py | 30 ++++++++++++++++++++++++++----
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index 5bc3fec1e318e..9f999d43ccb31 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -758,3 +758,41 @@ def fit(self, X, y=None):
                                           self.n_iter,
                                           random_state=self.random_state)
         return self._fit(X, y, sampled_params)
+
+
+class GridSearchOOB(BaseEstimator):
+
+    def __init__(self, estimator, param_grid, scoring):
+        self.estimator = estimator
+        self.param_grid = param_grid
+        self.scoring = scoring
+
+    def _score(self, y, y_pred):
+        s = self.scorer_
+        # Need this hack because the current scorer API recomputes the
+        # predictions: scoring(estimator, X, y)...
+        return s._score_func(y, y_pred, **s._kwargs) * s._sign
+
+    def fit(self, X, y):
+        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
+
+        best_score = None
+        best_estimator = None
+
+        for params in ParameterGrid(self.param_grid):
+            estimator = clone(self.estimator)
+            estimator.set_params(**params)
+            estimator.fit(X, y)
+
+            score = self._score(y, estimator.oob_prediction_)
+
+            if best_score is None or score > best_score:
+                best_estimator = estimator
+                best_score = score
+
+        self.best_estimator_ = best_estimator
+
+        return self
+
+    def predict(self, X):
+        return self.best_estimator_.predict(X)
diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py
index 83e0918051992..d07733375fd2b 100644
--- a/sklearn/metrics/scorer.py
+++ b/sklearn/metrics/scorer.py
@@ -86,7 +86,7 @@ def __call__(self, estimator, X, y_true, sample_weight=None):
         else:
             return self._sign * self._score_func(y_true, y_pred,
                                                  **self._kwargs)
- 
+
 
 class _ProbaScorer(_BaseScorer):
     def __call__(self, clf, X, y, sample_weight=None):
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py
index 7791993a44901..a566460cf350a 100644
--- a/sklearn/tests/test_grid_search.py
+++ b/sklearn/tests/test_grid_search.py
@@ -34,10 +34,16 @@
 from sklearn.datasets import make_classification
 from sklearn.datasets import make_blobs
 from sklearn.datasets import make_multilabel_classification
-from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV,
-                                 ParameterGrid, ParameterSampler,
-                                 ChangedBehaviorWarning)
-from sklearn.svm import LinearSVC, SVC
+from sklearn.datasets import load_diabetes
+
+from sklearn.grid_search import GridSearchCV
+from sklearn.grid_search import GridSearchOOB
+from sklearn.grid_search import RandomizedSearchCV
+from sklearn.grid_search import ParameterGrid
+from sklearn.grid_search import ParameterSampler
+from sklearn.grid_search import ChangedBehaviorWarning
+
+from sklearn.svm import 
LinearSVC, SVC, SVR
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.cluster import KMeans, SpectralClustering
@@ -47,6 +54,7 @@
 from sklearn.cross_validation import KFold, StratifiedKFold
 from sklearn.preprocessing import Imputer
 from sklearn.pipeline import Pipeline
+from sklearn.ensemble import BaggingRegressor
 
 
 # Neither of the following two estimators inherit from BaseEstimator,
@@ -674,3 +682,17 @@ def test_grid_search_allows_nans():
         ('classifier', MockClassifier()),
     ])
     GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
+
+
+def test_grid_search_oob():
+    data = load_diabetes()
+    X, y = data.data, data.target
+    param_grid = {"base_estimator__C": [0.1, 1, 10],
+                  "base_estimator__gamma": [0.1, 1, 10]}
+    reg = BaggingRegressor(SVR(kernel="rbf"), n_estimators=50, oob_score=True,
+                           random_state=0)
+
+    gs = GridSearchOOB(reg, param_grid, scoring="r2")
+    gs.fit(X, y)
+    assert_equal(gs.best_estimator_.estimators_[0].C, 10)
+    assert_equal(gs.best_estimator_.estimators_[0].gamma, 10)