From 5bebb820771df5e40711f78807ff0324cd2d1545 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 3 Mar 2013 19:47:21 +0100 Subject: [PATCH 1/2] FIX for iid weighting in grid-search --- sklearn/grid_search.py | 16 ++++++++------ sklearn/tests/test_grid_search.py | 36 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 5d255eeb50f32..77ebe514d71c8 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -432,20 +432,22 @@ def _fit(self, X, y, parameter_iterator, **params): scores = list() cv_scores = list() - for start in range(0, n_fits, n_folds): + for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 - mean_validation_score = 0 + score = 0 these_points = list() for this_score, clf_params, this_n_test_samples in \ - out[start:start + n_folds]: + out[grid_start:grid_start + n_folds]: these_points.append(this_score) if self.iid: this_score *= this_n_test_samples - mean_validation_score += this_score - n_test_samples += this_n_test_samples + n_test_samples += this_n_test_samples + score += this_score if self.iid: - mean_validation_score /= float(n_test_samples) - scores.append((mean_validation_score, clf_params)) + score /= float(n_test_samples) + else: + score /= float(n_folds) + scores.append((score, clf_params)) cv_scores.append(these_points) cv_scores = np.asarray(cv_scores) diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index ce3252022558b..84680d0037e64 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -15,6 +15,7 @@ from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_almost_equal +from sklearn.utils.testing import assert_array_almost_equal from scipy.stats import distributions @@ -113,6 +114,41 @@ def test_grid_search_error(): assert_raises(ValueError, cv.fit, X_[:180], y_) 
+def test_grid_search_iid(): + # test the iid parameter + # noise-free simple 2d-data + X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0, + cluster_std=0.1, shuffle=False, n_samples=80) + # split dataset into two folds that are not iid + # first one contains data of all 4 blobs, second only from two. + mask = np.ones(X.shape[0], dtype=np.bool) + mask[np.where(y == 1)[0][::2]] = 0 + mask[np.where(y == 2)[0][::2]] = 0 + # this leads to perfect classification on one fold and a score of 1/3 on + # the other + svm = SVC(kernel='linear') + # create "cv" for splits + cv = [[mask, ~mask], [~mask, mask]] + # once with iid=True (default) + grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv) + grid_search.fit(X, y) + _, average_score, scores = grid_search.grid_scores_[0] + assert_array_almost_equal(scores, [1, 1. / 3.]) + # for first split, 1/4 of dataset is in test, for second 3/4. + # take weighted average + assert_almost_equal(average_score, 1 * 1. / 4. + 1. / 3. * 3. / 4.) + + # once with iid=False + grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv, + iid=False) + grid_search.fit(X, y) + _, average_score, scores = grid_search.grid_scores_[0] + # scores are the same as above + assert_array_almost_equal(scores, [1, 1.
/ 3.]) + # averaged score is just mean of scores + assert_almost_equal(average_score, np.mean(scores)) + + def test_grid_search_one_grid_point(): X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} From c3eeccd3e4a207d72775e75d4be357894aa89e76 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 3 Mar 2013 20:40:30 +0100 Subject: [PATCH 2/2] DOC FIX finite precision --- doc/tutorial/statistical_inference/model_selection.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst index 65ae1c431466e..b069c31d5ec69 100644 --- a/doc/tutorial/statistical_inference/model_selection.rst +++ b/doc/tutorial/statistical_inference/model_selection.rst @@ -145,8 +145,8 @@ estimator during the construction and exposes an estimator API:: ... n_jobs=-1) >>> clf.fit(X_digits[:1000], y_digits[:1000]) # doctest: +ELLIPSIS GridSearchCV(cv=None,... - >>> clf.best_score_ - 0.98899999999999999 + >>> clf.best_score_ # doctest: +ELLIPSIS + 0.9889... >>> clf.best_estimator_.gamma 9.9999999999999995e-07