From 5bebb820771df5e40711f78807ff0324cd2d1545 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 3 Mar 2013 19:47:21 +0100 Subject: [PATCH 1/2] FIX for iid weighting in grid-search --- sklearn/grid_search.py | 16 ++++++++------ sklearn/tests/test_grid_search.py | 36 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 5d255eeb50f32..77ebe514d71c8 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -432,20 +432,22 @@ def _fit(self, X, y, parameter_iterator, **params): scores = list() cv_scores = list() - for start in range(0, n_fits, n_folds): + for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 - mean_validation_score = 0 + score = 0 these_points = list() for this_score, clf_params, this_n_test_samples in \ - out[start:start + n_folds]: + out[grid_start:grid_start + n_folds]: these_points.append(this_score) if self.iid: this_score *= this_n_test_samples - mean_validation_score += this_score - n_test_samples += this_n_test_samples + n_test_samples += this_n_test_samples + score += this_score if self.iid: - mean_validation_score /= float(n_test_samples) - scores.append((mean_validation_score, clf_params)) + score /= float(n_test_samples) + else: + score /= float(n_folds) + scores.append((score, clf_params)) cv_scores.append(these_points) cv_scores = np.asarray(cv_scores) diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index ce3252022558b..84680d0037e64 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -15,6 +15,7 @@ from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_almost_equal +from sklearn.utils.testing import assert_array_almost_equal from scipy.stats import distributions @@ -113,6 +114,41 @@ def test_grid_search_error(): assert_raises(ValueError, cv.fit, X_[:180], y_) 
+def test_grid_search_iid(): + # test the iid parameter + # noise-free simple 2d-data + X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0, + cluster_std=0.1, shuffle=False, n_samples=80) + # split dataset into two folds that are not iid + # first one contains data of all 4 blobs, second only from two. + mask = np.ones(X.shape[0], dtype=np.bool) + mask[np.where(y == 1)[0][::2]] = 0 + mask[np.where(y == 2)[0][::2]] = 0 + # this leads to perfect classification on one fold and a score of 1/3 on + # the other + svm = SVC(kernel='linear') + # create "cv" for splits + cv = [[mask, ~mask], [~mask, mask]] + # once with iid=True (default) + grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv) + grid_search.fit(X, y) + _, average_score, scores = grid_search.grid_scores_[0] + assert_array_almost_equal(scores, [1, 1. / 3.]) + # for first split, 1/4 of dataset is in test, for second 3/4. + # take weighted average + assert_almost_equal(average_score, 1 * 1. / 4. + 1. / 3. * 3. / 4.) + + # once with iid=False + grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv, + iid=False) + grid_search.fit(X, y) + _, average_score, scores = grid_search.grid_scores_[0] + # scores are the same as above + assert_array_almost_equal(scores, [1, 1.
/ 3.]) + # averaged score is just mean of scores + assert_almost_equal(average_score, np.mean(scores)) + + def test_grid_search_one_grid_point(): X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} From c3eeccd3e4a207d72775e75d4be357894aa89e76 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 3 Mar 2013 20:40:30 +0100 Subject: [PATCH 2/2] DOC FIX finite precision --- doc/tutorial/statistical_inference/model_selection.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst index 65ae1c431466e..b069c31d5ec69 100644 --- a/doc/tutorial/statistical_inference/model_selection.rst +++ b/doc/tutorial/statistical_inference/model_selection.rst @@ -145,8 +145,8 @@ estimator during the construction and exposes an estimator API:: ... n_jobs=-1) >>> clf.fit(X_digits[:1000], y_digits[:1000]) # doctest: +ELLIPSIS GridSearchCV(cv=None,... - >>> clf.best_score_ - 0.98899999999999999 + >>> clf.best_score_ # doctest: +ELLIPSIS + 0.9889... >>> clf.best_estimator_.gamma 9.9999999999999995e-07