From 169cec9ea57efbebfe7271b152f4ba000c98b805 Mon Sep 17 00:00:00 2001 From: Raghav R V Date: Thu, 16 Jun 2016 00:04:44 +0200 Subject: [PATCH 1/5] ENH+TST Restructures grid_scores into dict of 1D (masked) numpy arrays. --- sklearn/model_selection/_search.py | 256 ++++++++---- sklearn/model_selection/tests/test_search.py | 394 +++++++++++++------ 2 files changed, 466 insertions(+), 184 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 72d17cd354d1b..3bdda72738556 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -3,6 +3,7 @@ parameters of an estimator. """ from __future__ import print_function +from __future__ import division # Author: Alexandre Gramfort , # Gael Varoquaux @@ -11,10 +12,11 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod -from collections import Mapping, namedtuple, Sized +from collections import Mapping, namedtuple, Sized, defaultdict from functools import partial, reduce from itertools import product import operator +import warnings import numpy as np @@ -26,8 +28,9 @@ from ..externals import six from ..utils import check_random_state from ..utils.fixes import sp_version +from ..utils.fixes import rankdata from ..utils.random import sample_without_replacement -from ..utils.validation import _num_samples, indexable +from ..utils.validation import indexable, check_is_fitted from ..utils.metaestimators import if_delegate_has_method from ..metrics.scorer import check_scoring @@ -337,6 +340,7 @@ def _check_param_grid(param_grid): "list.") +# XXX Remove in 0.20 class _CVScoreTuple (namedtuple('_CVScoreTuple', ('parameters', 'mean_validation_score', @@ -513,16 +517,8 @@ def _fit(self, X, y, labels, parameter_iterable): cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) - n_samples = _num_samples(X) X, y, labels = indexable(X, y, labels) - - if y is not None: - if len(y) != n_samples: - raise ValueError('Target variable (y) has a different number ' - 'of samples (%i) than data (X: %i samples)' - % (len(y), n_samples)) n_splits = cv.get_n_splits(X, y, labels) - if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" @@ -530,7 +526,6 @@ def _fit(self, X, y, labels, parameter_iterable): n_candidates * n_splits)) base_estimator = clone(self.estimator) - pre_dispatch = self.pre_dispatch out = Parallel( @@ -543,47 +538,62 @@ def _fit(self, X, y, labels, parameter_iterable): for parameters in parameter_iterable for train, test in cv.split(X, y, labels)) - # Out is a list of triplet: score, estimator, n_test_samples - n_fits = len(out) - - scores = list() - grid_scores = list() - for grid_start in range(0, n_fits, n_splits): - n_test_samples = 0 - score = 0 - all_scores = [] - for this_score, this_n_test_samples, _, parameters in \ - out[grid_start:grid_start + n_splits]: - all_scores.append(this_score) - if self.iid: - this_score *= this_n_test_samples - n_test_samples += this_n_test_samples - score += this_score - if self.iid: - score /= float(n_test_samples) - else: - score /= float(n_splits) - scores.append((score, parameters)) - # TODO: shall we also store the test_fold_sizes? 
- grid_scores.append(_CVScoreTuple( - parameters, - score, - np.array(all_scores))) - # Store the computed scores - self.grid_scores_ = grid_scores - - # Find the best parameters by comparing on the mean validation score: - # note that `sorted` is deterministic in the way it breaks ties - best = sorted(grid_scores, key=lambda x: x.mean_validation_score, - reverse=True)[0] - self.best_params_ = best.parameters - self.best_score_ = best.mean_validation_score + test_scores, test_sample_counts, _, parameters = zip(*out) + + candidate_params = parameters[::n_splits] + n_candidates = len(candidate_params) + + test_scores = np.array(test_scores, + dtype=np.float64).reshape(n_candidates, + n_splits) + # NOTE test_sample counts (weights) remain the same for all candidates + test_sample_counts = np.array(test_sample_counts[:n_splits], + dtype=np.int) + + # Computed the (weighted) mean and std for all the candidates + weights = test_sample_counts if self.iid else None + means = np.average(test_scores, axis=1, weights=weights) + stds = np.sqrt(np.average((test_scores - means[:, np.newaxis]) ** 2, + axis=1, weights=weights)) + + results = dict() + for split_i in range(n_splits): + results["test_split%d_score" % split_i] = test_scores[:, split_i] + results["test_mean_score"] = means + results["test_std_score"] = stds + + ranks = np.asarray(rankdata(-means, method='min'), dtype=np.int32) + + best_index = np.flatnonzero(ranks == 1)[0] + best_parameters = candidate_params[best_index] + results["test_rank_score"] = ranks + + # Use one np.MaskedArray and mask all the places where the param is not + # applicable for that candidate. Use defaultdict as each candidate may + # not contain all the params + param_results = defaultdict(partial(np.ma.masked_all, (n_candidates,), + dtype=object)) + for cand_i, params in enumerate(candidate_params): + for name, value in params.items(): + # An all masked empty array gets created for the key + # `"param_%s" % name` at the first occurence of `name`. + # Setting the value at an index also unmasks that index + param_results["param_%s" % name][cand_i] = value + + results.update(param_results) + + # Store a list of param dicts at the key 'params' + results['params'] = candidate_params + + self.results_ = results + self.best_index_ = best_index + self.n_splits_ = n_splits if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( - **best.parameters) + **best_parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: @@ -591,6 +601,38 @@ def _fit(self, X, y, labels, parameter_iterable): self.best_estimator_ = best_estimator return self + @property + def best_params_(self): + check_is_fitted(self, 'results_') + return self.results_['params'][self.best_index_] + + @property + def best_score_(self): + check_is_fitted(self, 'results_') + return self.results_['test_mean_score'][self.best_index_] + + @property + def grid_scores_(self): + warnings.warn( + "The grid_scores_ attribute is deprecated in favor of the" + " more elaborate results_ attribute." 
+ " The grid_scores_ attribute will not be available from 0.20", + DeprecationWarning) + + check_is_fitted(self, 'results_') + grid_scores = list() + + for i, (params, mean, std) in enumerate(zip( + self.results_['params'], + self.results_['test_mean_score'], + self.results_['test_std_score'])): + scores = np.array(list(self.results_['test_split%d_score' % s][i] + for s in range(self.n_splits_)), + dtype=np.float64) + grid_scores.append(_CVScoreTuple(params, mean, scores)) + + return grid_scores + class GridSearchCV(BaseSearchCV): """Exhaustive search over specified parameter values for an estimator. @@ -704,19 +746,51 @@ class GridSearchCV(BaseSearchCV): fit_params={}, iid=..., n_jobs=1, param_grid=..., pre_dispatch=..., refit=..., scoring=..., verbose=...) - + >>> sorted(clf.results_.keys()) + ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + ['param_C', 'param_kernel', 'params', 'test_mean_score',... + 'test_rank_score', 'test_split0_score', 'test_split1_score',... + 'test_split2_score', 'test_std_score'] Attributes ---------- - grid_scores_ : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. - Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold + results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. + + For instance the below given table + + +------------+-----------+------------+-----------------+---+---------+ + |param_kernel|param_gamma|param_degree|test_split0_score|...|...rank..| + +============+===========+============+=================+===+=========+ + | 'poly' | -- | 2 | 0.8 |...| 2 | + +------------+-----------+------------+-----------------+---+---------+ + | 'poly' | -- | 3 | 0.7 |...| 4 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.1 | -- | 0.8 |...| 3 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.2 | -- | 0.9 |...| 1 | + +------------+-----------+------------+-----------------+---+---------+ + + will be represented by a ``results_`` dict of:: + + { + 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], + mask = [False False False False]...) + 'param_gamma': masked_array(data = [-- -- 0.1 0.2], + mask = [ True True False False]...), + 'param_degree': masked_array(data = [2.0 3.0 -- --], + mask = [False False True True]...), + 'test_split0_score' : [0.8, 0.7, 0.8, 0.9], + 'test_split1_score' : [0.82, 0.5, 0.7, 0.78], + 'test_mean_score' : [0.81, 0.60, 0.75, 0.82], + 'test_std_score' : [0.02, 0.01, 0.03, 0.03], + 'test_rank_score' : [2, 4, 3, 1], + 'params' : [{'kernel': 'poly', 'degree': 2}, ...], + } + + NOTE that the key ``'params'`` is used to store a list of parameter + settings dict for all the parameter candidates. best_estimator_ : estimator Estimator that was chosen by the search, i.e. estimator @@ -729,10 +803,21 @@ class GridSearchCV(BaseSearchCV): best_params_ : dict Parameter setting that gave the best results on the hold out data. + best_index_ : int + The index (of the ``results_`` arrays) which corresponds to the best + candidate parameter setting. 
+ + The dict at ``search.results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + scorer_ : function Scorer function used on the held out data to choose the best parameters for the model. + n_splits_ : int + The number of cross-validation splits (folds/iterations). + Notes ------ The parameters selected are those that maximize the score of the left out @@ -764,7 +849,6 @@ class GridSearchCV(BaseSearchCV): def __init__(self, estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise'): - super(GridSearchCV, self).__init__( estimator=estimator, scoring=scoring, fit_params=fit_params, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, @@ -904,15 +988,38 @@ class RandomizedSearchCV(BaseSearchCV): Attributes ---------- - grid_scores_ : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. - Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold + results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. + + For instance the below given table + + +--------------+-------------+-------------------+---+---------------+ + | param_kernel | param_gamma | test_split0_score |...|test_rank_score| + +==============+=============+===================+===+===============+ + | 'rbf' | 0.1 | 0.8 |...| 2 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.2 | 0.9 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.3 | 0.7 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + + will be represented by a ``results_`` dict of:: + + { + 'param_kernel' : masked_array(data = ['rbf', rbf', 'rbf'], + mask = False), + 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), + 'test_split0_score' : [0.8, 0.9, 0.7], + 'test_split1_score' : [0.82, 0.5, 0.7], + 'test_mean_score' : [0.81, 0.7, 0.7], + 'test_std_score' : [0.02, 0.2, 0.], + 'test_rank_score' : [3, 1, 1], + 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], + } + + NOTE that the key ``'params'`` is used to store a list of parameter + settings dict for all the parameter candidates. best_estimator_ : estimator Estimator that was chosen by the search, i.e. estimator @@ -925,6 +1032,21 @@ class RandomizedSearchCV(BaseSearchCV): best_params_ : dict Parameter setting that gave the best results on the hold out data. + best_index_ : int + The index (of the ``results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + scorer_ : function + Scorer function used on the held out data to choose the best + parameters for the model. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). 
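To make the ``results_`` layout documented above concrete, here is a minimal usage
sketch; the estimator, data, search settings and the pandas calls are illustrative
assumptions rather than part of this patch::

    import pandas as pd
    from scipy.stats import expon
    from sklearn.datasets import load_iris
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.svm import SVC

    iris = load_iris()
    # Small randomized search; C is drawn from an (illustrative) exponential
    # distribution, as in the tests further down in this patch.
    search = RandomizedSearchCV(SVC(), n_iter=5, cv=3,
                                param_distributions={'C': expon(scale=10)})
    search.fit(iris.data, iris.target)

    # results_ maps each key to a 1D array with one entry per candidate,
    # so it loads straight into a DataFrame.
    df = pd.DataFrame(search.results_)
    df = df.sort_values('test_rank_score')      # pandas >= 0.17
    print(df[['param_C', 'test_mean_score',
              'test_std_score', 'test_rank_score']])

    # The best candidate is addressable through best_index_.
    assert search.results_['params'][search.best_index_] == search.best_params_

The same pattern applies to ``GridSearchCV``, whose additional ``param_*``
columns are masked wherever a parameter does not apply to a candidate.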
+ Notes ----- The parameters selected are those that maximize the score of the held-out diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 21de129d0835d..f872d53f1b128 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -10,6 +10,7 @@ import numpy as np import scipy.sparse as sp +from sklearn.utils.fixes import in1d from sklearn.utils.fixes import sp_version from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_not_equal @@ -18,8 +19,8 @@ from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_false, assert_true from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import ignore_warnings from sklearn.utils.mocking import CheckingClassifier, MockDataFrame @@ -151,8 +152,8 @@ def test_grid_search(): sys.stdout = old_stdout assert_equal(grid_search.best_estimator_.foo_param, 2) - for i, foo_i in enumerate([1, 2, 3]): - assert_true(grid_search.grid_scores_[i][0] == {'foo_param': foo_i}) + assert_array_equal(grid_search.results_["param_foo_param"].data, [1, 2, 3]) + # Smoke test the score etc: grid_search.score(X, y) grid_search.predict_proba(X) @@ -245,30 +246,31 @@ def test_grid_search_labels(): gs.fit(X, y) -def test_trivial_grid_scores(): +def test_trivial_results_attr(): # Test search over a "grid" with only one point. # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV. clf = MockClassifier() grid_search = GridSearchCV(clf, {'foo_param': [1]}) grid_search.fit(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) + assert_true(hasattr(grid_search, "results_")) random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1) random_search.fit(X, y) - assert_true(hasattr(random_search, "grid_scores_")) + assert_true(hasattr(grid_search, "results_")) def test_no_refit(): - # Test that grid search can be used for model selection only + # Test that GSCV can be used for model selection alone without refitting clf = MockClassifier() grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False) grid_search.fit(X, y) - assert_true(hasattr(grid_search, "best_params_")) + assert_true(not hasattr(grid_search, "best_estimator_") and + hasattr(grid_search, "best_index_") and + hasattr(grid_search, "best_params_")) def test_grid_search_error(): - # Test that grid search will capture errors on data with different - # length + # Test that grid search will capture errors on data with different length X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) clf = LinearSVC() @@ -276,45 +278,6 @@ def test_grid_search_error(): assert_raises(ValueError, cv.fit, X_[:180], y_) -def test_grid_search_iid(): - # test the iid parameter - # noise-free simple 2d-data - X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0, - cluster_std=0.1, shuffle=False, n_samples=80) - # split dataset into two folds that are not iid - # first one contains data of all 4 blobs, second only from two. 
- mask = np.ones(X.shape[0], dtype=np.bool) - mask[np.where(y == 1)[0][::2]] = 0 - mask[np.where(y == 2)[0][::2]] = 0 - # this leads to perfect classification on one fold and a score of 1/3 on - # the other - svm = SVC(kernel='linear') - # create "cv" for splits - cv = [[mask, ~mask], [~mask, mask]] - # once with iid=True (default) - grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv) - grid_search.fit(X, y) - first = grid_search.grid_scores_[0] - assert_equal(first.parameters['C'], 1) - assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.]) - # for first split, 1/4 of dataset is in test, for second 3/4. - # take weighted average - assert_almost_equal(first.mean_validation_score, - 1 * 1. / 4. + 1. / 3. * 3. / 4.) - - # once with iid=False - grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv, - iid=False) - grid_search.fit(X, y) - first = grid_search.grid_scores_[0] - assert_equal(first.parameters['C'], 1) - # scores are the same as above - assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.]) - # averaged score is just mean of scores - assert_almost_equal(first.mean_validation_score, - np.mean(first.cv_validation_scores)) - - def test_grid_search_one_grid_point(): X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} @@ -482,7 +445,7 @@ def test_gridsearch_nd(): clf = CheckingClassifier(check_X=check_X, check_y=check_y) grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}) grid_search.fit(X_4d, y_3d).score(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) + assert_true(hasattr(grid_search, "results_")) def test_X_as_list(): @@ -494,7 +457,7 @@ def test_X_as_list(): cv = KFold(n_folds=3) grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv) grid_search.fit(X.tolist(), y).score(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) + assert_true(hasattr(grid_search, "results_")) def test_y_as_list(): @@ -506,7 +469,7 @@ def test_y_as_list(): cv = KFold(n_folds=3) grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv) grid_search.fit(X, y.tolist()).score(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) + assert_true(hasattr(grid_search, "results_")) @ignore_warnings @@ -532,7 +495,7 @@ def test_pandas_input(): grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}) grid_search.fit(X_df, y_ser).score(X_df, y_ser) grid_search.predict(X_df) - assert_true(hasattr(grid_search, "grid_scores_")) + assert_true(hasattr(grid_search, "results_")) def test_unsupervised_grid_search(): @@ -562,6 +525,7 @@ def custom_scoring(estimator, X): param_grid=dict(bandwidth=[.01, .1, 1]), scoring=custom_scoring) search.fit(X) + print(search.best_score_) assert_equal(search.best_params_['bandwidth'], .1) assert_equal(search.best_score_, 42) @@ -591,71 +555,261 @@ def test_param_sampler(): assert_equal([x for x in sampler], [x for x in sampler]) -def test_randomized_search_grid_scores(): +def check_results_array_types(results, param_keys, score_keys): + # Check if the search results' array are of correct types + assert_true(all(isinstance(results[param], np.ma.MaskedArray) + for param in param_keys)) + assert_true(all(results[key].dtype == object for key in param_keys)) + assert_false(any(isinstance(results[key], np.ma.MaskedArray) + for key in score_keys)) + assert_true(all(results[key].dtype == np.float64 + for key in score_keys if key != 'test_rank_score')) + assert_true(results['test_rank_score'].dtype == np.int32) + + +def 
check_results_keys(results, param_keys, score_keys, n_cand): + # Test the search.results_ contains all the required results + assert_array_equal(sorted(results.keys()), + sorted(param_keys + score_keys + ('params',))) + assert_true(all(results[key].shape == (n_cand,) + for key in param_keys + score_keys)) + + +def check_results_grid_scores_consistency(search): + # TODO Remove in 0.20 + results = search.results_ + res_scores = np.vstack(list([results["test_split%d_score" % i] + for i in range(search.n_splits_)])).T + res_means = results["test_mean_score"] + res_params = results["params"] + n_cand = len(res_params) + grid_scores = assert_warns(DeprecationWarning, getattr, + search, 'grid_scores_') + assert_equal(len(grid_scores), n_cand) + # Check consistency of the structure of grid_scores + for i in range(n_cand): + assert_equal(grid_scores[i].parameters, res_params[i]) + assert_array_equal(grid_scores[i].cv_validation_scores, + res_scores[i, :]) + assert_array_equal(grid_scores[i].mean_validation_score, res_means[i]) + + +def test_grid_search_results(): + X, y = make_classification(n_samples=50, n_features=4, + random_state=42) + + n_folds = 3 + n_grid_points = 6 + params = [dict(kernel=['rbf', ], C=[1, 10], gamma=[0.1, 1]), + dict(kernel=['poly', ], degree=[1, 2])] + grid_search = GridSearchCV(SVC(), cv=n_folds, iid=False, + param_grid=params) + grid_search.fit(X, y) + grid_search_iid = GridSearchCV(SVC(), cv=n_folds, iid=True, + param_grid=params) + grid_search_iid.fit(X, y) + + param_keys = ('param_C', 'param_degree', 'param_gamma', 'param_kernel') + score_keys = ('test_mean_score', 'test_rank_score', + 'test_split0_score', 'test_split1_score', + 'test_split2_score', 'test_std_score') + n_candidates = n_grid_points + + for search, iid in zip((grid_search, grid_search_iid), (False, True)): + assert_equal(iid, search.iid) + results = search.results_ + # Check results structure + check_results_array_types(results, param_keys, score_keys) + check_results_keys(results, param_keys, score_keys, n_candidates) + # Check masking + results = grid_search.results_ + n_candidates = len(grid_search.results_['params']) + assert_true(all((results['param_C'].mask[i] and + results['param_gamma'].mask[i] and + not results['param_degree'].mask[i]) + for i in range(n_candidates) + if results['param_kernel'][i] == 'linear')) + assert_true(all((not results['param_C'].mask[i] and + not results['param_gamma'].mask[i] and + results['param_degree'].mask[i]) + for i in range(n_candidates) + if results['param_kernel'][i] == 'rbf')) + check_results_grid_scores_consistency(search) + + +def test_random_search_results(): # Make a dataset with a lot of noise to get various kind of prediction # errors across CV folds and parameter settings X, y = make_classification(n_samples=200, n_features=100, n_informative=3, random_state=0) - # XXX: as of today (scipy 0.12) it's not possible to set the random seed - # of scipy.stats distributions: the assertions in this test should thus - # not depend on the randomization - params = dict(C=expon(scale=10), - gamma=expon(scale=0.1)) - n_cv_iter = 3 + # scipy.stats dists now supports `seed` but we still support scipy 0.12 + # which doesn't support the seed. Hence the assertions in the test for + # random_search alone should not depend on randomization. 
+ n_folds = 3 n_search_iter = 30 - search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter, - param_distributions=params, iid=False) - search.fit(X, y) - assert_equal(len(search.grid_scores_), n_search_iter) - - # Check consistency of the structure of each cv_score item - for cv_score in search.grid_scores_: - assert_equal(len(cv_score.cv_validation_scores), n_cv_iter) - # Because we set iid to False, the mean_validation score is the - # mean of the fold mean scores instead of the aggregate sample-wise - # mean score - assert_almost_equal(np.mean(cv_score.cv_validation_scores), - cv_score.mean_validation_score) - assert_equal(list(sorted(cv_score.parameters.keys())), - list(sorted(params.keys()))) - - # Check the consistency with the best_score_ and best_params_ attributes - sorted_grid_scores = list(sorted(search.grid_scores_, - key=lambda x: x.mean_validation_score)) - best_score = sorted_grid_scores[-1].mean_validation_score - assert_equal(search.best_score_, best_score) - - tied_best_params = [s.parameters for s in sorted_grid_scores - if s.mean_validation_score == best_score] - assert_true(search.best_params_ in tied_best_params, - "best_params_={0} is not part of the" - " tied best models: {1}".format( - search.best_params_, tied_best_params)) - - -def test_grid_search_score_consistency(): + params = dict(C=expon(scale=10), gamma=expon(scale=0.1)) + random_search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_folds, + iid=False, param_distributions=params) + random_search.fit(X, y) + random_search_iid = RandomizedSearchCV(SVC(), n_iter=n_search_iter, + cv=n_folds, iid=True, + param_distributions=params) + random_search_iid.fit(X, y) + + param_keys = ('param_C', 'param_gamma') + score_keys = ('test_mean_score', 'test_rank_score', + 'test_split0_score', 'test_split1_score', + 'test_split2_score', 'test_std_score') + n_cand = n_search_iter + + for search, iid in zip((random_search, random_search_iid), (False, True)): + assert_equal(iid, search.iid) + results = search.results_ + # Check results structure + check_results_array_types(results, param_keys, score_keys) + check_results_keys(results, param_keys, score_keys, n_cand) + # For random_search, all the param array vals should be unmasked + assert_false(any(results['param_C'].mask) or + any(results['param_gamma'].mask)) + check_results_grid_scores_consistency(search) + + +def test_search_iid_param(): + # Test the IID parameter + # noise-free simple 2d-data + X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0, + cluster_std=0.1, shuffle=False, n_samples=80) + # split dataset into two folds that are not iid + # first one contains data of all 4 blobs, second only from two. 
+ mask = np.ones(X.shape[0], dtype=np.bool) + mask[np.where(y == 1)[0][::2]] = 0 + mask[np.where(y == 2)[0][::2]] = 0 + # this leads to perfect classification on one fold and a score of 1/3 on + # the other + # create "cv" for splits + cv = [[mask, ~mask], [~mask, mask]] + # once with iid=True (default) + grid_search = GridSearchCV(SVC(), param_grid={'C': [1, 10]}, cv=cv) + random_search = RandomizedSearchCV(SVC(), n_iter=2, + param_distributions={'C': [1, 10]}, + cv=cv) + for search in (grid_search, random_search): + search.fit(X, y) + assert_true(search.iid) + + # Test the first candidate + cv_scores = np.array(list(search.results_['test_split%d_score' % s][0] + for s in range(search.n_splits_))) + mean = search.results_['test_mean_score'][0] + std = search.results_['test_std_score'][0] + + assert_equal(search.results_['param_C'][0], 1) + assert_array_almost_equal(cv_scores, [1, 1. / 3.]) + # for first split, 1/4 of dataset is in test, for second 3/4. + # take weighted average and weighted std + expected_mean = 1 * 1. / 4. + 1. / 3. * 3. / 4. + expected_std = np.sqrt(1. / 4 * (expected_mean - 1) ** 2 + + 3. / 4 * (expected_mean - 1. / 3.) ** 2) + assert_almost_equal(mean, expected_mean) + assert_almost_equal(std, expected_std) + + # once with iid=False + grid_search = GridSearchCV(SVC(), + param_grid={'C': [1, 10]}, + cv=cv, iid=False) + random_search = RandomizedSearchCV(SVC(), n_iter=2, + param_distributions={'C': [1, 10]}, + cv=cv, iid=False) + + for search in (grid_search, random_search): + search.fit(X, y) + assert_false(search.iid) + + cv_scores = np.array(list(search.results_['test_split%d_score' % s][0] + for s in range(search.n_splits_))) + mean = search.results_['test_mean_score'][0] + std = search.results_['test_std_score'][0] + assert_equal(search.results_['param_C'][0], 1) + # scores are the same as above + assert_array_almost_equal(cv_scores, [1, 1. 
/ 3.]) + # Unweighted mean/std is used + assert_almost_equal(mean, np.mean(cv_scores)) + assert_almost_equal(std, np.std(cv_scores)) + + +def test_search_results_rank_tie_breaking(): + X, y = make_blobs(n_samples=50, random_state=42) + + # The two C values are close enough to give similar models + # which would result in a tie of their mean cv-scores + param_grid = {'C': [1, 1.001, 0.001]} + + grid_search = GridSearchCV(SVC(), param_grid=param_grid) + random_search = RandomizedSearchCV(SVC(), n_iter=3, + param_distributions=param_grid) + + for search in (grid_search, random_search): + search.fit(X, y) + results = search.results_ + # Check tie breaking strategy - + # Check that there is a tie in the mean scores between + # candidates 1 and 2 alone + assert_almost_equal(results['test_mean_score'][0], + results['test_mean_score'][1]) + try: + assert_almost_equal(results['test_mean_score'][1], + results['test_mean_score'][2]) + except AssertionError: + pass + # 'min' rank should be assigned to the tied candidates + assert_almost_equal(search.results_['test_rank_score'], [1, 1, 3]) + + +def test_search_results_none_param(): + X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1] + estimators = (DecisionTreeRegressor(), DecisionTreeClassifier()) + est_parameters = {"random_state": [0, None]} + cv = KFold(random_state=0) + + for est in estimators: + grid_search = GridSearchCV(est, est_parameters, cv=cv).fit(X, y) + assert_array_equal(grid_search.results_['param_random_state'], + [0, None]) + + +def test_grid_search_correct_score_results(): # test that correct scores are used + n_folds = 3 clf = LinearSVC(random_state=0) X, y = make_blobs(random_state=0, centers=2) Cs = [.1, 1, 10] for score in ['f1', 'roc_auc']: - grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score) - grid_search.fit(X, y) - cv = StratifiedKFold(n_folds=3) - for C, scores in zip(Cs, grid_search.grid_scores_): + grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_folds) + results = grid_search.fit(X, y).results_ + + # Test scorer names + result_keys = list(results.keys()) + expected_keys = (("test_mean_score", "test_rank_score") + + tuple("test_split%d_score" % cv_i + for cv_i in range(n_folds))) + assert_true(all(in1d(expected_keys, result_keys))) + + cv = StratifiedKFold(n_folds=n_folds) + n_splits = grid_search.n_splits_ + for candidate_i, C in enumerate(Cs): clf.set_params(C=C) - scores = scores[2] # get the separate runs from grid scores - i = 0 - for train, test in cv.split(X, y): + cv_scores = np.array(list(grid_search.results_['test_split%d_score' + % s][candidate_i] + for s in range(n_splits))) + for i, (train, test) in enumerate(cv.split(X, y)): clf.fit(X[train], y[train]) if score == "f1": correct_score = f1_score(y[test], clf.predict(X[test])) elif score == "roc_auc": dec = clf.decision_function(X[test]) correct_score = roc_auc_score(y[test], dec) - assert_almost_equal(correct_score, scores[i]) - i += 1 + assert_almost_equal(correct_score, cv_scores[i]) def test_pickle(): @@ -687,28 +841,32 @@ def test_grid_search_with_multioutput_data(): for est in estimators: grid_search = GridSearchCV(est, est_parameters, cv=cv) grid_search.fit(X, y) - for parameters, _, cv_validation_scores in grid_search.grid_scores_: - est.set_params(**parameters) + res_params = grid_search.results_['params'] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) for i, (train, test) in enumerate(cv.split(X, y)): est.fit(X[train], y[train]) correct_score = est.score(X[test], y[test]) - 
assert_almost_equal(correct_score, - cv_validation_scores[i]) + assert_almost_equal( + correct_score, + grid_search.results_['test_split%d_score' % i][cand_i]) # Test with a randomized search for est in estimators: random_search = RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3) random_search.fit(X, y) - for parameters, _, cv_validation_scores in random_search.grid_scores_: - est.set_params(**parameters) + res_params = random_search.results_['params'] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) for i, (train, test) in enumerate(cv.split(X, y)): est.fit(X[train], y[train]) correct_score = est.score(X[test], y[test]) - assert_almost_equal(correct_score, - cv_validation_scores[i]) + assert_almost_equal( + correct_score, + random_search.results_['test_split%d_score' % i][cand_i]) def test_predict_proba_disabled(): @@ -763,22 +921,24 @@ def test_grid_search_failing_classifier(): # error in this test. gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', refit=False, error_score=0.0) - assert_warns(FitFailedWarning, gs.fit, X, y) - + n_candidates = len(gs.results_['params']) # Ensure that grid scores were set to zero as required for those fits # that are expected to fail. - assert all(np.all(this_point.cv_validation_scores == 0.0) - for this_point in gs.grid_scores_ - if this_point.parameters['parameter'] == - FailingClassifier.FAILING_PARAMETER) + get_cand_scores = lambda i: np.array(list( + gs.results_['test_split%d_score' % s][i] for s in range(gs.n_splits_))) + assert all((np.all(get_cand_scores(cand_i) == 0.0) + for cand_i in range(n_candidates) + if gs.results_['param_parameter'][cand_i] == + FailingClassifier.FAILING_PARAMETER)) gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', refit=False, error_score=float('nan')) assert_warns(FitFailedWarning, gs.fit, X, y) - assert all(np.all(np.isnan(this_point.cv_validation_scores)) - for this_point in gs.grid_scores_ - if this_point.parameters['parameter'] == + n_candidates = len(gs.results_['params']) + assert all(np.all(np.isnan(get_cand_scores(cand_i))) + for cand_i in range(n_candidates) + if gs.results_['param_parameter'][cand_i] == FailingClassifier.FAILING_PARAMETER) From c5e986194a5af5537c5411a6138c6a403ad65db3 Mon Sep 17 00:00:00 2001 From: Raghav R V Date: Thu, 16 Jun 2016 00:09:46 +0200 Subject: [PATCH 2/5] DOC update docs/examples to use results_ instead of grid_scores_ --- .../solutions/exercise_02_sentiment.py | 11 +++++--- .../text_analytics/working_with_text_data.rst | 17 ++++++------ .../model_selection/grid_search_digits.py | 6 +++-- examples/model_selection/randomized_search.py | 26 +++++++++---------- examples/svm/plot_rbf_parameters.py | 7 ++--- examples/svm/plot_svm_scale_c.py | 2 +- 6 files changed, 37 insertions(+), 32 deletions(-) diff --git a/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py b/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py index 85c4989786934..eab418fd0d8ba 100644 --- a/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py +++ b/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py @@ -53,9 +53,14 @@ grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1) grid_search.fit(docs_train, y_train) - # TASK: print the cross-validated scores for the each parameters set - # explored by the grid search - print(grid_search.grid_scores_) + # TASK: print the mean and std for each candidate along with the parameter + # settings for all the candidates explored by grid search. 
+    n_candidates = len(grid_search.results_['params'])
+    for i in range(n_candidates):
+        print(i, 'params - %s; mean - %0.2f; std - %0.2f'
+                 % (grid_search.results_['params'][i],
+                    grid_search.results_['test_mean_score'][i],
+                    grid_search.results_['test_std_score'][i]))
 
     # TASK: Predict the outcome on the testing set and store it in a variable
     # named y_predicted
diff --git a/doc/tutorial/text_analytics/working_with_text_data.rst b/doc/tutorial/text_analytics/working_with_text_data.rst
index 75c333c641bbd..e6d33f3249bce 100644
--- a/doc/tutorial/text_analytics/working_with_text_data.rst
+++ b/doc/tutorial/text_analytics/working_with_text_data.rst
@@ -446,21 +446,22 @@ that we can use to ``predict``::
    >>> twenty_train.target_names[gs_clf.predict(['God is love'])]
    'soc.religion.christian'
 
-but otherwise, it's a pretty large and clumsy object. We can, however, get the
-optimal parameters out by inspecting the object's ``grid_scores_`` attribute,
-which is a list of parameters/score pairs. To get the best scoring attributes,
-we can do::
+The object's ``best_score_`` and ``best_params_`` attributes store the best
+mean score and the corresponding parameter settings::
 
-    >>> best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
+    >>> gs_clf.best_score_
+    0.900...
     >>> for param_name in sorted(parameters.keys()):
-    ...     print("%s: %r" % (param_name, best_parameters[param_name]))
+    ...     print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
     ...
     clf__alpha: 0.001
     tfidf__use_idf: True
     vect__ngram_range: (1, 1)
 
-    >>> score                              # doctest: +ELLIPSIS
-    0.900...
+A more detailed summary of the search is available at ``gs_clf.results_``.
+
+The ``results_`` attribute can be easily imported into pandas as a
+``DataFrame`` for further inspection.
 
 ..
note: diff --git a/examples/model_selection/grid_search_digits.py b/examples/model_selection/grid_search_digits.py index 6b039629df2a5..40ed573247efd 100644 --- a/examples/model_selection/grid_search_digits.py +++ b/examples/model_selection/grid_search_digits.py @@ -60,9 +60,11 @@ print() print("Grid scores on development set:") print() - for params, mean_score, scores in clf.grid_scores_: + means = clf.results_['test_mean_score'] + stds = clf.results_['test_std_score'] + for i in range(len(clf.results_['params'])): print("%0.3f (+/-%0.03f) for %r" - % (mean_score, scores.std() * 2, params)) + % (means[i], stds[i] * 2, clf.results_['params'][i])) print() print("Detailed classification report:") diff --git a/examples/model_selection/randomized_search.py b/examples/model_selection/randomized_search.py index 85a16c6f52d55..e1f7c215ab653 100644 --- a/examples/model_selection/randomized_search.py +++ b/examples/model_selection/randomized_search.py @@ -23,7 +23,6 @@ import numpy as np from time import time -from operator import itemgetter from scipy.stats import randint as sp_randint from sklearn.model_selection import GridSearchCV @@ -40,15 +39,16 @@ # Utility function to report best scores -def report(grid_scores, n_top=3): - top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top] - for i, score in enumerate(top_scores): - print("Model with rank: {0}".format(i + 1)) - print("Mean validation score: {0:.3f} (std: {1:.3f})".format( - score.mean_validation_score, - np.std(score.cv_validation_scores))) - print("Parameters: {0}".format(score.parameters)) - print("") +def report(results, n_top=3): + for i in range(1, n_top + 1): + candidates = np.flatnonzero(results['test_rank_score'] == i) + for candidate in candidates: + print("Model with rank: {0}".format(i)) + print("Mean validation score: {0:.3f} (std: {1:.3f})".format( + results['test_mean_score'][candidate], + results['test_std_score'][candidate])) + print("Parameters: {0}".format(results['params'][candidate])) + print("") # specify parameters and distributions to sample from @@ -68,7 +68,7 @@ def report(grid_scores, n_top=3): random_search.fit(X, y) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) -report(random_search.grid_scores_) +report(random_search.results_) # use a full grid over all parameters param_grid = {"max_depth": [3, None], @@ -84,5 +84,5 @@ def report(grid_scores, n_top=3): grid_search.fit(X, y) print("GridSearchCV took %.2f seconds for %d candidate parameter settings." 
- % (time() - start, len(grid_search.grid_scores_))) -report(grid_search.grid_scores_) + % (time() - start, len(grid_search.results_['params']))) +report(grid_search.results_) diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py index 27bbd94ee00c9..abbac81b18a0b 100644 --- a/examples/svm/plot_rbf_parameters.py +++ b/examples/svm/plot_rbf_parameters.py @@ -171,11 +171,8 @@ def __call__(self, value, clip=None): plt.yticks(()) plt.axis('tight') -# plot the scores of the grid -# grid_scores_ contains parameter settings and scores -# We extract just the scores -scores = [x[1] for x in grid.grid_scores_] -scores = np.array(scores).reshape(len(C_range), len(gamma_range)) +scores = grid.results_['test_mean_score'].reshape(len(C_range), + len(gamma_range)) # Draw heatmap of the validation accuracy as a function of gamma and C # diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py index ed92bc19dcada..996a0190e943a 100644 --- a/examples/svm/plot_svm_scale_c.py +++ b/examples/svm/plot_svm_scale_c.py @@ -131,7 +131,7 @@ cv=ShuffleSplit(train_size=train_size, n_iter=250, random_state=1)) grid.fit(X, y) - scores = [x[1] for x in grid.grid_scores_] + scores = grid.results_['test_mean_score'] scales = [(1, 'No scaling'), ((n_samples * train_size), '1/n_samples'), From 3a53ab0ae03fc98fde3636cf9f7d30bec0b7043b Mon Sep 17 00:00:00 2001 From: Raghav R V Date: Thu, 16 Jun 2016 00:10:21 +0200 Subject: [PATCH 3/5] MAINT define rankdata for older scipy versions --- sklearn/utils/fixes.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 70e1a6ce2849e..a10afa6d4d1e3 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -407,3 +407,39 @@ def array_equal(a1, a2): return bool(np.asarray(a1 == a2).all()) else: from numpy import array_equal + +if sp_version < (0, 13, 0): + def rankdata(a, method='average'): + if method not in ('average', 'min', 'max', 'dense', 'ordinal'): + raise ValueError('unknown method "{0}"'.format(method)) + + arr = np.ravel(np.asarray(a)) + algo = 'mergesort' if method == 'ordinal' else 'quicksort' + sorter = np.argsort(arr, kind=algo) + + inv = np.empty(sorter.size, dtype=np.intp) + inv[sorter] = np.arange(sorter.size, dtype=np.intp) + + if method == 'ordinal': + return inv + 1 + + arr = arr[sorter] + obs = np.r_[True, arr[1:] != arr[:-1]] + dense = obs.cumsum()[inv] + + if method == 'dense': + return dense + + # cumulative counts of each unique value + count = np.r_[np.nonzero(obs)[0], len(obs)] + + if method == 'max': + return count[dense] + + if method == 'min': + return count[dense - 1] + 1 + + # average method + return .5 * (count[dense] + count[dense - 1] + 1) +else: + from scipy.stats import rankdata From 18ac6a1b0a03c8221690f51a0fd3f936878414cc Mon Sep 17 00:00:00 2001 From: Raghav R V Date: Thu, 16 Jun 2016 00:10:31 +0200 Subject: [PATCH 4/5] DOC Add separate section for Model Selection Changes --- doc/whats_new.rst | 94 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 13 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index f4da898822b19..1497aa3807441 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -12,6 +12,57 @@ Version 0.18 Changelog --------- +.. 
_model_selection_changes: + +Model Selection Enhancements and API Changes +-------------------------------------------- + + - **The ``model_selection`` module** + + The new module :mod:`sklearn.model_selection`, which groups together the + functionalities of formerly :mod:`cross_validation`, :mod:`grid_search` and + :mod:`learning_curve`, introduces new possibilities such as nested + cross-validation and better manipulation of parameter searches with Pandas. + + Many things will stay the same but there are some key differences. Read + below to know more about the changes. + + - **Data-independent CV splitters enabling nested cross-validation** + + The new cross-validation splitters, defined in the + :mod:`sklearn.model_selection`, are no longer initialized with any + data-dependent parameters such as ``y``. Instead they expose a + :func:`split` method that takes in the data and yields a generator for the + different splits. + + This change makes it possible to use the cross-validation splitters to + perform nested cross-validation, facilitated by + :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` utilities. + + - **The enhanced `results_` attribute** + + The new ``results_`` attribute (of :class:`model_selection.GridSearchCV` + and :class:`model_selection.RandomizedSearchCV`) introduced in lieu of the + ``grid_scores_`` attribute is a dict of 1D arrays with elements in each + array corresponding to the parameter settings (i.e. search candidates). + + The ``results_`` dict can be easily imported into ``pandas`` as a + ``DataFrame`` for exploring the search results. + + The ``results_`` arrays include scores for each cross-validation split + (with keys such as ``test_split0_score``), as well as their mean + (``test_mean_score``) and standard deviation (``test_std_score``). + + The ranks for the search candidates (based on their mean + cross-validation score) is available at ``results_['test_rank_score']``. + + The parameter values for each parameter is stored separately as numpy + masked object arrays. The value, for that search candidate, is masked if + the corresponding parameter is not applicable. Additionally a list of all + the parameter dicts are stored at ``results_['params']``. + + New features ............ @@ -54,7 +105,7 @@ New features - Added ``algorithm="elkan"`` to :class:`cluster.KMeans` implementing Elkan's fast K-Means algorithm. By `Andreas Müller`_. - - Generalization of :func:`model_selection._validation.cross_val_predict`. + - Generalization of :func:`model_selection.cross_val_predict`. One can pass method names such as `predict_proba` to be used in the cross validation framework instead of the default `predict`. By `Ori Ziv`_ and `Sears Merritt`_. @@ -66,11 +117,10 @@ Enhancements and `Devashish Deshpande`_. - The cross-validation iterators are replaced by cross-validation splitters - available from :mod:`model_selection`. These expose a ``split`` method - that takes in the data and yields a generator for the different splits. - This change makes it possible to do nested cross-validation with ease, - facilitated by :class:`model_selection.GridSearchCV` and similar - utilities. (`#4294 `_) by `Raghav R V`_. + available from :mod:`sklearn.model_selection`. + Ref :ref:`model_selection_changes` for more information. + (`#4294 `_) by + `Raghav R V`_. 
- The random forest, extra trees and decision tree estimators now has a method ``decision_path`` which returns the decision path of samples in @@ -144,6 +194,14 @@ Enhancements - The :func: `ignore_warnings` now accept a category argument to ignore only the warnings of a specified type. By `Thierry Guillemot`_. + - The new ``results_`` attribute of :class:`model_selection.GridSearchCV` + (and :class:`model_selection.RandomizedSearchCV`) can be easily imported + into pandas as a ``DataFrame``. Ref :ref:`model_selection_changes` for + more information. + (`#6697 `_) by + `Raghav R V`_. + + Bug fixes ......... @@ -212,10 +270,12 @@ Bug fixes API changes summary ------------------- - - The :mod:`cross_validation`, :mod:`grid_search` and :mod:`learning_curve` - have been deprecated and the classes and functions have been reorganized into - the :mod:`model_selection` module. - (`#4294 `_) by `Raghav R V`_. + - The :mod:`sklearn.cross_validation`, :mod:`sklearn.grid_search` and + :mod:`sklearn.learning_curve` have been deprecated and the classes and + functions have been reorganized into the :mod:`model_selection` module. + Ref :ref:`model_selection_changes` for more information. + (`#4294 `_) by + `Raghav R V`_. - ``residual_metric`` has been deprecated in :class:`linear_model.RANSACRegressor`. Use ``loss`` instead. By `Manoj Kumar`_. @@ -224,12 +284,20 @@ API changes summary :class:`isotonic.IsotonicRegression`. By `Jonathan Arfa`_. - The old :class:`GMM` is deprecated in favor of the new - :class:`GaussianMixture`. The new class compute the Gaussian mixture - faster than before and some of computationnal problems have been solved. + :class:`GaussianMixture`. The new class computes the Gaussian mixture + faster than before and some of computational problems have been solved. By `Wei Xue`_ and `Thierry Guillemot`_. + - The ``grid_scores_`` attribute of :class:`model_selection.GridSearchCV` + and :class:`model_selection.RandomizedSearchCV` is deprecated in favor of + the attribute ``results_``. + Ref :ref:`model_selection_changes` for more information. + (`#6697 `_) by + `Raghav R V`_. +.. currentmodule:: sklearn + .. _changes_0_17_1: Version 0.17.1 @@ -4088,7 +4156,7 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Matteo Visconti di Oleggio Castello: http://www.mvdoc.me -.. _Raghav R V: https://github.com/rvraghav93 +.. _Raghav R V: https://github.com/raghavrv .. _Trevor Stephens: http://trevorstephens.com/ From e8e2a9c833a674d084972ca71753fc3e58920401 Mon Sep 17 00:00:00 2001 From: Raghav R V Date: Thu, 16 Jun 2016 01:07:35 +0200 Subject: [PATCH 5/5] FIX Remove scaffolding print line. --- sklearn/model_selection/tests/test_search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f872d53f1b128..c3365bd3a7e60 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -525,7 +525,6 @@ def custom_scoring(estimator, X): param_grid=dict(bandwidth=[.01, .1, 1]), scoring=custom_scoring) search.fit(X) - print(search.best_score_) assert_equal(search.best_params_['bandwidth'], .1) assert_equal(search.best_score_, 42)
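To round off the series, a short sketch of the deprecation path added in PATCH 1/5,
showing how the information formerly carried by ``grid_scores_`` maps onto
``results_``; the estimator, data and assertions below are illustrative assumptions,
not part of the patch::

    import warnings
    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    iris = load_iris()
    search = GridSearchCV(SVC(), cv=3,
                          param_grid={'C': [1, 10], 'kernel': ['linear', 'rbf']})
    search.fit(iris.data, iris.target)

    # New access pattern: one entry per candidate in each 1D array.
    means = search.results_['test_mean_score']
    per_split = np.vstack([search.results_['test_split%d_score' % i]
                           for i in range(search.n_splits_)]).T

    # Old access pattern still works during the deprecation period but
    # emits a DeprecationWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        old = search.grid_scores_
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # Each _CVScoreTuple is rebuilt from the corresponding row of results_.
    for cand, (params, mean, scores) in enumerate(old):
        assert params == search.results_['params'][cand]
        np.testing.assert_allclose(scores, per_split[cand])
        np.testing.assert_allclose(mean, means[cand])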