From 0d1401fce929fcea169bfa394b8bca1476032a1c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 12 Feb 2019 10:00:21 -0500 Subject: [PATCH 01/89] More flexible grid search interface --- sklearn/model_selection/_search.py | 31 ++++++++++---------- sklearn/model_selection/tests/test_search.py | 10 +++---- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 0a09e7c5fa0be..0ad3195b06ac5 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -547,7 +547,7 @@ def classes_(self): self._check_is_fitted("classes_") return self.best_estimator_.classes_ - def _run_search(self, evaluate_candidates): + def _run_search(self, evaluate_candidates, X, y): """Repeatedly calls `evaluate_candidates` to conduct a search. This method, implemented in sub-classes, makes it possible to @@ -572,12 +572,12 @@ def _run_search(self, evaluate_candidates): :: - def _run_search(self, evaluate_candidates): + def _run_search(self, evaluate_candidates, X, y): 'Try C=0.1 only if C=1 is better than C=10' - all_results = evaluate_candidates([{'C': 1}, {'C': 10}]) + all_results = evaluate_candidates([{'C': 1}, {'C': 10}], X, y) score = all_results['mean_test_score'] if score[0] < score[1]: - evaluate_candidates([{'C': 0.1}]) + evaluate_candidates([{'C': 0.1}], X, y) """ raise NotImplementedError("_run_search not implemented.") @@ -643,12 +643,12 @@ def fit(self, X, y=None, groups=None, **fit_params): return_parameters=False, error_score=self.error_score, verbose=self.verbose) - results_container = [{}] + results = {} with parallel: all_candidate_params = [] all_out = [] - def evaluate_candidates(candidate_params): + def evaluate_candidates(candidate_params, X, y): candidate_params = list(candidate_params) n_candidates = len(candidate_params) @@ -680,15 +680,14 @@ def evaluate_candidates(candidate_params): all_candidate_params.extend(candidate_params) all_out.extend(out) - # XXX: When we drop Python 2 support, we can use nonlocal - # instead of results_container - results_container[0] = self._format_results( + nonlocal results + results = self._format_results( all_candidate_params, scorers, n_splits, all_out) - return results_container[0] - self._run_search(evaluate_candidates) + return self._format_results( + candidate_params, scorers, n_splits, out) - results = results_container[0] + self._run_search(evaluate_candidates, X, y) # For multi-metric evaluation, store the best_index_, best_params_ and # best_score_ iff refit is one of the scorer names @@ -1145,9 +1144,9 @@ def __init__(self, estimator, param_grid, scoring=None, self.param_grid = param_grid _check_param_grid(param_grid) - def _run_search(self, evaluate_candidates): + def _run_search(self, evaluate_candidates, X, y): """Search all candidates in param_grid""" - evaluate_candidates(ParameterGrid(self.param_grid)) + evaluate_candidates(ParameterGrid(self.param_grid), X, y) class RandomizedSearchCV(BaseSearchCV): @@ -1462,8 +1461,8 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score) - def _run_search(self, evaluate_candidates): + def _run_search(self, evaluate_candidates, X, y): """Search n_iter candidates from param_distributions""" evaluate_candidates(ParameterSampler( self.param_distributions, self.n_iter, - random_state=self.random_state)) + random_state=self.random_state), X, y) diff --git 
a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 521ba92289434..be544b55f47e7 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1659,13 +1659,11 @@ class CustomSearchCV(BaseSearchCV): def __init__(self, estimator, **kwargs): super().__init__(estimator, **kwargs) - def _run_search(self, evaluate): - results = evaluate([{'max_depth': 1}, {'max_depth': 2}]) + def _run_search(self, evaluate, X, y): + results = evaluate([{'max_depth': 1}, {'max_depth': 2}], X, y) check_results(results, fit_grid({'max_depth': [1, 2]})) - results = evaluate([{'min_samples_split': 5}, - {'min_samples_split': 10}]) - check_results(results, fit_grid([{'max_depth': [1, 2]}, - {'min_samples_split': [5, 10]}])) + evaluate([{'min_samples_split': 5}, + {'min_samples_split': 10}], X, y) # Using regressor to make sure each score differs clf = DecisionTreeRegressor(random_state=0) From 80963f35c6a4416a8e68ff9ca2099844cbeac20c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 13 Feb 2019 17:13:04 -0500 Subject: [PATCH 02/89] added info dict parameter --- sklearn/model_selection/_search.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 0ad3195b06ac5..353938a1efa0a 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -647,8 +647,9 @@ def fit(self, X, y=None, groups=None, **fit_params): with parallel: all_candidate_params = [] all_out = [] + all_info = defaultdict(list) - def evaluate_candidates(candidate_params, X, y): + def evaluate_candidates(candidate_params, X, y, info=None): candidate_params = list(candidate_params) n_candidates = len(candidate_params) @@ -679,13 +680,15 @@ def evaluate_candidates(candidate_params, X, y): all_candidate_params.extend(candidate_params) all_out.extend(out) + if info is not None: + for key, value in info.items(): + all_info[key].extend(value) nonlocal results results = self._format_results( - all_candidate_params, scorers, n_splits, all_out) + all_candidate_params, scorers, n_splits, all_out, all_info) - return self._format_results( - candidate_params, scorers, n_splits, out) + return results self._run_search(evaluate_candidates, X, y) @@ -727,7 +730,7 @@ def evaluate_candidates(candidate_params, X, y): return self - def _format_results(self, candidate_params, scorers, n_splits, out): + def _format_results(self, candidate_params, scorers, n_splits, out, info): n_candidates = len(candidate_params) # if one choose to see train score, "out" will contain train score info @@ -744,7 +747,7 @@ def _format_results(self, candidate_params, scorers, n_splits, out): if self.return_train_score: train_scores = _aggregate_score_dicts(train_score_dicts) - results = {} + results = dict(info) def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" From 326fe39cfa802f69b68588a30eb06a677f06e80a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 13 Feb 2019 17:17:02 -0500 Subject: [PATCH 03/89] Put back removed test --- sklearn/model_selection/tests/test_search.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index be544b55f47e7..fa65a3be95f55 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py 
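Aside on the interface introduced by the two patches above: a subclass's
``_run_search`` now receives ``X`` and ``y`` and forwards them to
``evaluate_candidates``, which also accepts an ``info`` dict of
per-candidate lists that get merged into ``cv_results_`` (later patches in
this series rename ``info`` to ``more_results`` and add ``groups``). A
minimal sketch of a custom search built on that interface; the class name,
the candidate values and the ``subset_size`` key are invented for
illustration and are not part of the patch::

    import numpy as np
    from sklearn.model_selection._search import BaseSearchCV

    class TwoStageSearchCV(BaseSearchCV):
        """Illustrative only: screen a coarse grid on a data subset,
        then re-evaluate the two best-looking candidates on all data."""

        def _run_search(self, evaluate_candidates, X, y):
            coarse = [{'C': c} for c in (0.1, 1, 10, 100)]
            # First pass on the first 200 samples; record the subset size
            # in cv_results_ via `info` (one list entry per candidate).
            results = evaluate_candidates(
                coarse, X[:200], y[:200],
                info={'subset_size': [200] * len(coarse)})
            order = np.argsort(results['mean_test_score'])[::-1]
            best_two = [results['params'][i] for i in order[:2]]
            # Second pass on all the data.
            evaluate_candidates(best_two, X, y,
                                info={'subset_size': [X.shape[0]] * 2})

Each call appends to the same accumulated results, so ``subset_size`` ends
up with one entry per evaluated candidate in ``cv_results_``.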
@@ -1662,8 +1662,10 @@ def __init__(self, estimator, **kwargs): def _run_search(self, evaluate, X, y): results = evaluate([{'max_depth': 1}, {'max_depth': 2}], X, y) check_results(results, fit_grid({'max_depth': [1, 2]})) - evaluate([{'min_samples_split': 5}, - {'min_samples_split': 10}], X, y) + results = evaluate([{'min_samples_split': 5}, + {'min_samples_split': 10}], X, y) + check_results(results, fit_grid([{'max_depth': [1, 2]}, + {'min_samples_split': [5, 10]}])) # Using regressor to make sure each score differs clf = DecisionTreeRegressor(random_state=0) From bae0d950e23d29d4bf3fccde447991d2e5f4ce1d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Feb 2019 10:34:16 -0500 Subject: [PATCH 04/89] renamed info into more_results --- sklearn/model_selection/_search.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 353938a1efa0a..5a7558012a784 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -647,9 +647,9 @@ def fit(self, X, y=None, groups=None, **fit_params): with parallel: all_candidate_params = [] all_out = [] - all_info = defaultdict(list) + all_more_results = defaultdict(list) - def evaluate_candidates(candidate_params, X, y, info=None): + def evaluate_candidates(candidate_params, X, y, more_results=None): candidate_params = list(candidate_params) n_candidates = len(candidate_params) @@ -680,13 +680,14 @@ def evaluate_candidates(candidate_params, X, y, info=None): all_candidate_params.extend(candidate_params) all_out.extend(out) - if info is not None: - for key, value in info.items(): - all_info[key].extend(value) + if more_results is not None: + for key, value in more_results.items(): + all_more_results[key].extend(value) nonlocal results results = self._format_results( - all_candidate_params, scorers, n_splits, all_out, all_info) + all_candidate_params, scorers, n_splits, all_out, + all_more_results) return results @@ -730,7 +731,8 @@ def evaluate_candidates(candidate_params, X, y, info=None): return self - def _format_results(self, candidate_params, scorers, n_splits, out, info): + def _format_results(self, candidate_params, scorers, n_splits, out, + more_results={}): n_candidates = len(candidate_params) # if one choose to see train score, "out" will contain train score info @@ -747,7 +749,7 @@ def _format_results(self, candidate_params, scorers, n_splits, out, info): if self.return_train_score: train_scores = _aggregate_score_dicts(train_score_dicts) - results = dict(info) + results = dict(more_results) def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" From 7d4cb56c166d2456efbd456bbd6c5bf0b4e015ab Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 26 Mar 2019 10:51:34 -0400 Subject: [PATCH 05/89] Passed grroups as well since we need n_to use get_n_splits(X, y, groups) --- sklearn/model_selection/_search.py | 15 ++++++++------- sklearn/model_selection/tests/test_search.py | 7 ++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index c737a92788d33..4885586d9a694 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -548,7 +548,7 @@ def classes_(self): self._check_is_fitted("classes_") return self.best_estimator_.classes_ - def _run_search(self, evaluate_candidates, X, y): + def _run_search(self, 
evaluate_candidates, X, y, groups): """Repeatedly calls `evaluate_candidates` to conduct a search. This method, implemented in sub-classes, makes it possible to @@ -650,7 +650,8 @@ def fit(self, X, y=None, groups=None, **fit_params): all_out = [] all_more_results = defaultdict(list) - def evaluate_candidates(candidate_params, X, y, more_results=None): + def evaluate_candidates(candidate_params, X, y, groups, + more_results=None): candidate_params = list(candidate_params) n_candidates = len(candidate_params) @@ -692,7 +693,7 @@ def evaluate_candidates(candidate_params, X, y, more_results=None): return results - self._run_search(evaluate_candidates, X, y) + self._run_search(evaluate_candidates, X, y, groups) # For multi-metric evaluation, store the best_index_, best_params_ and # best_score_ iff refit is one of the scorer names @@ -1152,9 +1153,9 @@ def __init__(self, estimator, param_grid, scoring=None, self.param_grid = param_grid _check_param_grid(param_grid) - def _run_search(self, evaluate_candidates, X, y): + def _run_search(self, evaluate_candidates, X, y, groups): """Search all candidates in param_grid""" - evaluate_candidates(ParameterGrid(self.param_grid), X, y) + evaluate_candidates(ParameterGrid(self.param_grid), X, y, groups) class RandomizedSearchCV(BaseSearchCV): @@ -1470,8 +1471,8 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score) - def _run_search(self, evaluate_candidates, X, y): + def _run_search(self, evaluate_candidates, X, y, groups): """Search n_iter candidates from param_distributions""" evaluate_candidates(ParameterSampler( self.param_distributions, self.n_iter, - random_state=self.random_state), X, y) + random_state=self.random_state), X, y, groups) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 5b88edc407b87..762ddb9929ec5 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1661,11 +1661,12 @@ class CustomSearchCV(BaseSearchCV): def __init__(self, estimator, **kwargs): super().__init__(estimator, **kwargs) - def _run_search(self, evaluate, X, y): - results = evaluate([{'max_depth': 1}, {'max_depth': 2}], X, y) + def _run_search(self, evaluate, X, y, groups): + results = evaluate([{'max_depth': 1}, {'max_depth': 2}], + X, y, groups) check_results(results, fit_grid({'max_depth': [1, 2]})) results = evaluate([{'min_samples_split': 5}, - {'min_samples_split': 10}], X, y) + {'min_samples_split': 10}], X, y, groups) check_results(results, fit_grid([{'max_depth': [1, 2]}, {'min_samples_split': [5, 10]}])) From cdb6b509128379312062a5b3ff3d02f71b8fcfb3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 May 2019 15:58:33 -0400 Subject: [PATCH 06/89] port --- doc/modules/classes.rst | 2 + doc/modules/grid_search.rst | 195 ++++ .../plot_successive_halving_heatmap.py | 93 ++ .../plot_successive_halving_iterations.py | 50 + sklearn/model_selection/__init__.py | 5 + .../_search_successive_halving.py | 861 ++++++++++++++++++ .../tests/test_successive_halving.py | 276 ++++++ 7 files changed, 1482 insertions(+) create mode 100644 examples/model_selection/plot_successive_halving_heatmap.py create mode 100644 examples/model_selection/plot_successive_halving_iterations.py create mode 100644 sklearn/model_selection/_search_successive_halving.py create mode 100644 sklearn/model_selection/tests/test_successive_halving.py diff --git 
a/doc/modules/classes.rst b/doc/modules/classes.rst index ad120ed2d76b2..21ab478ced1bb 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1058,9 +1058,11 @@ Hyper-parameter optimizers :template: class.rst model_selection.GridSearchCV + model_selection.GridSuccessiveHalving model_selection.ParameterGrid model_selection.ParameterSampler model_selection.RandomizedSearchCV + model_selection.RandomSuccessiveHalving .. autosummary:: diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 3287ff95dcdcf..baaf92d789ab0 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -231,6 +231,201 @@ some parameter settings could be fully evaluated. Setting ``error_score=0`` warning and setting the score for that fold to 0 (or `NaN`), but completing the search. + +.. _successive_halving_user_guide: + +Searching optimal parameters with successive halving +==================================================== + +Scikit-learn also provides the :class:`GridSuccessiveHalving` and +:class:`RandomSuccessiveHalving` estimators that can be used to +search a parameter space using successive halving [1]_ [2]_. Successive +halving is an iterative selection process where all candidates are evaluated +with a small amount of resources at the first iteration. Only a subset of +these candidates are selected for the next iteration, which will be +allocated more resources. What defines a resource is typically the number of +samples to train on, or the number of trees for a gradient boosting / +decision forest estimator. + +As illustrated in the figure below, only a small subset of candidates 'survive' +until the last iteration. These are the candidates that have consistently been +part of the best candidates across all iterations. + +.. figure:: ../auto_examples/svm/images/sphx_glr_plot_successive_halving_iterations_001.png + :target: ../auto_examples/model_selection/plot_successive_halving_iterations.html + :align: center + +The amount of resources ``r_i`` allocated for each candidate at iteration +``i`` is controlled by the parameters ``ratio`` and ``r_min`` as follows:: + + r_i = ratio**i * r_min + +``r_min`` is the amount of resources used at the first iteration and +``ratio`` defines the proportions of candidates that will be selected for +the next iteration:: + + n_candidates_to_keep = n_candidates_at_i // ratio + +Note that each ``r_i`` is a multiple of both ``ratio`` and ``r_min``. + +Choosing the budget +------------------- + +By default, the budget is defined as the number of samples. That is, each +iteration will use an increasing amount of samples to train on. You can however +manually specify a parameter to use as the budget with the ``budget_on`` +parameter. Here is an example where the budget is defined as the number of +iterations of a random forest:: + + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> import pandas as pd + >>> from dabl.search import GridSuccessiveHalving + >>> + >>> parameters = {'max_depth': [3, 5, 10], + ... 'min_samples_split': [2, 5, 10]} + >>> base_estimator = RandomForestClassifier(random_state=0) + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + ... ratio=2, + ... budget_on='n_estimators', + ... max_budget=30, + ... random_state=0, + ... 
).fit(X, y) + >>> sh.best_estimator_ + RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', + max_depth=5, max_features='auto', max_leaf_nodes=None, + min_impurity_decrease=0.0, min_impurity_split=None, + min_samples_leaf=1, min_samples_split=2, + min_weight_fraction_leaf=0.0, n_estimators=8, + n_jobs=None, oob_score=False, random_state=0, verbose=0, + warm_start=False) + +Note that it is not possible to budget on a parameter that is part of the +parameter space. + +Exhausting the budget +--------------------- + +As mentioned above, the first iteration uses ``r_min`` resources. If you have +a big budget, this may be a waste of resource:: + + >>> from sklearn.datasets import make_classification + >>> from sklearn.svm import SVC + >>> import pandas as pd + >>> from dabl.search import GridSuccessiveHalving + >>> parameters = {'kernel': ('linear', 'rbf'), + ... 'C': [1, 10, 100]} + >>> base_estimator = SVC(gamma='scale') + >>> X, y = make_classification(n_samples=1000) + >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + ... ratio=2).fit(X, y) + >>> results = pd.DataFrame.from_dict(sh.cv_results_) + >>> results.groupby('iter').r_i.unique() + iter + 0 [20] + 1 [40] + 2 [80] + Name: r_i, dtype: object + +The search process will only use 80 resources at most, while our maximum budget +is ``n_samples=1000``. Note in this case that ``r_min = r_0 = 20``. In order +for the last iteration to use as many resources as possible, you can use the +``force_exhaust_budget`` parameter:: + + >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + ... ratio=2, force_exhaust_budget=True, + ... ).fit(X, y) + >>> results = pd.DataFrame.from_dict(sh.cv_results_) + >>> results.groupby('iter').r_i.unique() + iter + 0 [250] + 1 [500] + 2 [1000] + Name: r_i, dtype: object + + +Since ``force_exhaust_budget`` chooses an appropriate ``r_min`` to start +with, ``r_min`` must be set to 'auto'. + +Aggressive elimination of candidates +------------------------------------ + +Ideally, we want the last iteration to evaluate ``ratio`` candidates. We then +just have to pick the best one. When the number budget is small with respect to +the number of candidates, the last iteration may have to evaluate more than +``ratio`` candidates.:: + >>> from sklearn.datasets import make_classification + >>> from sklearn.svm import SVC + >>> import pandas as pd + >>> from dabl.search import GridSuccessiveHalving + >>> + >>> + >>> parameters = {'kernel': ('linear', 'rbf'), + ... 'C': [1, 10, 100]} + >>> base_estimator = SVC(gamma='scale') + >>> X, y = make_classification(n_samples=1000) + >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + ... ratio=2, + ... max_budget=40, + ... aggressive_elimination=False, + ... ).fit(X, y) + >>> results = pd.DataFrame.from_dict(sh.cv_results_) + >>> results.groupby('iter').r_i.unique() + iter + 0 [20] + 1 [40] + Name: r_i, dtype: object + >>> results.groupby('iter').r_i.count() # number of candidates used at each iteration + iter + 0 6 + 1 3 + Name: r_i, dtype: int64 + +Since we cannot use more than ``max_budget=40`` resources, the process has to +stop at the second iteration which evaluates more than ``ratio=2`` candidates. + +Using the ``aggressive_elimination`` parameter, you can force the search +process to end up with less than ``ratio`` candidates at the last +iteration. 
To do this, the process will eliminate as many candidates as +necessary using ``r_min`` resources:: + + >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + ... ratio=2, + ... max_budget=40, + ... aggressive_elimination=True, + ... ).fit(X, y) + >>> results = pd.DataFrame.from_dict(sh.cv_results_) + >>> results.groupby('iter').r_i.unique() + iter + 0 [20] + 1 [20] + 2 [40] + Name: r_i, dtype: object + >>> results.groupby('iter').r_i.count() # number of candidates used at each iteration + iter + 0 6 + 1 3 + 2 2 + Name: r_i, dtype: int64 + +Notice that we end with 2 candidates at the last iteration since we have +eliminated enough candidates during the first iterations, using ``r_i = r_min = +20``. + + +.. topic:: References: + + .. [1] K. Jamieson, A. Talwalkar, + `Non-stochastic Best Arm Identification and Hyperparameter + Optimization `_, in + proc. of Machine Learning Research, 2016. + .. [2] L. Li, K. Jamieson, G. DeSalvo, A. Rostamizadeh, .A Talwalkar, + `Hyperband: A Novel Bandit-Based Approach to Hyperparameter Optimization + `_, in Machine Learning Research + 18, 2018. + + .. _alternative_cv: Alternatives to brute force parameter search diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py new file mode 100644 index 0000000000000..5886e5d1cbdcd --- /dev/null +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -0,0 +1,93 @@ +""" +Comparison between grid search and successive halving +===================================================== +""" +from time import time + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from sklearn.svm import SVC +from sklearn import datasets +from sklearn.model_selection import GridSearchCV +from sklearn.model_selection import GridSuccessiveHalving + + +rng = np.random.RandomState(0) +X, y = datasets.make_classification(n_samples=1000, random_state=rng) + +gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7] +Cs = [1, 10, 100, 1e3, 1e4, 1e5] +param_grid = {'gamma': gammas, 'C': Cs} + +clf = SVC(random_state=rng) +tic = time() +gsh = GridSuccessiveHalving( + estimator=clf, + param_grid=param_grid, + budget_on='n_samples', # budget is the number of samples + max_budget='auto', # max_budget=n_samples + force_exhaust_budget=True, + cv=5, + ratio=2, + random_state=rng) +gsh.fit(X, y) +gsh_time = time() - tic + +tic = time() +gs = GridSearchCV( + estimator=clf, + param_grid=param_grid, + cv=5) +gs.fit(X, y) +gs_time = time() - tic + + +def make_heatmap(ax, gs, show_iter=False, make_cbar=False): + results = pd.DataFrame.from_dict(gs.cv_results_) + results['params_str'] = results.params.apply(str) + scores = results.groupby(['param_gamma', 'param_C']).mean_test_score.max() + scores_matrix = scores.values.reshape(len(gammas), len(Cs)) + + im = ax.imshow(scores_matrix) + + ax.set_xticks(np.arange(len(Cs))) + ax.set_xticklabels(['{:.0E}'.format(x) for x in Cs]) + ax.set_xlabel('C', fontsize=15) + + ax.set_yticks(np.arange(len(gammas))) + ax.set_yticklabels(['{:.0E}'.format(x) for x in gammas]) + ax.set_ylabel('gamma', fontsize=15) + + # Rotate the tick labels and set their alignment. 
+ plt.setp(ax.get_xticklabels(), rotation=45, ha="right", + rotation_mode="anchor") + + if show_iter: + iterations = results.groupby(['param_gamma', 'param_C']).iter.max() + iterations_matrix = iterations.values.reshape(len(gammas), len(Cs)) + for i in range(len(gammas)): + for j in range(len(Cs)): + ax.text(j, i, iterations_matrix[i, j], + ha="center", va="center", color="w", fontsize=20) + + if make_cbar: + fig.subplots_adjust(right=0.8) + cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7]) + fig.colorbar(im, cax=cbar_ax) + cbar_ax.set_ylabel('max mean_test_score', rotation=-90, va="bottom", + fontsize=15) + + +fig, axes = plt.subplots(ncols=2) +ax1, ax2 = axes + +make_heatmap(ax1, gsh, show_iter=True) +make_heatmap(ax2, gs, make_cbar=True) + +ax1.set_title('Successive Halving (time = {:.3f}s)'.format(gsh_time), + fontsize=15) +ax2.set_title('GridSearch (time = {:.3f}s)'.format(gs_time), fontsize=15) + +plt.show() diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py new file mode 100644 index 0000000000000..0603c41c536a1 --- /dev/null +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -0,0 +1,50 @@ +""" +Successive Halving Iterations +============================= +""" +import pandas as pd +from sklearn import datasets +import matplotlib.pyplot as plt +from scipy.stats import randint +import numpy as np + +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import RandomSuccessiveHalving + + +rng = np.random.RandomState(0) + +X, y = datasets.make_classification(n_samples=700, random_state=rng) + +clf = RandomForestClassifier(n_estimators=20, random_state=rng) + +param_dist = {"max_depth": [3, None], + "max_features": randint(1, 11), + "min_samples_split": randint(2, 11), + "bootstrap": [True, False], + "criterion": ["gini", "entropy"]} + +rsh = RandomSuccessiveHalving( + estimator=clf, + param_distributions=param_dist, + budget_on='n_samples', # budget is the number of samples + max_budget='auto', # max_budget=n_samples + n_candidates='auto', # choose n_cdts so that last iter exhausts budget + cv=5, + ratio=2, + random_state=rng) +rsh.fit(X, y) + +results = pd.DataFrame(rsh.cv_results_) +results['params_str'] = results.params.apply(str) +mean_scores = results.pivot(index='iter', columns='params_str', + values='mean_test_score') +ax = mean_scores.plot(legend=False, alpha=.6) + +r_i_list = results.groupby('iter').r_i.unique() +labels = ['{}\nn_samples={}'.format(i, r_i_list[i]) + for i in range(rsh.n_iterations_)] +ax.set_xticklabels(labels) +ax.set_title('Candidate scores over iterations') +ax.set_ylabel('score') +plt.show() diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 82a9b9371710d..96ac1077e53f4 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -29,8 +29,12 @@ from ._search import ParameterSampler from ._search import fit_grid_point +from ._search_successive_halving import GridSuccessiveHalving +from ._search_successive_halving import RandomSuccessiveHalving + __all__ = ('BaseCrossValidator', 'GridSearchCV', + 'GridSuccessiveHalving', 'TimeSeriesSplit', 'KFold', 'GroupKFold', @@ -45,6 +49,7 @@ 'ParameterSampler', 'PredefinedSplit', 'RandomizedSearchCV', + 'RandomizedSuccessiveHalving', 'ShuffleSplit', 'StratifiedKFold', 'StratifiedShuffleSplit', diff --git a/sklearn/model_selection/_search_successive_halving.py 
b/sklearn/model_selection/_search_successive_halving.py new file mode 100644 index 0000000000000..5373e8b8ff766 --- /dev/null +++ b/sklearn/model_selection/_search_successive_halving.py @@ -0,0 +1,861 @@ +from math import ceil, floor, log +from abc import abstractmethod + +import numpy as np +from ._search import _check_param_grid +from ._search import BaseSearchCV +from . import ParameterGrid, ParameterSampler +from ..utils import check_random_state +from ..utils.validation import _num_samples +from ..base import is_classifier +from ._split import check_cv +from ..utils import resample + + +__all__ = ['GridSuccessiveHalving', 'RandomSuccessiveHalving'] + + +def _refit_callable(results): + # Custom refit callable to return the index of the best candidate. We want + # the best candidate out of the last iteration. By default BaseSearchCV + # would return the best candidate out of all iterations. + + last_iter = np.max(results['iter']) + sorted_indices = np.argsort(results['mean_test_score'])[::-1] + best_index = next(i for i in sorted_indices + if results['iter'][i] == last_iter) + return best_index + + +class BaseSuccessiveHalving(BaseSearchCV): + """Implements successive halving. + + Ref: + Almost optimal exploration in multi-armed bandits, ICML 13 + Zohar Karnin, Tomer Koren, Oren Somekh + """ + def __init__(self, estimator, scoring=None, + n_jobs=None, refit=True, cv=5, verbose=0, + pre_dispatch='2*n_jobs', random_state=None, + error_score=np.nan, return_train_score=True, + max_budget='auto', budget_on='n_samples', ratio=3, + r_min='auto', aggressive_elimination=False, + force_exhaust_budget=False): + + refit = _refit_callable if refit else False + super().__init__(estimator, scoring=scoring, + n_jobs=n_jobs, refit=refit, cv=cv, + verbose=verbose, pre_dispatch=pre_dispatch, + error_score=error_score, + return_train_score=return_train_score) + + self.random_state = random_state + self.max_budget = max_budget + self.budget_on = budget_on + self.ratio = ratio + self.r_min = r_min + self.aggressive_elimination = aggressive_elimination + self.force_exhaust_budget = force_exhaust_budget + + def _check_input_parameters(self, X, y, groups): + + if self.scoring is not None and not (isinstance(self.scoring, str) + or callable(self.scoring)): + raise ValueError('scoring parameter must be a string, ' + 'a callable or None.') + + if (self.budget_on != 'n_samples' + and self.budget_on not in self.estimator.get_params()): + raise ValueError( + 'Cannot budget on parameter {} which is not supported ' + 'by estimator {}'.format(self.budget_on, + self.estimator.__class__.__name__)) + + if isinstance(self.max_budget, str) and self.max_budget != 'auto': + raise ValueError( + "max_budget must be either 'auto' or a positive number" + ) + if self.max_budget != 'auto' and self.max_budget <= 0: + raise ValueError( + "max_budget must be either 'auto' or a positive number" + ) + + if isinstance(self.r_min, str) and self.r_min != 'auto': + raise ValueError( + "r_min must be either 'auto' or a positive number no greater " + "than max_budget." + ) + if self.r_min != 'auto' and self.r_min <= 0: + raise ValueError( + "r_min must be either 'auto' or a positive number no greater " + "than max_budget." + ) + + if self.force_exhaust_budget and self.r_min != 'auto': + raise ValueError( + 'r_min must be set to auto if force_exhaust_budget is True.' 
+ ) + + self.r_min_ = self.r_min + if self.r_min_ == 'auto': + if self.budget_on == 'n_samples': + cv = check_cv(self.cv, y, + classifier=is_classifier(self.estimator)) + n_splits = cv.get_n_splits(X, y, groups) + + # please see https://gph.is/1KjihQe for a justification + magic_factor = 2 + self.r_min_ = n_splits * magic_factor + if is_classifier(self.estimator): + n_classes = np.unique(y).shape[0] + self.r_min_ *= n_classes + else: + self.r_min_ = 1 + + self.max_budget_ = self.max_budget + if self.max_budget_ == 'auto': + if not self.budget_on == 'n_samples': + raise ValueError( + "max_budget can only be 'auto' if budget_on='n_samples'") + self.max_budget_ = _num_samples(X) + + if self.r_min_ > self.max_budget_: + raise ValueError( + 'r_min_={} is greater than max_budget_={}.' + .format(self.r_min_, self.max_budget_) + ) + + def fit(self, X, y=None, groups=None, **fit_params): + self._check_input_parameters( + X=X, + y=y, + groups=groups, + ) + super().fit(X, y=y, groups=groups, **fit_params) + # Set best_score_: BaseSearchCV does not set it, as refit is a callable + self.best_score_ = ( + self.cv_results_['mean_test_score'][self.best_index_]) + return self + + def _run_search(self, evaluate_candidates, X, y, groups): + rng = check_random_state(self.random_state) + + candidate_params = self._generate_candidate_params() + # Remove duplicates (may happen with random sampling) + candidate_params = set(tuple(d.items()) for d in candidate_params) + candidate_params = [dict(t) for t in candidate_params] + self.n_candidates_ = len(candidate_params) + + if self.budget_on != 'n_samples' and any( + self.budget_on in candidate for candidate in candidate_params): + # Can only check this now since we need the candidates list + raise ValueError( + "Cannot budget on parameter {} since it is part of " + "the searched parameters.".format(self.budget_on)) + + # n_required_iterations is the number of iterations needed so that the + # last iterations evaluates less than `ratio` candidates. + n_required_iterations = 1 + floor(log(self.n_candidates_, self.ratio)) + + if self.force_exhaust_budget: + # To exhaust the budget, we want to start with the biggest r_min + # possible so that the last (required) iteration uses as many + # resources as possible + # We only force exhausting the budget if r_min wasn't specified by + # the user. + last_iteration = n_required_iterations - 1 + self.r_min_ = max(self.r_min_, + self.max_budget_ // self.ratio**last_iteration) + + # n_possible iterations is the number of iterations that we can + # actually do starting from r_min and without exceeding the budget. + # Depending on budget size the number of candidates, this may be higher + # or smaller than n_required_iterations. 
+ n_possible_iterations = 1 + floor(log(self.max_budget_ // self.r_min_, + self.ratio)) + + if self.aggressive_elimination: + n_iterations = n_required_iterations + else: + n_iterations = min(n_possible_iterations, n_required_iterations) + + if self.verbose: + print('n_iterations: {}'.format(n_iterations)) + print('n_required_iterations: {}'.format(n_required_iterations)) + print('n_possible_iterations: {}'.format(n_possible_iterations)) + print('r_min_: {}'.format(self.r_min_)) + print('max_budget_: {}'.format(self.max_budget_)) + print('aggressive_elimination: {}'.format( + self.aggressive_elimination)) + print('force_exhaust_budget: {}'.format(self.force_exhaust_budget)) + print('ratio: {}'.format(self.ratio)) + + self._r_i_list = [] # list of r_i for each iteration, used in tests + + for iter_i in range(n_iterations): + + power = iter_i # default + if self.aggressive_elimination: + # this will set r_i to the initial value (i.e. the value of r_i + # at the first iteration) for as many iterations as needed + # (while candidates are being eliminated), and then go on as + # usual. + power = max( + 0, + iter_i - n_required_iterations + n_possible_iterations + ) + + r_i = int(self.ratio**power * self.r_min_) + r_i = min(r_i, self.max_budget_) # guard, probably not needed + self._r_i_list.append(r_i) + + n_candidates = len(candidate_params) + + if self.verbose: + print('-' * 10) + print('iter_i: {}'.format(iter_i)) + print('n_candidates: {}'.format(n_candidates)) + print('r_i: {}'.format(r_i)) + + if self.budget_on == 'n_samples': + stratify = y if is_classifier(self.estimator) else None + X_iter, y_iter = resample(X, y, replace=False, + random_state=rng, stratify=stratify, + n_samples=r_i) + else: + # Need copy so that r_i of next iteration do not overwrite + candidate_params = [c.copy() for c in candidate_params] + for candidate in candidate_params: + candidate[self.budget_on] = r_i + X_iter, y_iter = X, y + more_results = {'iter': [iter_i] * n_candidates, + 'r_i': [r_i] * n_candidates} + results = evaluate_candidates(candidate_params, X_iter, y_iter, + groups, more_results=more_results) + + n_candidates_to_keep = ceil(n_candidates / self.ratio) + candidate_params = self._top_k(results, + n_candidates_to_keep, + iter_i) + + self.n_remaining_candidates_ = len(candidate_params) + self.n_required_iterations_ = n_required_iterations + self.n_possible_iterations_ = n_possible_iterations + self.n_iterations_ = n_iterations + + def _top_k(self, results, k, iter_i): + # Return the best candidates of a given iteration + # We need to filter out candidates from the previous iterations + # when sorting + + best_candidates_indices = np.argsort(results['mean_test_score'])[::-1] + best_candidates_indices = [idx for idx in best_candidates_indices + if results['iter'][idx] == iter_i] + best_candidates_indices = best_candidates_indices[:k] + return [results['params'][idx] for idx in best_candidates_indices] + + @abstractmethod + def _generate_candidate_params(self): + pass + + +class GridSuccessiveHalving(BaseSuccessiveHalving): + """Search over specified parameter values with successive halving. + + The search strategy starts evaluating all the candidates with a small + amount a resource and iteratively selects the best candidates, using more + and more resources. + + Read more in the :ref:`User guide`. + + Parameters + ---------- + estimator : estimator object. + This is assumed to implement the scikit-learn estimator interface. 
+ Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + scoring : string, callable, or None, default: None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring`) to evaluate the predictions on the test set. + If None, the estimator's score method is used. + + n_jobs : int or None, optional (default=None) + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + cv : int, cross-validation generator or an iterable, optional (default=5) + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + refit : boolean, default=True + If True, refit an estimator using the best found parameters on the + whole dataset. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``GridSearchCV`` instance. + + verbose : integer + Controls the verbosity: the higher, the more messages. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. Default is ``np.nan`` + + return_train_score : boolean, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + max_budget : int, optional(default='auto') + The maximum number of resources that any candidate is allowed to use + for a given iteration. By default, this is set ``n_samples`` when + ``budget_on='n_samples'`` (default), else an error is raised. 
+ + budget_on : `n_samples` or str, optional(default='n_samples') + Defines the nature of the budget. By default, the budget is the number + of samples. It can also be set to any parameter of the base estimator + that accepts positive integer values, e.g. 'n_iterations' or + 'n_estimators' for a gradient boosting estimator. In this case + ``max_budget`` cannot be 'auto'. + + ratio : int or float, optional(default=3) + The 'halving' parameter, which determines the proportion of candidates + that are selected for the next iteration. For example, ``ratio=3`` + means that only one third of the candidates are selected. + + r_min : int, optional(default='auto') + The minimum amount of resource that any candidate is allowed to use for + a given iteration. Equivalently, this defines the amount of resources + that are allocated for each candidate at the first iteration. By + default, this is set to: + + - ``n_splits * 2`` when ``budget_on='n_samples'`` for a regression + problem + - ``n_classes * n_splits * 2`` when ``budget_on='n_samples'`` for a + regression problem + - The highest possible value satisfying the constraint + ``force_exhaust_budget=True``. + - ``1`` when ``budget_on!='n_samples'`` + + Note that the amount of resources used at each iteration is always a + multiple of ``r_min``. + + aggressive_elimination : bool, optional(default=False) + This is only relevant in cases where there isn't enough budget to + eliminate enough candidates at the last iteration. If ``True``, then + the search process will 'replay' the first iteration for as long as + needed until the number of candidates is small enough. This is + ``False`` by default, which means that the last iteration may evaluate + more than ``ratio`` candidates. + + force_exhaust_budget : bool, optional(default=False) + If True, then ``r_min`` is set to a specific value such that the + last iteration uses as much budget as possible. Namely, the last + iteration uses the highest value smaller than ``max_budget`` that is a + multiple of both ``r_min`` and ``ratio``. + + Attributes + ---------- + n_candidates_ : int + The number of candidate parameters that were evaluated at the first + iteartion. + + n_remaining_candidates_ : int + The number of candidate parameters that are left after the last + iteration. + + max_budget_ : int + The maximum number of resources that any candidate is allowed to use + for a given iteration. Note that since the number of resources used at + each iteration must be a multiple of ``r_min_``, the actual number of + resources used at the last iteartion may be smaller than + ``max_budget_``. + + r_min_ : int + The amount of resources that are allocated for each candidate at the + first iteration. + + n_iterations_ : int + The actual number of iterations that were run. This is equal to + ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``. + Else, this is equal to ``min(n_possible_iterations_, + n_required_iterations_)``. + + n_possible_iterations_ : int + The number of iterations that are possible starting with ``r_min_`` + resources and without exceeding ``max_budget_``. + + n_required_iterations_ : int + The number of iterations that are required to end up with less than + ``ratio`` candidates at the last iteration, starting with ``r_min_`` + resources. This will be smaller than ``n_possible_iterations_`` when + there isn't enough budget. 
+ + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. + + For instance the below given table + + +--------------+-------------+-------------------+---+---------------+ + | param_kernel | param_gamma | split0_test_score |...|rank_test_score| + +==============+=============+===================+===+===============+ + | 'rbf' | 0.1 | 0.80 |...| 2 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.2 | 0.90 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.3 | 0.70 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], + mask = False), + 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), + 'split0_test_score' : [0.80, 0.90, 0.70], + 'split1_test_score' : [0.82, 0.50, 0.70], + 'mean_test_score' : [0.81, 0.70, 0.70], + 'std_test_score' : [0.01, 0.20, 0.00], + 'rank_test_score' : [3, 1, 1], + 'split0_train_score' : [0.80, 0.92, 0.70], + 'split1_train_score' : [0.82, 0.55, 0.70], + 'mean_train_score' : [0.81, 0.74, 0.70], + 'std_train_score' : [0.01, 0.19, 0.00], + 'mean_fit_time' : [0.73, 0.63, 0.43], + 'std_fit_time' : [0.01, 0.02, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00], + 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + best_estimator_ : estimator or dict + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + best_score_ : float + Mean cross-validated score of the best_estimator. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + If `n_jobs` was set to a value higher than one, the data is copied for each + parameter setting(and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + See Also + -------- + :class:`RandomSuccessiveHalving`: + Random search over a set of parameters using successive halving. 
+ """ + + def __init__(self, estimator, param_grid, scoring=None, + n_jobs=None, refit=True, verbose=0, cv=5, + pre_dispatch='2*n_jobs', random_state=None, + error_score=np.nan, return_train_score=True, + max_budget='auto', budget_on='n_samples', ratio=3, + r_min='auto', aggressive_elimination=False, + force_exhaust_budget=False): + super().__init__(estimator, scoring=scoring, + n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, + pre_dispatch=pre_dispatch, + random_state=random_state, error_score=error_score, + return_train_score=return_train_score, + max_budget=max_budget, budget_on=budget_on, + ratio=ratio, r_min=r_min, + aggressive_elimination=aggressive_elimination, + force_exhaust_budget=force_exhaust_budget) + self.param_grid = param_grid + _check_param_grid(self.param_grid) + + def _generate_candidate_params(self): + return ParameterGrid(self.param_grid) + + +class RandomSuccessiveHalving(BaseSuccessiveHalving): + """Randomized search on hyper parameters. + + The search strategy starts evaluating all the candidates with a small + amount a resource and iteratively selects the best candidates, using more + and more resources. + + Read more in the :ref:`User guide`. + + Parameters + ---------- + estimator : estimator object. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_distributions : dict + Dictionary with parameters names (string) as keys and distributions + or lists of parameters to try. Distributions must provide a ``rvs`` + method for sampling (such as those from scipy.stats.distributions). + If a list is given, it is sampled uniformly. + + n_candidates: int, optional(default='auto') + The number of candidate parameters to sample. By default this will + sample enough candidates so that the last iteration uses as many + resources as possible. Note that ``force_exhaust_budget`` has no + effect in this case. + + scoring : string, callable, or None, default: None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring`) to evaluate the predictions on the test set. + If None, the estimator's score method is used. + + n_jobs : int or None, optional (default=None) + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + cv : int, cross-validation generator or an iterable, optional (default=5) + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. 
In all + other cases, :class:`KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + refit : boolean, default=True + If True, refit an estimator using the best found parameters on the + whole dataset. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``GridSearchCV`` instance. + + verbose : integer + Controls the verbosity: the higher, the more messages. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. Default is ``np.nan`` + + return_train_score : boolean, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + max_budget : int, optional(default='auto') + The maximum number of resources that any candidate is allowed to use + for a given iteration. By default, this is set ``n_samples`` when + ``budget_on='n_samples'`` (default), else an error is raised. + + budget_on : `n_samples` or str, optional(default='n_samples') + Defines the nature of the budget. By default, the budget is the number + of samples. It can also be set to any parameter of the base estimator + that accepts positive integer values, e.g. 'n_iterations' or + 'n_estimators' for a gradient boosting estimator. In this case + ``max_budget`` cannot be 'auto'. + + ratio : int or float, optional(default=3) + The 'halving' parameter, which determines the proportion of candidates + that are selected for the next iteration. For example, ``ratio=3`` + means that only one third of the candidates are selected. + + r_min : int, optional(default='auto') + The minimum amount of resource that any candidate is allowed to use for + a given iteration. Equivalently, this defines the amount of resources + that are allocated for each candidate at the first iteration. By + default, this is set to: + + - ``n_splits * 2`` when ``budget_on='n_samples'`` for a regression + problem + - ``n_classes * n_splits * 2`` when ``budget_on='n_samples'`` for a + regression problem + - The highest possible value satisfying the constraint + ``force_exhaust_budget=True``. + - ``1`` when ``budget_on!='n_samples'`` + + Note that the amount of resources used at each iteration is always a + multiple of ``r_min``. + + aggressive_elimination : bool, optional(default=False) + This is only relevant in cases where there isn't enough budget to + eliminate enough candidates at the last iteration. If ``True``, then + the search process will 'replay' the first iteration for as long as + needed until the number of candidates is small enough. This is + ``False`` by default, which means that the last iteration may evaluate + more than ``ratio`` candidates. + + force_exhaust_budget : bool, optional(default=False) + If True, then ``r_min`` is set to a specific value such that the + last iteration uses as much budget as possible. 
Namely, the last + iteration uses the highest value smaller than ``max_budget`` that is a + multiple of both ``r_min`` and ``ratio``. + + Attributes + ---------- + n_candidates_ : int + The number of candidate parameters that were evaluated at the first + iteartion. + + n_remaining_candidates_ : int + The number of candidate parameters that are left after the last + iteration. + + max_budget_ : int + The maximum number of resources that any candidate is allowed to use + for a given iteration. Note that since the number of resources used at + each iteration must be a multiple of ``r_min_``, the actual number of + resources used at the last iteartion may be smaller than + ``max_budget_``. + + r_min_ : int + The amount of resources that are allocated for each candidate at the + first iteration. + + n_iterations_ : int + The actual number of iterations that were run. This is equal to + ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``. + Else, this is equal to ``min(n_possible_iterations_, + n_required_iterations_)``. + + n_possible_iterations_ : int + The number of iterations that are possible starting with ``r_min_`` + resources and without exceeding ``max_budget_``. + + n_required_iterations_ : int + The number of iterations that are required to end up with less than + ``ratio`` candidates at the last iteration, starting with ``r_min_`` + resources. This will be smaller than ``n_possible_iterations_`` when + there isn't enough budget. + + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. + + For instance the below given table + + +--------------+-------------+-------------------+---+---------------+ + | param_kernel | param_gamma | split0_test_score |...|rank_test_score| + +==============+=============+===================+===+===============+ + | 'rbf' | 0.1 | 0.80 |...| 2 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.2 | 0.90 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.3 | 0.70 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], + mask = False), + 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), + 'split0_test_score' : [0.80, 0.90, 0.70], + 'split1_test_score' : [0.82, 0.50, 0.70], + 'mean_test_score' : [0.81, 0.70, 0.70], + 'std_test_score' : [0.01, 0.20, 0.00], + 'rank_test_score' : [3, 1, 1], + 'split0_train_score' : [0.80, 0.92, 0.70], + 'split1_train_score' : [0.82, 0.55, 0.70], + 'mean_train_score' : [0.81, 0.74, 0.70], + 'std_train_score' : [0.01, 0.19, 0.00], + 'mean_fit_time' : [0.73, 0.63, 0.43], + 'std_fit_time' : [0.01, 0.02, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00], + 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + best_estimator_ : estimator or dict + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + best_score_ : float + Mean cross-validated score of the best_estimator. 
+
+    best_params_ : dict
+        Parameter setting that gave the best results on the hold out data.
+
+    best_index_ : int
+        The index (of the ``cv_results_`` arrays) which corresponds to the best
+        candidate parameter setting.
+
+        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
+        the parameter setting for the best model, that gives the highest
+        mean score (``search.best_score_``).
+
+    scorer_ : function or a dict
+        Scorer function used on the held out data to choose the best
+        parameters for the model.
+
+    n_splits_ : int
+        The number of cross-validation splits (folds/iterations).
+
+    refit_time_ : float
+        Seconds used for refitting the best model on the whole dataset.
+
+        This is present only if ``refit`` is not False.
+
+    Notes
+    -----
+    The parameters selected are those that maximize the score of the held-out
+    data, according to the scoring parameter.
+
+    If `n_jobs` was set to a value higher than one, the data is copied for each
+    parameter setting (and not `n_jobs` times). This is done for efficiency
+    reasons if individual jobs take very little time, but may raise errors if
+    the dataset is large and not enough memory is available. A workaround in
+    this case is to set `pre_dispatch`. Then, the memory is copied only
+    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
+    n_jobs`.
+
+    See Also
+    --------
+    :class:`GridSuccessiveHalving`:
+        Search over a grid of parameters using successive halving.
+    """
+
+    def __init__(self, estimator, param_distributions,
+                 n_candidates='auto', scoring=None, n_jobs=None, refit=True,
+                 verbose=0, cv=5, pre_dispatch='2*n_jobs',
+                 random_state=None, error_score=np.nan,
+                 return_train_score=True, max_budget='auto',
+                 budget_on='n_samples', ratio=3, r_min='auto',
+                 aggressive_elimination=False, force_exhaust_budget=False):
+        super().__init__(estimator, scoring=scoring,
+                         n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv,
+                         random_state=random_state, error_score=error_score,
+                         return_train_score=return_train_score,
+                         max_budget=max_budget, budget_on=budget_on,
+                         ratio=ratio, r_min=r_min,
+                         aggressive_elimination=aggressive_elimination,
+                         force_exhaust_budget=force_exhaust_budget)
+        self.param_distributions = param_distributions
+        self.n_candidates = n_candidates
+
+    def _generate_candidate_params(self):
+        n_candidates_ = self.n_candidates
+        if n_candidates_ == 'auto':
+            # This will generate enough candidates so that the last iteration
+            # uses as much budget as possible
+            n_candidates_ = self.max_budget_ // self.r_min_
+        return ParameterSampler(self.param_distributions, n_candidates_,
+                                self.random_state)
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py
new file mode 100644
index 0000000000000..22ecbcc1881c5
--- /dev/null
+++ b/sklearn/model_selection/tests/test_successive_halving.py
@@ -0,0 +1,276 @@
+import pytest
+from scipy.stats import norm
+
+from sklearn.datasets import make_classification
+from sklearn.dummy import DummyClassifier
+from sklearn.model_selection import GridSuccessiveHalving
+from sklearn.model_selection import RandomSuccessiveHalving
+
+
+class FastClassifier(DummyClassifier):
+    """Dummy classifier that accepts parameters a, b, ... z.
+ + These parameter don't affect the predictions and are useful for fast + grid searching.""" + + def __init__(self, strategy='stratified', random_state=None, + constant=None, **kwargs): + super().__init__(strategy=strategy, random_state=random_state, + constant=constant) + + def get_params(self, deep=False): + params = super().get_params(deep=deep) + for char in range(ord('a'), ord('z') + 1): + params[chr(char)] = 'whatever' + return params + + +def test_aggressive_elimination(): + # Test the aggressive_elimination parameter. + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + parameters = {'a': ('l1', 'l2'), 'b': list(range(30))} + base_estimator = FastClassifier() + ratio = 3 + + # aggressive_elimination is only really relevant when there is not enough + # budget. + max_budget = 180 + + # aggressive_elimination=True + # In this case, the first iterations only use r_min_ resources + sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + aggressive_elimination=True, + max_budget=max_budget, ratio=ratio) + sh.fit(X, y) + assert sh.n_iterations_ == 4 + assert sh.n_required_iterations_ == 4 + assert sh.n_possible_iterations_ == 3 + assert sh._r_i_list == [20, 20, 60, 180] # see how it loops at the start + assert sh.n_remaining_candidates_ == 1 + + # Make sure we get the same results with randomized search + sh = RandomSuccessiveHalving(base_estimator, parameters, + n_candidates=60, cv=5, + aggressive_elimination=True, + max_budget=max_budget, ratio=ratio) + sh.fit(X, y) + assert sh.n_iterations_ == 4 + assert sh.n_required_iterations_ == 4 + assert sh.n_possible_iterations_ == 3 + assert sh._r_i_list == [20, 20, 60, 180] # see how it loops at the start + assert sh.n_remaining_candidates_ == 1 + + # aggressive_elimination=False + # In this case we don't loop at the start, and might end up with a lot of + # candidates at the last iteration + sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + aggressive_elimination=False, + max_budget=max_budget, ratio=ratio) + sh.fit(X, y) + + assert sh.n_iterations_ == 3 + assert sh.n_required_iterations_ == 4 + assert sh.n_possible_iterations_ == 3 + assert sh._r_i_list == [20, 60, 180] + assert sh.n_remaining_candidates_ == 3 + + max_budget = n_samples + # with enough budget, aggressive_elimination has no effect since it is not + # needed + + # aggressive_elimination=True + sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + aggressive_elimination=True, + max_budget=max_budget, ratio=ratio) + sh.fit(X, y) + + assert sh.n_iterations_ == 4 + assert sh.n_required_iterations_ == 4 + assert sh.n_possible_iterations_ == 4 + assert sh._r_i_list == [20, 60, 180, 540] + assert sh.n_remaining_candidates_ == 1 + + # aggressive_elimination=False + sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + aggressive_elimination=False, + max_budget=max_budget, ratio=ratio) + sh.fit(X, y) + + assert sh.n_iterations_ == 4 + assert sh.n_required_iterations_ == 4 + assert sh.n_possible_iterations_ == 4 + assert sh._r_i_list == [20, 60, 180, 540] + assert sh.n_remaining_candidates_ == 1 + + +def test_force_exhaust_budget_false(): + # Test the force_exhaust_budget parameter when it's false or ignored. 
+ # This is the default case: we start at the beginning no matter what since + # we do not overwrite r_min_ + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + parameters = {'a': [1, 2], 'b': [1, 2, 3]} + base_estimator = FastClassifier() + ratio = 3 + + # with enough budget + sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + force_exhaust_budget=False, ratio=ratio) + sh.fit(X, y) + assert sh.n_iterations_ == 2 + assert sh.n_required_iterations_ == 2 + assert sh.n_possible_iterations_ == 4 + assert sh._r_i_list == [20, 60] + + # with enough budget but r_min!='auto': ignored + sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + force_exhaust_budget=False, ratio=ratio, + r_min=50) + sh.fit(X, y) + assert sh.n_iterations_ == 2 + assert sh.n_required_iterations_ == 2 + assert sh.n_possible_iterations_ == 3 + assert sh._r_i_list == [50, 150] + + # without enough budget (budget is exhausted anyway) + sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + force_exhaust_budget=False, ratio=ratio, + max_budget=30) + sh.fit(X, y) + assert sh.n_iterations_ == 1 + assert sh.n_required_iterations_ == 2 + assert sh.n_possible_iterations_ == 1 + assert sh._r_i_list == [20] + + +@pytest.mark.parametrize('max_budget, r_i_list', [ + ('auto', [333, 999]), + (1000, [333, 999]), + (999, [333, 999]), + (600, [200, 600]), + (599, [199, 597]), + (300, [100, 300]), + (60, [20, 60]), + (50, [20]), + (20, [20]), +]) +def test_force_exhaust_budget_true(max_budget, r_i_list): + # Test the force_exhaust_budget parameter when it's true + # in this case we need to change r_min so that the last iteration uses as + # much budget as possible + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + parameters = {'a': [1, 2], 'b': [1, 2, 3]} + base_estimator = FastClassifier() + ratio = 3 + sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + force_exhaust_budget=True, ratio=ratio, + max_budget=max_budget) + sh.fit(X, y) + + assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list) + assert sh._r_i_list == r_i_list + + # Test same for randomized search + sh = RandomSuccessiveHalving(base_estimator, parameters, n_candidates=6, + cv=5, force_exhaust_budget=True, + ratio=ratio, max_budget=max_budget) + sh.fit(X, y) + + assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list) + assert sh._r_i_list == r_i_list + + +@pytest.mark.parametrize( + 'max_budget, n_iterations, n_possible_iterations', [ + ('auto', 5, 9), # whole budget is used + (1024, 5, 9), + (700, 5, 8), + (512, 5, 8), + (511, 5, 7), + (32, 4, 4), + (31, 3, 3), + (16, 3, 3), + (4, 1, 1), # max_budget == r_min, only one iteration is possible + ]) +def test_n_iterations(max_budget, n_iterations, n_possible_iterations): + # test the number of actual iterations that were run depending on + # max_budget + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=1) + parameters = {'a': [1, 2], 'b': list(range(10))} + base_estimator = FastClassifier() + ratio = 2 + + sh = GridSuccessiveHalving(base_estimator, parameters, cv=2, ratio=ratio, + max_budget=max_budget, r_min=4) + sh.fit(X, y) + assert sh.n_required_iterations_ == 5 + assert sh.n_iterations_ == n_iterations + assert sh.n_possible_iterations_ == n_possible_iterations + + +def test_budget_on(): + # Test the budget_on parameter + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + parameters = {'a': [1, 2], 'b': 
list(range(10))} + base_estimator = FastClassifier() + sh = GridSuccessiveHalving(base_estimator, parameters, cv=2, + budget_on='c', max_budget=10, ratio=3) + sh.fit(X, y) + assert set(sh._r_i_list) == set([1, 3, 9]) + for r_i, params, param_c in zip(sh.cv_results_['r_i'], + sh.cv_results_['params'], + sh.cv_results_['param_c']): + assert r_i == params['c'] == param_c + + with pytest.raises( + ValueError, + match='Cannot budget on parameter 1234 which is not supported '): + sh = GridSuccessiveHalving(base_estimator, parameters, cv=2, + budget_on='1234', max_budget=10) + sh.fit(X, y) + + with pytest.raises( + ValueError, + match='Cannot budget on parameter c since it is part of the ' + 'searched parameters.'): + parameters = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]} + sh = GridSuccessiveHalving(base_estimator, parameters, cv=2, + budget_on='c', max_budget=10) + sh.fit(X, y) + + +@pytest.mark.parametrize( + 'max_budget, n_candidates, expected_n_candidates_', [ + (512, 'auto', 128), # generate exactly as much as needed + (32, 'auto', 8), + (32, 8, 8), + (32, 7, 7), # ask for less than what we could + (32, 9, 9), # ask for more than 'reasonable' + ]) +def test_random_search(max_budget, n_candidates, expected_n_candidates_): + # Test random search and make sure the number of generated candidates is as + # expected + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + parameters = {'a': norm, 'b': norm} + base_estimator = FastClassifier() + sh = RandomSuccessiveHalving(base_estimator, parameters, + n_candidates=n_candidates, + cv=2, + max_budget=max_budget, ratio=2, r_min=4) + sh.fit(X, y) + assert sh.n_candidates_ == expected_n_candidates_ + if n_candidates == 'auto': + # Make sure 'auto' makes the last iteration use as much budget as we + # can + assert sh._r_i_list[-1] == max_budget From ab2955459935e8ecc86866e2afa0561d7d52ffd8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 May 2019 16:00:13 -0400 Subject: [PATCH 07/89] pep8 --- sklearn/model_selection/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 96ac1077e53f4..18cb5774482f7 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -49,7 +49,7 @@ 'ParameterSampler', 'PredefinedSplit', 'RandomizedSearchCV', - 'RandomizedSuccessiveHalving', + 'RandomSuccessiveHalving', 'ShuffleSplit', 'StratifiedKFold', 'StratifiedShuffleSplit', From c725feebcc320becbfb03ac1004fb5810e61039f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 May 2019 16:17:56 -0400 Subject: [PATCH 08/89] dabl -> sklearn --- doc/modules/grid_search.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index baaf92d789ab0..5f45c4e1cfdcc 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -279,8 +279,8 @@ iterations of a random forest:: >>> from sklearn.datasets import make_classification >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.model_selection import GridSuccessiveHalving >>> import pandas as pd - >>> from dabl.search import GridSuccessiveHalving >>> >>> parameters = {'max_depth': [3, 5, 10], ... 
'min_samples_split': [2, 5, 10]} @@ -312,8 +312,8 @@ a big budget, this may be a waste of resource:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC + >>> from sklearn.model_selection import GridSuccessiveHalving >>> import pandas as pd - >>> from dabl.search import GridSuccessiveHalving >>> parameters = {'kernel': ('linear', 'rbf'), ... 'C': [1, 10, 100]} >>> base_estimator = SVC(gamma='scale') @@ -357,8 +357,8 @@ the number of candidates, the last iteration may have to evaluate more than ``ratio`` candidates.:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC + >>> from sklearn.model_selection import GridSuccessiveHalving >>> import pandas as pd - >>> from dabl.search import GridSuccessiveHalving >>> >>> >>> parameters = {'kernel': ('linear', 'rbf'), From c9c87c3855fc29bdfe11555d24a063185db54f2d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 May 2019 16:28:19 -0400 Subject: [PATCH 09/89] add _required_parameters --- sklearn/model_selection/_search_successive_halving.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 5373e8b8ff766..32410dd8d5ed9 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -531,6 +531,7 @@ class GridSuccessiveHalving(BaseSuccessiveHalving): :class:`RandomSuccessiveHalving`: Random search over a set of parameters using successive halving. """ + _required_parameters = ["estimator", "param_grid"] def __init__(self, estimator, param_grid, scoring=None, n_jobs=None, refit=True, verbose=0, cv=5, @@ -832,6 +833,7 @@ class RandomSuccessiveHalving(BaseSuccessiveHalving): :class:`GridSuccessiveHalving`: Search over a grid of parameters using successive halving. 
""" + _required_parameters = ["estimator", "param_distributions"] def __init__(self, estimator, param_distributions, n_candidates='auto', scoring=None, n_jobs=None, refit=True, From 9e7fc3cdc700a450c68c562ea94f035c64266a65 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 May 2019 17:15:19 -0400 Subject: [PATCH 10/89] skipping check in rst file if pandas not installed --- doc/conftest.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/conftest.py b/doc/conftest.py index 7e229781cd32d..7dc304f8eba9d 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -80,6 +80,13 @@ def setup_impute(): raise SkipTest("Skipping impute.rst, pandas not installed") +def setup_grid_search(): + try: + import pandas # noqa + except ImportError: + raise SkipTest("Skipping grid_search.rst, pandas not installed") + + def setup_unsupervised_learning(): # ignore deprecation warnings from scipy.misc.face warnings.filterwarnings('ignore', 'The binary mode of fromstring', @@ -106,6 +113,8 @@ def pytest_runtest_setup(item): raise SkipTest('FeatureHasher is not compatible with PyPy') elif fname.endswith('modules/impute.rst'): setup_impute() + elif fname.endswith('modules/grid_search.rst'): + setup_grid_search() elif fname.endswith('statistical_inference/unsupervised_learning.rst'): setup_unsupervised_learning() From 81cee9b202be2c18ff0df1cba9752edbe603cb84 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 22 May 2019 11:43:18 -0400 Subject: [PATCH 11/89] Update sklearn/model_selection/_search_successive_halving.py Co-Authored-By: Joel Nothman --- sklearn/model_selection/_search_successive_halving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 32410dd8d5ed9..c8b8ab44f9919 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -560,7 +560,7 @@ class RandomSuccessiveHalving(BaseSuccessiveHalving): """Randomized search on hyper parameters. The search strategy starts evaluating all the candidates with a small - amount a resource and iteratively selects the best candidates, using more + amount of resources and iteratively selects the best candidates, using more and more resources. Read more in the :ref:`User guide`. From cbed1e35fb4e1b958813dd8d18299c9af46bd002 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 29 Jul 2019 17:14:57 -0400 Subject: [PATCH 12/89] renamed into GridHalvingSearchCV and RandomHalvingSearchCV --- doc/modules/classes.rst | 4 +-- doc/modules/grid_search.rst | 20 +++++------ .../plot_successive_halving_heatmap.py | 4 +-- .../plot_successive_halving_iterations.py | 4 +-- sklearn/model_selection/__init__.py | 8 ++--- .../_search_successive_halving.py | 10 +++--- .../tests/test_successive_halving.py | 34 +++++++++---------- 7 files changed, 42 insertions(+), 42 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 625998c2a0899..5819e4dd57d90 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1085,11 +1085,11 @@ Hyper-parameter optimizers :template: class.rst model_selection.GridSearchCV - model_selection.GridSuccessiveHalving + model_selection.GridHalvingSearchCV model_selection.ParameterGrid model_selection.ParameterSampler model_selection.RandomizedSearchCV - model_selection.RandomSuccessiveHalving + model_selection.RandomHalvingSearchCV .. 
autosummary:: diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 5f45c4e1cfdcc..c95ff621a4b6b 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -237,8 +237,8 @@ the search. Searching optimal parameters with successive halving ==================================================== -Scikit-learn also provides the :class:`GridSuccessiveHalving` and -:class:`RandomSuccessiveHalving` estimators that can be used to +Scikit-learn also provides the :class:`GridHalvingSearchCV` and +:class:`RandomHalvingSearchCV` estimators that can be used to search a parameter space using successive halving [1]_ [2]_. Successive halving is an iterative selection process where all candidates are evaluated with a small amount of resources at the first iteration. Only a subset of @@ -279,14 +279,14 @@ iterations of a random forest:: >>> from sklearn.datasets import make_classification >>> from sklearn.ensemble import RandomForestClassifier - >>> from sklearn.model_selection import GridSuccessiveHalving + >>> from sklearn.model_selection import GridHalvingSearchCV >>> import pandas as pd >>> >>> parameters = {'max_depth': [3, 5, 10], ... 'min_samples_split': [2, 5, 10]} >>> base_estimator = RandomForestClassifier(random_state=0) >>> X, y = make_classification(n_samples=1000, random_state=0) - >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, ... ratio=2, ... budget_on='n_estimators', ... max_budget=30, @@ -312,13 +312,13 @@ a big budget, this may be a waste of resource:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC - >>> from sklearn.model_selection import GridSuccessiveHalving + >>> from sklearn.model_selection import GridHalvingSearchCV >>> import pandas as pd >>> parameters = {'kernel': ('linear', 'rbf'), ... 'C': [1, 10, 100]} >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) - >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, ... ratio=2).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) >>> results.groupby('iter').r_i.unique() @@ -333,7 +333,7 @@ is ``n_samples=1000``. Note in this case that ``r_min = r_0 = 20``. In order for the last iteration to use as many resources as possible, you can use the ``force_exhaust_budget`` parameter:: - >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, ... ratio=2, force_exhaust_budget=True, ... ).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) @@ -357,7 +357,7 @@ the number of candidates, the last iteration may have to evaluate more than ``ratio`` candidates.:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC - >>> from sklearn.model_selection import GridSuccessiveHalving + >>> from sklearn.model_selection import GridHalvingSearchCV >>> import pandas as pd >>> >>> @@ -365,7 +365,7 @@ the number of candidates, the last iteration may have to evaluate more than ... 'C': [1, 10, 100]} >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) - >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, ... ratio=2, ... max_budget=40, ... 
aggressive_elimination=False, @@ -390,7 +390,7 @@ process to end up with less than ``ratio`` candidates at the last iteration. To do this, the process will eliminate as many candidates as necessary using ``r_min`` resources:: - >>> sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, ... ratio=2, ... max_budget=40, ... aggressive_elimination=True, diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index 5886e5d1cbdcd..c6b00805af6e3 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -11,7 +11,7 @@ from sklearn.svm import SVC from sklearn import datasets from sklearn.model_selection import GridSearchCV -from sklearn.model_selection import GridSuccessiveHalving +from sklearn.model_selection import GridHalvingSearchCV rng = np.random.RandomState(0) @@ -23,7 +23,7 @@ clf = SVC(random_state=rng) tic = time() -gsh = GridSuccessiveHalving( +gsh = GridHalvingSearchCV( estimator=clf, param_grid=param_grid, budget_on='n_samples', # budget is the number of samples diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 0603c41c536a1..244b43425e571 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -9,7 +9,7 @@ import numpy as np from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import RandomSuccessiveHalving +from sklearn.model_selection import RandomHalvingSearchCV rng = np.random.RandomState(0) @@ -24,7 +24,7 @@ "bootstrap": [True, False], "criterion": ["gini", "entropy"]} -rsh = RandomSuccessiveHalving( +rsh = RandomHalvingSearchCV( estimator=clf, param_distributions=param_dist, budget_on='n_samples', # budget is the number of samples diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 18cb5774482f7..58f0bb196e7c1 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -29,12 +29,12 @@ from ._search import ParameterSampler from ._search import fit_grid_point -from ._search_successive_halving import GridSuccessiveHalving -from ._search_successive_halving import RandomSuccessiveHalving +from ._search_successive_halving import GridHalvingSearchCV +from ._search_successive_halving import RandomHalvingSearchCV __all__ = ('BaseCrossValidator', 'GridSearchCV', - 'GridSuccessiveHalving', + 'GridHalvingSearchCV', 'TimeSeriesSplit', 'KFold', 'GroupKFold', @@ -49,7 +49,7 @@ 'ParameterSampler', 'PredefinedSplit', 'RandomizedSearchCV', - 'RandomSuccessiveHalving', + 'RandomHalvingSearchCV', 'ShuffleSplit', 'StratifiedKFold', 'StratifiedShuffleSplit', diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index c8b8ab44f9919..bc0dad3237d01 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -12,7 +12,7 @@ from ..utils import resample -__all__ = ['GridSuccessiveHalving', 'RandomSuccessiveHalving'] +__all__ = ['GridHalvingSearchCV', 'RandomHalvingSearchCV'] def _refit_callable(results): @@ -259,7 +259,7 @@ def _generate_candidate_params(self): pass -class GridSuccessiveHalving(BaseSuccessiveHalving): +class 
GridHalvingSearchCV(BaseSuccessiveHalving): """Search over specified parameter values with successive halving. The search strategy starts evaluating all the candidates with a small @@ -528,7 +528,7 @@ class GridSuccessiveHalving(BaseSuccessiveHalving): See Also -------- - :class:`RandomSuccessiveHalving`: + :class:`RandomHalvingSearchCV`: Random search over a set of parameters using successive halving. """ _required_parameters = ["estimator", "param_grid"] @@ -556,7 +556,7 @@ def _generate_candidate_params(self): return ParameterGrid(self.param_grid) -class RandomSuccessiveHalving(BaseSuccessiveHalving): +class RandomHalvingSearchCV(BaseSuccessiveHalving): """Randomized search on hyper parameters. The search strategy starts evaluating all the candidates with a small @@ -830,7 +830,7 @@ class RandomSuccessiveHalving(BaseSuccessiveHalving): See Also -------- - :class:`GridSuccessiveHalving`: + :class:`GridHalvingSearchCV`: Search over a grid of parameters using successive halving. """ _required_parameters = ["estimator", "param_distributions"] diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 22ecbcc1881c5..67bf273fa204f 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -3,8 +3,8 @@ from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier -from sklearn.model_selection import GridSuccessiveHalving -from sklearn.model_selection import RandomSuccessiveHalving +from sklearn.model_selection import GridHalvingSearchCV +from sklearn.model_selection import RandomHalvingSearchCV class FastClassifier(DummyClassifier): @@ -40,7 +40,7 @@ def test_aggressive_elimination(): # aggressive_elimination=True # In this case, the first iterations only use r_min_ resources - sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=True, max_budget=max_budget, ratio=ratio) sh.fit(X, y) @@ -51,7 +51,7 @@ def test_aggressive_elimination(): assert sh.n_remaining_candidates_ == 1 # Make sure we get the same results with randomized search - sh = RandomSuccessiveHalving(base_estimator, parameters, + sh = RandomHalvingSearchCV(base_estimator, parameters, n_candidates=60, cv=5, aggressive_elimination=True, max_budget=max_budget, ratio=ratio) @@ -65,7 +65,7 @@ def test_aggressive_elimination(): # aggressive_elimination=False # In this case we don't loop at the start, and might end up with a lot of # candidates at the last iteration - sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=False, max_budget=max_budget, ratio=ratio) sh.fit(X, y) @@ -81,7 +81,7 @@ def test_aggressive_elimination(): # needed # aggressive_elimination=True - sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=True, max_budget=max_budget, ratio=ratio) sh.fit(X, y) @@ -93,7 +93,7 @@ def test_aggressive_elimination(): assert sh.n_remaining_candidates_ == 1 # aggressive_elimination=False - sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=False, max_budget=max_budget, ratio=ratio) sh.fit(X, y) @@ -117,7 +117,7 @@ def test_force_exhaust_budget_false(): ratio = 3 # with enough budget - sh = 
GridSuccessiveHalving(base_estimator, parameters, cv=5, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, force_exhaust_budget=False, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 2 @@ -126,7 +126,7 @@ def test_force_exhaust_budget_false(): assert sh._r_i_list == [20, 60] # with enough budget but r_min!='auto': ignored - sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, force_exhaust_budget=False, ratio=ratio, r_min=50) sh.fit(X, y) @@ -136,7 +136,7 @@ def test_force_exhaust_budget_false(): assert sh._r_i_list == [50, 150] # without enough budget (budget is exhausted anyway) - sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, force_exhaust_budget=False, ratio=ratio, max_budget=30) sh.fit(X, y) @@ -167,7 +167,7 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): parameters = {'a': [1, 2], 'b': [1, 2, 3]} base_estimator = FastClassifier() ratio = 3 - sh = GridSuccessiveHalving(base_estimator, parameters, cv=5, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, force_exhaust_budget=True, ratio=ratio, max_budget=max_budget) sh.fit(X, y) @@ -176,7 +176,7 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): assert sh._r_i_list == r_i_list # Test same for randomized search - sh = RandomSuccessiveHalving(base_estimator, parameters, n_candidates=6, + sh = RandomHalvingSearchCV(base_estimator, parameters, n_candidates=6, cv=5, force_exhaust_budget=True, ratio=ratio, max_budget=max_budget) sh.fit(X, y) @@ -207,7 +207,7 @@ def test_n_iterations(max_budget, n_iterations, n_possible_iterations): base_estimator = FastClassifier() ratio = 2 - sh = GridSuccessiveHalving(base_estimator, parameters, cv=2, ratio=ratio, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, ratio=ratio, max_budget=max_budget, r_min=4) sh.fit(X, y) assert sh.n_required_iterations_ == 5 @@ -222,7 +222,7 @@ def test_budget_on(): X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() - sh = GridSuccessiveHalving(base_estimator, parameters, cv=2, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, budget_on='c', max_budget=10, ratio=3) sh.fit(X, y) assert set(sh._r_i_list) == set([1, 3, 9]) @@ -234,7 +234,7 @@ def test_budget_on(): with pytest.raises( ValueError, match='Cannot budget on parameter 1234 which is not supported '): - sh = GridSuccessiveHalving(base_estimator, parameters, cv=2, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, budget_on='1234', max_budget=10) sh.fit(X, y) @@ -243,7 +243,7 @@ def test_budget_on(): match='Cannot budget on parameter c since it is part of the ' 'searched parameters.'): parameters = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]} - sh = GridSuccessiveHalving(base_estimator, parameters, cv=2, + sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, budget_on='c', max_budget=10) sh.fit(X, y) @@ -264,7 +264,7 @@ def test_random_search(max_budget, n_candidates, expected_n_candidates_): X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': norm, 'b': norm} base_estimator = FastClassifier() - sh = RandomSuccessiveHalving(base_estimator, parameters, + sh = RandomHalvingSearchCV(base_estimator, parameters, n_candidates=n_candidates, cv=2, max_budget=max_budget, ratio=2, r_min=4) From 0c1fd07488af51acafbc178acf65d073e9e03a17 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: 
Mon, 29 Jul 2019 17:39:33 -0400 Subject: [PATCH 13/89] Addressed thomas' comments --- doc/modules/grid_search.rst | 11 ++++++----- sklearn/model_selection/_search_successive_halving.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index c95ff621a4b6b..45b5a7c952109 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -331,7 +331,7 @@ a big budget, this may be a waste of resource:: The search process will only use 80 resources at most, while our maximum budget is ``n_samples=1000``. Note in this case that ``r_min = r_0 = 20``. In order for the last iteration to use as many resources as possible, you can use the -``force_exhaust_budget`` parameter:: +``force_exhaust_budget`` parameter.:: >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, ... ratio=2, force_exhaust_budget=True, @@ -345,16 +345,17 @@ for the last iteration to use as many resources as possible, you can use the Name: r_i, dtype: object -Since ``force_exhaust_budget`` chooses an appropriate ``r_min`` to start -with, ``r_min`` must be set to 'auto'. +`r_min` was here automatically set to 250, which results in the last +iteration using all the budget. Since ``force_exhaust_budget`` chooses an +appropriate ``r_min`` to start with, ``r_min`` must be set to 'auto' (default). Aggressive elimination of candidates ------------------------------------ Ideally, we want the last iteration to evaluate ``ratio`` candidates. We then -just have to pick the best one. When the number budget is small with respect to +just have to pick the best one. When the budget is small with respect to the number of candidates, the last iteration may have to evaluate more than -``ratio`` candidates.:: +``ratio`` candidates:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC >>> from sklearn.model_selection import GridHalvingSearchCV diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index bc0dad3237d01..612398bd556c5 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -167,7 +167,7 @@ def _run_search(self, evaluate_candidates, X, y, groups): self.r_min_ = max(self.r_min_, self.max_budget_ // self.ratio**last_iteration) - # n_possible iterations is the number of iterations that we can + # n_possible_iterations is the number of iterations that we can # actually do starting from r_min and without exceeding the budget. # Depending on budget size the number of candidates, this may be higher # or smaller than n_required_iterations. From 5d7085936950c0dec15fb312d1eba62efdabf47a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2019 06:36:29 -0400 Subject: [PATCH 14/89] repr --- doc/modules/grid_search.rst | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 45b5a7c952109..1001df364d960 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -293,13 +293,7 @@ iterations of a random forest:: ... random_state=0, ... 
).fit(X, y) >>> sh.best_estimator_ - RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', - max_depth=5, max_features='auto', max_leaf_nodes=None, - min_impurity_decrease=0.0, min_impurity_split=None, - min_samples_leaf=1, min_samples_split=2, - min_weight_fraction_leaf=0.0, n_estimators=8, - n_jobs=None, oob_score=False, random_state=0, verbose=0, - warm_start=False) + RandomForestClassifier(max_depth=5, n_estimators=8, random_state=0) Note that it is not possible to budget on a parameter that is part of the parameter space. From 55d82d01d49b83d3c770c16bb91b4bce27faabd7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2019 09:25:35 -0400 Subject: [PATCH 15/89] removed passing group as a parameter to evaluate_candidates --- sklearn/model_selection/_search.py | 14 +++++++------- .../model_selection/_search_successive_halving.py | 4 ++-- sklearn/model_selection/tests/test_search.py | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 37b4d1d971a7a..b73dce0c89f58 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -546,7 +546,7 @@ def classes_(self): self._check_is_fitted("classes_") return self.best_estimator_.classes_ - def _run_search(self, evaluate_candidates, X, y, groups): + def _run_search(self, evaluate_candidates, X, y): """Repeatedly calls `evaluate_candidates` to conduct a search. This method, implemented in sub-classes, makes it possible to @@ -649,7 +649,7 @@ def fit(self, X, y=None, groups=None, **fit_params): all_out = [] all_more_results = defaultdict(list) - def evaluate_candidates(candidate_params, X, y, groups, + def evaluate_candidates(candidate_params, X, y, more_results=None): candidate_params = list(candidate_params) n_candidates = len(candidate_params) @@ -692,7 +692,7 @@ def evaluate_candidates(candidate_params, X, y, groups, return results - self._run_search(evaluate_candidates, X, y, groups) + self._run_search(evaluate_candidates, X, y) # For multi-metric evaluation, store the best_index_, best_params_ and # best_score_ iff refit is one of the scorer names @@ -1125,9 +1125,9 @@ def __init__(self, estimator, param_grid, scoring=None, self.param_grid = param_grid _check_param_grid(param_grid) - def _run_search(self, evaluate_candidates, X, y, groups): + def _run_search(self, evaluate_candidates, X, y): """Search all candidates in param_grid""" - evaluate_candidates(ParameterGrid(self.param_grid), X, y, groups) + evaluate_candidates(ParameterGrid(self.param_grid), X, y) class RandomizedSearchCV(BaseSearchCV): @@ -1453,8 +1453,8 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score) - def _run_search(self, evaluate_candidates, X, y, groups): + def _run_search(self, evaluate_candidates, X, y): """Search n_iter candidates from param_distributions""" evaluate_candidates(ParameterSampler( self.param_distributions, self.n_iter, - random_state=self.random_state), X, y, groups) + random_state=self.random_state), X, y) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 612398bd556c5..459e0a6eb775b 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -137,7 +137,7 @@ def fit(self, X, y=None, groups=None, **fit_params): 
self.cv_results_['mean_test_score'][self.best_index_]) return self - def _run_search(self, evaluate_candidates, X, y, groups): + def _run_search(self, evaluate_candidates, X, y): rng = check_random_state(self.random_state) candidate_params = self._generate_candidate_params() @@ -231,7 +231,7 @@ def _run_search(self, evaluate_candidates, X, y, groups): more_results = {'iter': [iter_i] * n_candidates, 'r_i': [r_i] * n_candidates} results = evaluate_candidates(candidate_params, X_iter, y_iter, - groups, more_results=more_results) + more_results=more_results) n_candidates_to_keep = ceil(n_candidates / self.ratio) candidate_params = self._top_k(results, diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 04c2fdf8a25f4..d32edd7bede90 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1661,12 +1661,12 @@ class CustomSearchCV(BaseSearchCV): def __init__(self, estimator, **kwargs): super().__init__(estimator, **kwargs) - def _run_search(self, evaluate, X, y, groups): + def _run_search(self, evaluate, X, y): results = evaluate([{'max_depth': 1}, {'max_depth': 2}], - X, y, groups) + X, y) check_results(results, fit_grid({'max_depth': [1, 2]})) results = evaluate([{'min_samples_split': 5}, - {'min_samples_split': 10}], X, y, groups) + {'min_samples_split': 10}], X, y) check_results(results, fit_grid([{'max_depth': [1, 2]}, {'min_samples_split': [5, 10]}])) From 00c99d5efd07766e7a5375b7b696a8144fbcbbbe Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2019 10:54:18 -0400 Subject: [PATCH 16/89] Joels comments --- doc/modules/grid_search.rst | 75 ++++++++++++------- .../_search_successive_halving.py | 42 +++++------ 2 files changed, 70 insertions(+), 47 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 1001df364d960..189b69ac7b613 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -241,7 +241,7 @@ Scikit-learn also provides the :class:`GridHalvingSearchCV` and :class:`RandomHalvingSearchCV` estimators that can be used to search a parameter space using successive halving [1]_ [2]_. Successive halving is an iterative selection process where all candidates are evaluated -with a small amount of resources at the first iteration. Only a subset of +with a small amount of resources at the first iteration. Only some of these candidates are selected for the next iteration, which will be allocated more resources. What defines a resource is typically the number of samples to train on, or the number of trees for a gradient boosting / @@ -266,7 +266,35 @@ the next iteration:: n_candidates_to_keep = n_candidates_at_i // ratio -Note that each ``r_i`` is a multiple of both ``ratio`` and ``r_min``. +So in the first iteration, we use ``r_min`` resources ``n_candidates`` +times. In the second iteration, we use ``r_min * ratio`` resources +``n_candidates // ratio`` times. The third again multiplies the resources +per candidate and divides the number of candidates. This process stops when +the maximum budget per candidate is reached, or when less than ``ratio`` +candidates are left. 
+ +Here is an example with ``r_min=3`` and ``ratio=2``, starting with 70 +candidates: + ++-------------+-----------------------+ +| ``r_i`` | ``n_candidates_at_i`` | ++=============+=======================+ +| 3 (=r_min) | 70 (=n_candidates) | ++-------------+-----------------------+ +| 3 * 2 = 6 | 70 // 2 = 35 | ++-------------+-----------------------+ +| 6 * 2 = 12 | 35 // 2 = 17 | ++-------------+-----------------------+ +| 12 * 2 = 24 | 17 // 2 = 8 | ++-------------+-----------------------+ +| 24 * 2 = 48 | 8 // 2 = 4 | ++-------------+-----------------------+ +| 48 * 2 = 96 | 4 // 2 = 2 | ++-------------+-----------------------+ + +At the last iteration, ``ratio`` candidates are evaluated, and we can pick +the best one. Note that each ``r_i`` is a multiple of both ``ratio`` and +``r_min``. Choosing the budget ------------------- @@ -282,40 +310,37 @@ iterations of a random forest:: >>> from sklearn.model_selection import GridHalvingSearchCV >>> import pandas as pd >>> - >>> parameters = {'max_depth': [3, 5, 10], + >>> param_grid = {'max_depth': [3, 5, 10], ... 'min_samples_split': [2, 5, 10]} >>> base_estimator = RandomForestClassifier(random_state=0) >>> X, y = make_classification(n_samples=1000, random_state=0) - >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - ... ratio=2, - ... budget_on='n_estimators', - ... max_budget=30, - ... random_state=0, - ... ).fit(X, y) + >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, + ... ratio=2, budget_on='n_estimators', + ... max_budget=30, random_state=0).fit(X, y) >>> sh.best_estimator_ RandomForestClassifier(max_depth=5, n_estimators=8, random_state=0) Note that it is not possible to budget on a parameter that is part of the -parameter space. +parameter grid. Exhausting the budget --------------------- As mentioned above, the first iteration uses ``r_min`` resources. If you have -a big budget, this may be a waste of resource:: +a big budget, this may be a waste of resources:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC >>> from sklearn.model_selection import GridHalvingSearchCV >>> import pandas as pd - >>> parameters = {'kernel': ('linear', 'rbf'), - ... 'C': [1, 10, 100]} + >>> param_grid= {'kernel': ('linear', 'rbf'), + ... 'C': [1, 10, 100]} >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) - >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - ... ratio=2).fit(X, y) - >>> results = pd.DataFrame.from_dict(sh.cv_results_) - >>> results.groupby('iter').r_i.unique() + >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, + ... ratio=2).fit(X, y) + >>> results = pd.DataFrame(sh.cv_results_) + >>> results.groupby('iter')['r_i'].unique() iter 0 [20] 1 [40] @@ -327,11 +352,11 @@ is ``n_samples=1000``. Note in this case that ``r_min = r_0 = 20``. In order for the last iteration to use as many resources as possible, you can use the ``force_exhaust_budget`` parameter.:: - >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, + >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, force_exhaust_budget=True, ... 
).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) - >>> results.groupby('iter').r_i.unique() + >>> results.groupby('iter')['r_i'].unique() iter 0 [250] 1 [500] @@ -356,15 +381,13 @@ the number of candidates, the last iteration may have to evaluate more than >>> import pandas as pd >>> >>> - >>> parameters = {'kernel': ('linear', 'rbf'), + >>> param_grid = {'kernel': ('linear', 'rbf'), ... 'C': [1, 10, 100]} >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) - >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - ... ratio=2, - ... max_budget=40, - ... aggressive_elimination=False, - ... ).fit(X, y) + >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, + ... ratio=2, max_budget=40, + ... aggressive_elimination=False).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) >>> results.groupby('iter').r_i.unique() iter @@ -385,7 +408,7 @@ process to end up with less than ``ratio`` candidates at the last iteration. To do this, the process will eliminate as many candidates as necessary using ``r_min`` resources:: - >>> sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, + >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, ... max_budget=40, ... aggressive_elimination=True, diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 459e0a6eb775b..ac58c68b8b976 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -282,12 +282,12 @@ class GridHalvingSearchCV(BaseSuccessiveHalving): in the list are explored. This enables searching over any sequence of parameter settings. - scoring : string, callable, or None, default: None + scoring : string, callable, or None, default=None A single string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`) to evaluate the predictions on the test set. If None, the estimator's score method is used. - n_jobs : int or None, optional (default=None) + n_jobs : int or None, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -308,9 +308,9 @@ class GridHalvingSearchCV(BaseSuccessiveHalving): spawned - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' + as in '2*n_jobs' (default) - cv : int, cross-validation generator or an iterable, optional (default=5) + cv : int, cross-validation generator or iterable, default=5 Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -351,24 +351,24 @@ class GridHalvingSearchCV(BaseSuccessiveHalving): expensive and is not strictly required to select the parameters that yield the best generalization performance. - max_budget : int, optional(default='auto') + max_budget : int, default='auto' The maximum number of resources that any candidate is allowed to use for a given iteration. By default, this is set ``n_samples`` when ``budget_on='n_samples'`` (default), else an error is raised. - budget_on : `n_samples` or str, optional(default='n_samples') + budget_on : `'n_samples'` or str, default='n_samples' Defines the nature of the budget. By default, the budget is the number of samples. It can also be set to any parameter of the base estimator that accepts positive integer values, e.g. 'n_iterations' or 'n_estimators' for a gradient boosting estimator. In this case ``max_budget`` cannot be 'auto'. 
- ratio : int or float, optional(default=3) + ratio : int or float, default=3 The 'halving' parameter, which determines the proportion of candidates that are selected for the next iteration. For example, ``ratio=3`` means that only one third of the candidates are selected. - r_min : int, optional(default='auto') + r_min : int, default='auto' The minimum amount of resource that any candidate is allowed to use for a given iteration. Equivalently, this defines the amount of resources that are allocated for each candidate at the first iteration. By @@ -385,7 +385,7 @@ class GridHalvingSearchCV(BaseSuccessiveHalving): Note that the amount of resources used at each iteration is always a multiple of ``r_min``. - aggressive_elimination : bool, optional(default=False) + aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough budget to eliminate enough candidates at the last iteration. If ``True``, then the search process will 'replay' the first iteration for as long as @@ -393,7 +393,7 @@ class GridHalvingSearchCV(BaseSuccessiveHalving): ``False`` by default, which means that the last iteration may evaluate more than ``ratio`` candidates. - force_exhaust_budget : bool, optional(default=False) + force_exhaust_budget : bool, default=False If True, then ``r_min`` is set to a specific value such that the last iteration uses as much budget as possible. Namely, the last iteration uses the highest value smaller than ``max_budget`` that is a @@ -578,18 +578,18 @@ class RandomHalvingSearchCV(BaseSuccessiveHalving): method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. - n_candidates: int, optional(default='auto') + n_candidates: int, default='auto' The number of candidate parameters to sample. By default this will sample enough candidates so that the last iteration uses as many resources as possible. Note that ``force_exhaust_budget`` has no effect in this case. - scoring : string, callable, or None, default: None + scoring : string, callable, or None, default=None A single string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`) to evaluate the predictions on the test set. If None, the estimator's score method is used. - n_jobs : int or None, optional (default=None) + n_jobs : int or None, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -610,9 +610,9 @@ class RandomHalvingSearchCV(BaseSuccessiveHalving): spawned - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' + as in '2*n_jobs' (default) - cv : int, cross-validation generator or an iterable, optional (default=5) + cv : int, cross-validation generator or an iterable, default=5 Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -653,24 +653,24 @@ class RandomHalvingSearchCV(BaseSuccessiveHalving): expensive and is not strictly required to select the parameters that yield the best generalization performance. - max_budget : int, optional(default='auto') + max_budget : int, default='auto' The maximum number of resources that any candidate is allowed to use for a given iteration. By default, this is set ``n_samples`` when ``budget_on='n_samples'`` (default), else an error is raised. - budget_on : `n_samples` or str, optional(default='n_samples') + budget_on : ``'n_samples'`` or str, default='n_samples' Defines the nature of the budget. 
By default, the budget is the number of samples. It can also be set to any parameter of the base estimator that accepts positive integer values, e.g. 'n_iterations' or 'n_estimators' for a gradient boosting estimator. In this case ``max_budget`` cannot be 'auto'. - ratio : int or float, optional(default=3) + ratio : int or float, default=3 The 'halving' parameter, which determines the proportion of candidates that are selected for the next iteration. For example, ``ratio=3`` means that only one third of the candidates are selected. - r_min : int, optional(default='auto') + r_min : int, default='auto' The minimum amount of resource that any candidate is allowed to use for a given iteration. Equivalently, this defines the amount of resources that are allocated for each candidate at the first iteration. By @@ -687,7 +687,7 @@ class RandomHalvingSearchCV(BaseSuccessiveHalving): Note that the amount of resources used at each iteration is always a multiple of ``r_min``. - aggressive_elimination : bool, optional(default=False) + aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough budget to eliminate enough candidates at the last iteration. If ``True``, then the search process will 'replay' the first iteration for as long as @@ -695,7 +695,7 @@ class RandomHalvingSearchCV(BaseSuccessiveHalving): ``False`` by default, which means that the last iteration may evaluate more than ``ratio`` candidates. - force_exhaust_budget : bool, optional(default=False) + force_exhaust_budget : bool, default=False If True, then ``r_min`` is set to a specific value such that the last iteration uses as much budget as possible. Namely, the last iteration uses the highest value smaller than ``max_budget`` that is a From 40d36dba0ab70b97709d5f6f5b91312be5a78c9f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2019 10:56:25 -0400 Subject: [PATCH 17/89] pep8 --- .../tests/test_successive_halving.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 67bf273fa204f..c264b2a3d8108 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -41,8 +41,8 @@ def test_aggressive_elimination(): # aggressive_elimination=True # In this case, the first iterations only use r_min_ resources sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - aggressive_elimination=True, - max_budget=max_budget, ratio=ratio) + aggressive_elimination=True, + max_budget=max_budget, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 4 assert sh.n_required_iterations_ == 4 @@ -52,9 +52,9 @@ def test_aggressive_elimination(): # Make sure we get the same results with randomized search sh = RandomHalvingSearchCV(base_estimator, parameters, - n_candidates=60, cv=5, - aggressive_elimination=True, - max_budget=max_budget, ratio=ratio) + n_candidates=60, cv=5, + aggressive_elimination=True, + max_budget=max_budget, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 4 assert sh.n_required_iterations_ == 4 @@ -66,8 +66,8 @@ def test_aggressive_elimination(): # In this case we don't loop at the start, and might end up with a lot of # candidates at the last iteration sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - aggressive_elimination=False, - max_budget=max_budget, ratio=ratio) + aggressive_elimination=False, + max_budget=max_budget, ratio=ratio) 
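+    # Sanity check for the assertions below: the grid has 2 * 30 = 60
+    # candidates, so with ratio=3 four halving iterations are required to
+    # get down to a single candidate, but max_budget=180 only allows the
+    # resource levels 20, 60 and 180 starting from r_min_=20. With
+    # aggressive_elimination=True the first iteration is therefore
+    # 'replayed' at r_min_, giving the schedule [20, 20, 60, 180].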
sh.fit(X, y) assert sh.n_iterations_ == 3 @@ -82,8 +82,8 @@ def test_aggressive_elimination(): # aggressive_elimination=True sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - aggressive_elimination=True, - max_budget=max_budget, ratio=ratio) + aggressive_elimination=True, + max_budget=max_budget, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 4 @@ -94,8 +94,8 @@ def test_aggressive_elimination(): # aggressive_elimination=False sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - aggressive_elimination=False, - max_budget=max_budget, ratio=ratio) + aggressive_elimination=False, + max_budget=max_budget, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 4 @@ -118,7 +118,7 @@ def test_force_exhaust_budget_false(): # with enough budget sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - force_exhaust_budget=False, ratio=ratio) + force_exhaust_budget=False, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 2 assert sh.n_required_iterations_ == 2 @@ -127,8 +127,8 @@ def test_force_exhaust_budget_false(): # with enough budget but r_min!='auto': ignored sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - force_exhaust_budget=False, ratio=ratio, - r_min=50) + force_exhaust_budget=False, ratio=ratio, + r_min=50) sh.fit(X, y) assert sh.n_iterations_ == 2 assert sh.n_required_iterations_ == 2 @@ -137,8 +137,8 @@ def test_force_exhaust_budget_false(): # without enough budget (budget is exhausted anyway) sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - force_exhaust_budget=False, ratio=ratio, - max_budget=30) + force_exhaust_budget=False, ratio=ratio, + max_budget=30) sh.fit(X, y) assert sh.n_iterations_ == 1 assert sh.n_required_iterations_ == 2 @@ -168,8 +168,8 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): base_estimator = FastClassifier() ratio = 3 sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, - force_exhaust_budget=True, ratio=ratio, - max_budget=max_budget) + force_exhaust_budget=True, ratio=ratio, + max_budget=max_budget) sh.fit(X, y) assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list) @@ -177,8 +177,8 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): # Test same for randomized search sh = RandomHalvingSearchCV(base_estimator, parameters, n_candidates=6, - cv=5, force_exhaust_budget=True, - ratio=ratio, max_budget=max_budget) + cv=5, force_exhaust_budget=True, + ratio=ratio, max_budget=max_budget) sh.fit(X, y) assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list) @@ -208,7 +208,7 @@ def test_n_iterations(max_budget, n_iterations, n_possible_iterations): ratio = 2 sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, ratio=ratio, - max_budget=max_budget, r_min=4) + max_budget=max_budget, r_min=4) sh.fit(X, y) assert sh.n_required_iterations_ == 5 assert sh.n_iterations_ == n_iterations @@ -223,7 +223,7 @@ def test_budget_on(): parameters = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, - budget_on='c', max_budget=10, ratio=3) + budget_on='c', max_budget=10, ratio=3) sh.fit(X, y) assert set(sh._r_i_list) == set([1, 3, 9]) for r_i, params, param_c in zip(sh.cv_results_['r_i'], @@ -235,7 +235,7 @@ def test_budget_on(): ValueError, match='Cannot budget on parameter 1234 which is not supported '): sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, - budget_on='1234', max_budget=10) + budget_on='1234', max_budget=10) sh.fit(X, y) with pytest.raises( @@ -244,7 +244,7 @@ def 
test_budget_on(): 'searched parameters.'): parameters = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]} sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, - budget_on='c', max_budget=10) + budget_on='c', max_budget=10) sh.fit(X, y) @@ -265,9 +265,9 @@ def test_random_search(max_budget, n_candidates, expected_n_candidates_): parameters = {'a': norm, 'b': norm} base_estimator = FastClassifier() sh = RandomHalvingSearchCV(base_estimator, parameters, - n_candidates=n_candidates, - cv=2, - max_budget=max_budget, ratio=2, r_min=4) + n_candidates=n_candidates, + cv=2, + max_budget=max_budget, ratio=2, r_min=4) sh.fit(X, y) assert sh.n_candidates_ == expected_n_candidates_ if n_candidates == 'auto': From 76624768d3d4ab37f2a438c04bf6dfb55e9413a9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2019 13:00:37 -0400 Subject: [PATCH 18/89] reorganized user user guide --- doc/modules/grid_search.rst | 173 +++++++++--------- .../_search_successive_halving.py | 4 +- 2 files changed, 88 insertions(+), 89 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 189b69ac7b613..5716a2eef1469 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -30,14 +30,18 @@ A search consists of: - a cross-validation scheme; and - a :ref:`score function `. -Some models allow for specialized, efficient parameter search strategies, -:ref:`outlined below `. -Two generic approaches to sampling search candidates are provided in +Two generic approaches to parameter search are provided in scikit-learn: for given values, :class:`GridSearchCV` exhaustively considers all parameter combinations, while :class:`RandomizedSearchCV` can sample a given number of candidates from a parameter space with a specified -distribution. After describing these tools we detail -:ref:`best practice ` applicable to both approaches. +distribution. Both these tools have successive halving counterparts +:class:`GridHalvingSearchCV` and :class:`RandomHalvingSearchCV`, which can be +much faster at finding a good parameter combination. + +After describing these tools we detail, :ref:`best practices +` applicable to these approaches. Some models allow for +specialized, efficient parameter search strategies, outlined in +:ref:`alternative_cv`. Note that it is common that a small subset of those parameters can have a large impact on the predictive or computation performance of the model while others @@ -150,88 +154,6 @@ increasing ``n_iter`` will always lead to a finer search. Random search for hyper-parameter optimization, The Journal of Machine Learning Research (2012) -.. _grid_search_tips: - -Tips for parameter search -========================= - -.. _gridsearch_scoring: - -Specifying an objective metric ------------------------------- - -By default, parameter search uses the ``score`` function of the estimator -to evaluate a parameter setting. These are the -:func:`sklearn.metrics.accuracy_score` for classification and -:func:`sklearn.metrics.r2_score` for regression. For some applications, -other scoring functions are better suited (for example in unbalanced -classification, the accuracy score is often uninformative). An alternative -scoring function can be specified via the ``scoring`` parameter to -:class:`GridSearchCV`, :class:`RandomizedSearchCV` and many of the -specialized cross-validation tools described below. -See :ref:`scoring_parameter` for more details. - -.. 
_multimetric_grid_search: - -Specifying multiple metrics for evaluation ------------------------------------------- - -``GridSearchCV`` and ``RandomizedSearchCV`` allow specifying multiple metrics -for the ``scoring`` parameter. - -Multimetric scoring can either be specified as a list of strings of predefined -scores names or a dict mapping the scorer name to the scorer function and/or -the predefined scorer name(s). See :ref:`multimetric_scoring` for more details. - -When specifying multiple metrics, the ``refit`` parameter must be set to the -metric (string) for which the ``best_params_`` will be found and used to build -the ``best_estimator_`` on the whole dataset. If the search should not be -refit, set ``refit=False``. Leaving refit to the default value ``None`` will -result in an error when using multiple metrics. - -See :ref:`sphx_glr_auto_examples_model_selection_plot_multi_metric_evaluation.py` -for an example usage. - -Composite estimators and parameter spaces ------------------------------------------ - -:ref:`pipeline` describes building composite estimators whose -parameter space can be searched with these tools. - -Model selection: development and evaluation -------------------------------------------- - -Model selection by evaluating various parameter settings can be seen as a way -to use the labeled data to "train" the parameters of the grid. - -When evaluating the resulting model it is important to do it on -held-out samples that were not seen during the grid search process: -it is recommended to split the data into a **development set** (to -be fed to the ``GridSearchCV`` instance) and an **evaluation set** -to compute performance metrics. - -This can be done by using the :func:`train_test_split` -utility function. - -Parallelism ------------ - -:class:`GridSearchCV` and :class:`RandomizedSearchCV` evaluate each parameter -setting independently. Computations can be run in parallel if your OS -supports it, by using the keyword ``n_jobs=-1``. See function signature for -more details. - -Robustness to failure ---------------------- - -Some parameter settings may result in a failure to ``fit`` one or more folds -of the data. By default, this will cause the entire search to fail, even if -some parameter settings could be fully evaluated. Setting ``error_score=0`` -(or `=np.NaN`) will make the procedure robust to such failure, issuing a -warning and setting the score for that fold to 0 (or `NaN`), but completing -the search. - - .. _successive_halving_user_guide: Searching optimal parameters with successive halving @@ -443,6 +365,83 @@ eliminated enough candidates during the first iterations, using ``r_i = r_min = `_, in Machine Learning Research 18, 2018. +.. _grid_search_tips: + +Tips for parameter search +========================= + +.. _gridsearch_scoring: + +Specifying an objective metric +------------------------------ + +By default, parameter search uses the ``score`` function of the estimator +to evaluate a parameter setting. These are the +:func:`sklearn.metrics.accuracy_score` for classification and +:func:`sklearn.metrics.r2_score` for regression. For some applications, +other scoring functions are better suited (for example in unbalanced +classification, the accuracy score is often uninformative). An alternative +scoring function can be specified via the ``scoring`` parameter of most +parameter search tools. See :ref:`scoring_parameter` for more details. + +.. 
_multimetric_grid_search: + +Specifying multiple metrics for evaluation +------------------------------------------ + +:class:`GridSearchCV` and :class:`RandomizedSearchCV` allow specifying +multiple metrics for the ``scoring`` parameter. + +Multimetric scoring can either be specified as a list of strings of predefined +scores names or a dict mapping the scorer name to the scorer function and/or +the predefined scorer name(s). See :ref:`multimetric_scoring` for more details. + +When specifying multiple metrics, the ``refit`` parameter must be set to the +metric (string) for which the ``best_params_`` will be found and used to build +the ``best_estimator_`` on the whole dataset. If the search should not be +refit, set ``refit=False``. Leaving refit to the default value ``None`` will +result in an error when using multiple metrics. + +See :ref:`sphx_glr_auto_examples_model_selection_plot_multi_metric_evaluation.py` +for an example usage. + +Composite estimators and parameter spaces +----------------------------------------- + +Please refer to :ref:`pipeline` for performing parameter searches over +pipelines. + +Model selection: development and evaluation +------------------------------------------- + +Model selection by evaluating various parameter settings can be seen as a way +to use the labeled data to "train" the parameters of the grid. + +When evaluating the resulting model it is important to do it on +held-out samples that were not seen during the grid search process: +it is recommended to split the data into a **development set** (to +be fed to the :class:`GridSearchCV` instance) and an **evaluation set** +to compute performance metrics. + +This can be done by using the :func:`train_test_split` +utility function. + +Parallelism +----------- + +The parameter search tools evaluate each parameter setting independently. +Computations can be run in parallel if your OS supports it, by using the +keyword ``n_jobs=-1``. See function signature for more details. + +Robustness to failure +--------------------- + +Some parameter settings may result in a failure to ``fit`` one or more folds +of the data. By default, this will cause the entire search to fail, even if +some parameter settings could be fully evaluated. Setting ``error_score=0`` +(or `=np.NaN`) will make the procedure robust to such failure, issuing a +warning and setting the score for that fold to 0 (or `NaN`), but completing +the search. .. _alternative_cv: diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index ac58c68b8b976..a900852a8166b 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -263,8 +263,8 @@ class GridHalvingSearchCV(BaseSuccessiveHalving): """Search over specified parameter values with successive halving. The search strategy starts evaluating all the candidates with a small - amount a resource and iteratively selects the best candidates, using more - and more resources. + amount of resources and iteratively selects the best candidates, using + more and more resources. Read more in the :ref:`User guide`. 
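
The strategy summarized in this docstring can be made concrete with a minimal usage sketch. The snippet below follows the API as it stands after this patch (``GridHalvingSearchCV`` is renamed in a later patch of this series, and several of its parameters are renamed as well), and mirrors the doctests added to the user guide above; it is an illustration, not part of the patch::

    from sklearn.datasets import make_classification
    from sklearn.svm import SVC
    from sklearn.model_selection import GridHalvingSearchCV  # renamed later in the series

    X, y = make_classification(n_samples=1000, random_state=0)
    param_grid = {'kernel': ('linear', 'rbf'), 'C': [1, 10, 100]}

    # Candidates start on a small subsample of X; after each iteration only
    # the best 1/ratio of them are kept and re-evaluated on a larger subsample.
    search = GridHalvingSearchCV(SVC(gamma='scale'), param_grid, cv=5, ratio=2)
    search.fit(X, y)
    print(search.best_params_)
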
From 00df22de69557b28cf7d9f7daee6c022109ba402 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 31 Jul 2019 12:18:37 -0400 Subject: [PATCH 19/89] renaming --- doc/modules/grid_search.rst | 44 +++++++++---------- .../plot_successive_halving_heatmap.py | 4 +- .../plot_successive_halving_iterations.py | 4 +- sklearn/model_selection/__init__.py | 4 +- .../_search_successive_halving.py | 2 +- .../tests/test_successive_halving.py | 34 +++++++------- 6 files changed, 46 insertions(+), 46 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 5716a2eef1469..00f3e8a0fc2c8 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -35,7 +35,7 @@ scikit-learn: for given values, :class:`GridSearchCV` exhaustively considers all parameter combinations, while :class:`RandomizedSearchCV` can sample a given number of candidates from a parameter space with a specified distribution. Both these tools have successive halving counterparts -:class:`GridHalvingSearchCV` and :class:`RandomHalvingSearchCV`, which can be +:class:`HalvingGridSearchCV` and :class:`HalvingRandomSearchCV`, which can be much faster at finding a good parameter combination. After describing these tools we detail, :ref:`best practices @@ -159,15 +159,15 @@ increasing ``n_iter`` will always lead to a finer search. Searching optimal parameters with successive halving ==================================================== -Scikit-learn also provides the :class:`GridHalvingSearchCV` and -:class:`RandomHalvingSearchCV` estimators that can be used to +Scikit-learn also provides the :class:`HalvingGridSearchCV` and +:class:`HalvingRandomSearchCV` estimators that can be used to search a parameter space using successive halving [1]_ [2]_. Successive halving is an iterative selection process where all candidates are evaluated with a small amount of resources at the first iteration. Only some of these candidates are selected for the next iteration, which will be allocated more resources. What defines a resource is typically the number of -samples to train on, or the number of trees for a gradient boosting / -decision forest estimator. +samples to train on, or the number of iterations in iterative algorithms +like gradient boosting. As illustrated in the figure below, only a small subset of candidates 'survive' until the last iteration. These are the candidates that have consistently been @@ -192,8 +192,8 @@ So in the first iteration, we use ``r_min`` resources ``n_candidates`` times. In the second iteration, we use ``r_min * ratio`` resources ``n_candidates // ratio`` times. The third again multiplies the resources per candidate and divides the number of candidates. This process stops when -the maximum budget per candidate is reached, or when less than ``ratio`` -candidates are left. +the maximum amount of resource per candidate is reached, or when less than +``ratio`` candidates are left. Here is an example with ``r_min=3`` and ``ratio=2``, starting with 70 candidates: @@ -218,25 +218,25 @@ At the last iteration, ``ratio`` candidates are evaluated, and we can pick the best one. Note that each ``r_i`` is a multiple of both ``ratio`` and ``r_min``. -Choosing the budget -------------------- +Choosing a resource to budget +----------------------------- -By default, the budget is defined as the number of samples. That is, each -iteration will use an increasing amount of samples to train on. You can however -manually specify a parameter to use as the budget with the ``budget_on`` -parameter. 
Here is an example where the budget is defined as the number of -iterations of a random forest:: +By default, the budget is defined in terms of number of samples. That is, +each iteration will use an increasing amount of samples to train on. You can +however manually specify a parameter to use as the budget with the +``budget_on`` parameter. Here is an example where the budget is defined in +terms of the number of estimators of a random forest:: >>> from sklearn.datasets import make_classification >>> from sklearn.ensemble import RandomForestClassifier - >>> from sklearn.model_selection import GridHalvingSearchCV + >>> from sklearn.model_selection import HalvingGridSearchCV >>> import pandas as pd >>> >>> param_grid = {'max_depth': [3, 5, 10], ... 'min_samples_split': [2, 5, 10]} >>> base_estimator = RandomForestClassifier(random_state=0) >>> X, y = make_classification(n_samples=1000, random_state=0) - >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, budget_on='n_estimators', ... max_budget=30, random_state=0).fit(X, y) >>> sh.best_estimator_ @@ -253,13 +253,13 @@ a big budget, this may be a waste of resources:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC - >>> from sklearn.model_selection import GridHalvingSearchCV + >>> from sklearn.model_selection import HalvingGridSearchCV >>> import pandas as pd >>> param_grid= {'kernel': ('linear', 'rbf'), ... 'C': [1, 10, 100]} >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) - >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2).fit(X, y) >>> results = pd.DataFrame(sh.cv_results_) >>> results.groupby('iter')['r_i'].unique() @@ -274,7 +274,7 @@ is ``n_samples=1000``. Note in this case that ``r_min = r_0 = 20``. In order for the last iteration to use as many resources as possible, you can use the ``force_exhaust_budget`` parameter.:: - >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, force_exhaust_budget=True, ... ).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) @@ -299,7 +299,7 @@ the number of candidates, the last iteration may have to evaluate more than ``ratio`` candidates:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC - >>> from sklearn.model_selection import GridHalvingSearchCV + >>> from sklearn.model_selection import HalvingGridSearchCV >>> import pandas as pd >>> >>> @@ -307,7 +307,7 @@ the number of candidates, the last iteration may have to evaluate more than ... 'C': [1, 10, 100]} >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) - >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, max_budget=40, ... aggressive_elimination=False).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) @@ -330,7 +330,7 @@ process to end up with less than ``ratio`` candidates at the last iteration. To do this, the process will eliminate as many candidates as necessary using ``r_min`` resources:: - >>> sh = GridHalvingSearchCV(base_estimator, param_grid, cv=5, + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, ... max_budget=40, ... 
aggressive_elimination=True, diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index c6b00805af6e3..efd800b58edf3 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -11,7 +11,7 @@ from sklearn.svm import SVC from sklearn import datasets from sklearn.model_selection import GridSearchCV -from sklearn.model_selection import GridHalvingSearchCV +from sklearn.model_selection import HalvingGridSearchCV rng = np.random.RandomState(0) @@ -23,7 +23,7 @@ clf = SVC(random_state=rng) tic = time() -gsh = GridHalvingSearchCV( +gsh = HalvingGridSearchCV( estimator=clf, param_grid=param_grid, budget_on='n_samples', # budget is the number of samples diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 244b43425e571..ffbfe0653988d 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -9,7 +9,7 @@ import numpy as np from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import RandomHalvingSearchCV +from sklearn.model_selection import HalvingRandomSearchCV rng = np.random.RandomState(0) @@ -24,7 +24,7 @@ "bootstrap": [True, False], "criterion": ["gini", "entropy"]} -rsh = RandomHalvingSearchCV( +rsh = HalvingRandomSearchCV( estimator=clf, param_distributions=param_dist, budget_on='n_samples', # budget is the number of samples diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 58f0bb196e7c1..365f24274da69 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -29,8 +29,8 @@ from ._search import ParameterSampler from ._search import fit_grid_point -from ._search_successive_halving import GridHalvingSearchCV -from ._search_successive_halving import RandomHalvingSearchCV +from ._search_successive_halving import HalvingGridSearchCV +from ._search_successive_halving import HalvingRandomSearchCV __all__ = ('BaseCrossValidator', 'GridSearchCV', diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index a900852a8166b..fcd244d4b54a1 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -356,7 +356,7 @@ class GridHalvingSearchCV(BaseSuccessiveHalving): for a given iteration. By default, this is set ``n_samples`` when ``budget_on='n_samples'`` (default), else an error is raised. - budget_on : `'n_samples'` or str, default='n_samples' + budget_on : ``'n_samples'`` or str, default='n_samples' Defines the nature of the budget. By default, the budget is the number of samples. It can also be set to any parameter of the base estimator that accepts positive integer values, e.g. 
'n_iterations' or diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index c264b2a3d8108..b818af4879499 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -3,8 +3,8 @@ from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier -from sklearn.model_selection import GridHalvingSearchCV -from sklearn.model_selection import RandomHalvingSearchCV +from sklearn.model_selection import HalvingGridSearchCV +from sklearn.model_selection import HalvingRandomSearchCV class FastClassifier(DummyClassifier): @@ -40,7 +40,7 @@ def test_aggressive_elimination(): # aggressive_elimination=True # In this case, the first iterations only use r_min_ resources - sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=True, max_budget=max_budget, ratio=ratio) sh.fit(X, y) @@ -51,7 +51,7 @@ def test_aggressive_elimination(): assert sh.n_remaining_candidates_ == 1 # Make sure we get the same results with randomized search - sh = RandomHalvingSearchCV(base_estimator, parameters, + sh = HalvingRandomSearchCV(base_estimator, parameters, n_candidates=60, cv=5, aggressive_elimination=True, max_budget=max_budget, ratio=ratio) @@ -65,7 +65,7 @@ def test_aggressive_elimination(): # aggressive_elimination=False # In this case we don't loop at the start, and might end up with a lot of # candidates at the last iteration - sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=False, max_budget=max_budget, ratio=ratio) sh.fit(X, y) @@ -81,7 +81,7 @@ def test_aggressive_elimination(): # needed # aggressive_elimination=True - sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=True, max_budget=max_budget, ratio=ratio) sh.fit(X, y) @@ -93,7 +93,7 @@ def test_aggressive_elimination(): assert sh.n_remaining_candidates_ == 1 # aggressive_elimination=False - sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=False, max_budget=max_budget, ratio=ratio) sh.fit(X, y) @@ -117,7 +117,7 @@ def test_force_exhaust_budget_false(): ratio = 3 # with enough budget - sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, force_exhaust_budget=False, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 2 @@ -126,7 +126,7 @@ def test_force_exhaust_budget_false(): assert sh._r_i_list == [20, 60] # with enough budget but r_min!='auto': ignored - sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, force_exhaust_budget=False, ratio=ratio, r_min=50) sh.fit(X, y) @@ -136,7 +136,7 @@ def test_force_exhaust_budget_false(): assert sh._r_i_list == [50, 150] # without enough budget (budget is exhausted anyway) - sh = GridHalvingSearchCV(base_estimator, parameters, cv=5, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, force_exhaust_budget=False, ratio=ratio, max_budget=30) sh.fit(X, y) @@ -167,7 +167,7 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): parameters = {'a': [1, 2], 'b': [1, 2, 3]} base_estimator = FastClassifier() ratio = 3 - sh = 
GridHalvingSearchCV(base_estimator, parameters, cv=5, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, force_exhaust_budget=True, ratio=ratio, max_budget=max_budget) sh.fit(X, y) @@ -176,7 +176,7 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): assert sh._r_i_list == r_i_list # Test same for randomized search - sh = RandomHalvingSearchCV(base_estimator, parameters, n_candidates=6, + sh = HalvingRandomSearchCV(base_estimator, parameters, n_candidates=6, cv=5, force_exhaust_budget=True, ratio=ratio, max_budget=max_budget) sh.fit(X, y) @@ -207,7 +207,7 @@ def test_n_iterations(max_budget, n_iterations, n_possible_iterations): base_estimator = FastClassifier() ratio = 2 - sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, ratio=ratio, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, ratio=ratio, max_budget=max_budget, r_min=4) sh.fit(X, y) assert sh.n_required_iterations_ == 5 @@ -222,7 +222,7 @@ def test_budget_on(): X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() - sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, budget_on='c', max_budget=10, ratio=3) sh.fit(X, y) assert set(sh._r_i_list) == set([1, 3, 9]) @@ -234,7 +234,7 @@ def test_budget_on(): with pytest.raises( ValueError, match='Cannot budget on parameter 1234 which is not supported '): - sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, budget_on='1234', max_budget=10) sh.fit(X, y) @@ -243,7 +243,7 @@ def test_budget_on(): match='Cannot budget on parameter c since it is part of the ' 'searched parameters.'): parameters = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]} - sh = GridHalvingSearchCV(base_estimator, parameters, cv=2, + sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, budget_on='c', max_budget=10) sh.fit(X, y) @@ -264,7 +264,7 @@ def test_random_search(max_budget, n_candidates, expected_n_candidates_): X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': norm, 'b': norm} base_estimator = FastClassifier() - sh = RandomHalvingSearchCV(base_estimator, parameters, + sh = HalvingRandomSearchCV(base_estimator, parameters, n_candidates=n_candidates, cv=2, max_budget=max_budget, ratio=2, r_min=4) From 1b1554e2f34325418dc033c71fa944cb6c04857a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 31 Jul 2019 14:51:23 -0400 Subject: [PATCH 20/89] update user guide --- doc/modules/classes.rst | 4 +- doc/modules/grid_search.rst | 90 ++++++++++++++++--- sklearn/model_selection/__init__.py | 4 +- .../_search_successive_halving.py | 6 +- 4 files changed, 85 insertions(+), 19 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 5819e4dd57d90..28602c85fbcfa 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1085,11 +1085,11 @@ Hyper-parameter optimizers :template: class.rst model_selection.GridSearchCV - model_selection.GridHalvingSearchCV + model_selection.HalvingGridSearchCV model_selection.ParameterGrid model_selection.ParameterSampler model_selection.RandomizedSearchCV - model_selection.RandomHalvingSearchCV + model_selection.HalvingRandomSearchCV .. 
autosummary:: diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 00f3e8a0fc2c8..7c3c4224742bf 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -162,23 +162,83 @@ Searching optimal parameters with successive halving Scikit-learn also provides the :class:`HalvingGridSearchCV` and :class:`HalvingRandomSearchCV` estimators that can be used to search a parameter space using successive halving [1]_ [2]_. Successive -halving is an iterative selection process where all candidates are evaluated -with a small amount of resources at the first iteration. Only some of -these candidates are selected for the next iteration, which will be -allocated more resources. What defines a resource is typically the number of -samples to train on, or the number of iterations in iterative algorithms -like gradient boosting. +halving is an iterative selection process where all candidates (the +parameter combinations) are evaluated with a small amount of resources at +the first iteration. Only some of these candidates are selected for the next +iteration, which will be allocated more resources. What defines a resource +is typically the number of samples to train on, or the number of iterations +in iterative algorithms like gradient boosting. As illustrated in the figure below, only a small subset of candidates 'survive' until the last iteration. These are the candidates that have consistently been -part of the best candidates across all iterations. +part of the best candidates across all iterations. Each iteration is allocated +an increasing amount of resources, here the number of samples. .. figure:: ../auto_examples/svm/images/sphx_glr_plot_successive_halving_iterations_001.png :target: ../auto_examples/model_selection/plot_successive_halving_iterations.html :align: center -The amount of resources ``r_i`` allocated for each candidate at iteration -``i`` is controlled by the parameters ``ratio`` and ``r_min`` as follows:: +The ``ratio`` parameter controls the rate at which the resources grow, and +the rate at which the number of candidate decreases (more details in +:ref:`amount_of_resource_and_number_of_candidates`) + + +Choosing ``r_min`` and the number of candidates +----------------------------------------------- + +Beside ``ratio``, the two main parameters that influence the behaviour of a +successive halving search are the ``r_min`` parameter, and the number of +candidates (or parameter combinations) that are evaluated. ``r_min`` is the +amount of resources allocated at the first iteration for each candidate. The +number of candidates is specified directly in +:class:`HalvingRandomSearchCV`, and is determined from the ``param_grid`` +parameter of :class:`HalvingGridSearchCV`. + +Consider a case where we have 1000 samples. With ``r_min=10`` and +``ratio=2`` we are able to run 7 iterations, with the following number of +samples: ``[10, 20, 40, 80, 160, 320, 640]``. + +If we start with a high number of candidates, we might end up with a lot of +candidates at the last iteration. On the other hand if we start with a small +number of candidates, the last iteration might use less than 640 samples +which is a waste of resources. + +In the case of :class:`HalvingGridSearchCV`, the number of candidates is set +by default such that the maximum amount of resources is used at the last +iteration. + +Changing the value of ``r_min`` will impact the number of possible +iterations, and as a result will also have an effect on the ideal number of +candidates. 
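
The 1000-sample example above can be reproduced with a few lines of plain Python. This is only a sketch of the schedule implied by ``r_min`` (renamed ``min_resources`` later in the series) and ``ratio``; it mirrors the ``n_possible_iterations`` computation in ``_search_successive_halving.py`` but is purely illustrative::

    from math import floor, log

    n_samples, r_min, ratio = 1000, 10, 2

    # one iteration for each power of ratio that keeps
    # r_i = r_min * ratio**i within the available number of samples
    n_possible_iterations = 1 + floor(log(n_samples // r_min, ratio))
    schedule = [r_min * ratio**i for i in range(n_possible_iterations)]

    print(n_possible_iterations)  # 7
    print(schedule)               # [10, 20, 40, 80, 160, 320, 640]
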
+ +Another consideration when choosing ``r_min`` is whether or not it is easy +to discriminate between good and bad candidates with a small amount of +resources. For example, if you need a lot of samples to distinguish between +good and bad parameters, a high ``r_min`` (possibly with the use of +``aggressive_elimination=True``) is recommended. On the other hand if the +distinction is clear even with a small amount of samples, then a small +``r_min`` may be preferable since it would speed up the computation. + +By default, ``r_min`` is set to a small value (see docstrings for details) +that depends on the number of folds, and the number of classes for +classification problems. Depending on the setting, the default valueof +``r_min`` might not be ideal. + +.. note:: + Notice in the example above that the last iteration does not use the + maximum amount of resources available: 1000 samples are available, yet + only 640 are used. Using ``force_exhaust_budget=True`` will set ``r_min`` + to a specific value such that the last iteration uses as many samples as + possible. Please see :ref:`exhausting_the_budget` for details. + +.. _amount_of_resource_and_number_of_candidates: + +Amount of resource and number of candidates at each iteration +------------------------------------------------------------- + +The amount of resources ``r_i`` (e.g. the number of samples) allocated for +each candidate at iteration ``i`` is controlled by the parameters ``ratio`` +and ``r_min`` as follows:: r_i = ratio**i * r_min @@ -214,9 +274,13 @@ candidates: | 48 * 2 = 96 | 4 // 2 = 2 | +-------------+-----------------------+ -At the last iteration, ``ratio`` candidates are evaluated, and we can pick -the best one. Note that each ``r_i`` is a multiple of both ``ratio`` and -``r_min``. +Ideally, at the last iteration, ``ratio`` candidates are evaluated, and we +can pick the best one. Note that each ``r_i`` is a multiple of both +``ratio`` and ``r_min``. + +The amount of resource that is used at each iteration can be found using the +`cv_results_` after converting it to a dataframe: +`results.groupby('iter')['r_i'].unique()` Choosing a resource to budget ----------------------------- @@ -245,6 +309,8 @@ terms of the number of estimators of a random forest:: Note that it is not possible to budget on a parameter that is part of the parameter grid. +.. 
_exhausting_the_budget: + Exhausting the budget --------------------- diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 365f24274da69..70b671c446d72 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -34,7 +34,7 @@ __all__ = ('BaseCrossValidator', 'GridSearchCV', - 'GridHalvingSearchCV', + 'HalvingGridSearchCV', 'TimeSeriesSplit', 'KFold', 'GroupKFold', @@ -49,7 +49,7 @@ 'ParameterSampler', 'PredefinedSplit', 'RandomizedSearchCV', - 'RandomHalvingSearchCV', + 'HalvingRandomSearchCV', 'ShuffleSplit', 'StratifiedKFold', 'StratifiedShuffleSplit', diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index fcd244d4b54a1..885e8110be7ef 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -12,7 +12,7 @@ from ..utils import resample -__all__ = ['GridHalvingSearchCV', 'RandomHalvingSearchCV'] +__all__ = ['HalvingGridSearchCV', 'HalvingRandomSearchCV'] def _refit_callable(results): @@ -259,7 +259,7 @@ def _generate_candidate_params(self): pass -class GridHalvingSearchCV(BaseSuccessiveHalving): +class HalvingGridSearchCV(BaseSuccessiveHalving): """Search over specified parameter values with successive halving. The search strategy starts evaluating all the candidates with a small @@ -556,7 +556,7 @@ def _generate_candidate_params(self): return ParameterGrid(self.param_grid) -class RandomHalvingSearchCV(BaseSuccessiveHalving): +class HalvingRandomSearchCV(BaseSuccessiveHalving): """Randomized search on hyper parameters. The search strategy starts evaluating all the candidates with a small From 3ff395e9358042b39b97f8867fee9af52ff90e4d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 31 Jul 2019 16:34:52 -0400 Subject: [PATCH 21/89] remove groups support + pass fit_params --- sklearn/model_selection/_search.py | 9 ++-- .../_search_successive_halving.py | 46 ++++++++++++++++--- .../tests/test_successive_halving.py | 11 +++++ 3 files changed, 55 insertions(+), 11 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index b73dce0c89f58..6c26efafdc03c 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -650,7 +650,8 @@ def fit(self, X, y=None, groups=None, **fit_params): all_more_results = defaultdict(list) def evaluate_candidates(candidate_params, X, y, - more_results=None): + more_results=None, + **fit_params): candidate_params = list(candidate_params) n_candidates = len(candidate_params) @@ -692,7 +693,7 @@ def evaluate_candidates(candidate_params, X, y, return results - self._run_search(evaluate_candidates, X, y) + self._run_search(evaluate_candidates, X, y, **fit_params) # For multi-metric evaluation, store the best_index_, best_params_ and # best_score_ iff refit is one of the scorer names @@ -1125,7 +1126,7 @@ def __init__(self, estimator, param_grid, scoring=None, self.param_grid = param_grid _check_param_grid(param_grid) - def _run_search(self, evaluate_candidates, X, y): + def _run_search(self, evaluate_candidates, X, y, **fit_params): """Search all candidates in param_grid""" evaluate_candidates(ParameterGrid(self.param_grid), X, y) @@ -1453,7 +1454,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score) - def _run_search(self, evaluate_candidates, X, y): + def 
_run_search(self, evaluate_candidates, X, y, **fit_params): """Search n_iter candidates from param_distributions""" evaluate_candidates(ParameterSampler( self.param_distributions, self.n_iter, diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 885e8110be7ef..bf1d3e2748c3b 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -1,5 +1,6 @@ from math import ceil, floor, log from abc import abstractmethod +from collections import OrderedDict import numpy as np from ._search import _check_param_grid @@ -59,6 +60,9 @@ def __init__(self, estimator, scoring=None, def _check_input_parameters(self, X, y, groups): + if groups is not None: + raise ValueError('groups are not supported.') + if self.scoring is not None and not (isinstance(self.scoring, str) or callable(self.scoring)): raise ValueError('scoring parameter must be a string, ' @@ -126,18 +130,37 @@ def _check_input_parameters(self, X, y, groups): ) def fit(self, X, y=None, groups=None, **fit_params): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like, shape (n_samples, n_features) + Training vector, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape (n_samples,) or (n_samples, n_output), optional + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : None + Groups are not supported + + **fit_params : dict of string -> object + Parameters passed to the ``fit`` method of the estimator + """ self._check_input_parameters( X=X, y=y, groups=groups, ) - super().fit(X, y=y, groups=groups, **fit_params) + super().fit(X, y=y, groups=None, **fit_params) # Set best_score_: BaseSearchCV does not set it, as refit is a callable self.best_score_ = ( self.cv_results_['mean_test_score'][self.best_index_]) return self - def _run_search(self, evaluate_candidates, X, y): + def _run_search(self, evaluate_candidates, X, y, **fit_params): rng = check_random_state(self.random_state) candidate_params = self._generate_candidate_params() @@ -218,20 +241,29 @@ def _run_search(self, evaluate_candidates, X, y): print('r_i: {}'.format(r_i)) if self.budget_on == 'n_samples': + # Subsample X and y as well as fit_params stratify = y if is_classifier(self.estimator) else None - X_iter, y_iter = resample(X, y, replace=False, - random_state=rng, stratify=stratify, - n_samples=r_i) + fit_params = OrderedDict(fit_params) + X_iter, y_iter, *fit_params_iter_list = resample( + X, y, *fit_params.values(), replace=False, + random_state=rng, stratify=stratify, n_samples=r_i) + fit_params_iter = { + key: fit_params_iter_list[i] + for (i, key) in enumerate(fit_params.keys()) + } else: - # Need copy so that r_i of next iteration do not overwrite + # Need copy so that r_i of next iteration does not overwrite candidate_params = [c.copy() for c in candidate_params] for candidate in candidate_params: candidate[self.budget_on] = r_i X_iter, y_iter = X, y + fit_params_iter = fit_params + more_results = {'iter': [iter_i] * n_candidates, 'r_i': [r_i] * n_candidates} results = evaluate_candidates(candidate_params, X_iter, y_iter, - more_results=more_results) + more_results=more_results, + **fit_params_iter) n_candidates_to_keep = ceil(n_candidates / self.ratio) candidate_params = self._top_k(results, diff --git a/sklearn/model_selection/tests/test_successive_halving.py 
b/sklearn/model_selection/tests/test_successive_halving.py index b818af4879499..559f62981497c 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -274,3 +274,14 @@ def test_random_search(max_budget, n_candidates, expected_n_candidates_): # Make sure 'auto' makes the last iteration use as much budget as we # can assert sh._r_i_list[-1] == max_budget + + +def test_groups_not_supported(): + base_estimator = FastClassifier() + param_grid = {'a': [1]} + sh = HalvingRandomSearchCV(base_estimator, param_grid) + X, y = make_classification(n_samples=10) + groups = [0] * 10 + + with pytest.raises(ValueError, match="groups are not supported"): + sh.fit(X, y, groups) From 1cf9bf729c7e9d86f4cf864e8cadefe6dda93952 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 31 Jul 2019 17:03:03 -0400 Subject: [PATCH 22/89] parameter renaming --- doc/modules/grid_search.rst | 116 ++++---- .../plot_successive_halving_heatmap.py | 6 +- .../plot_successive_halving_iterations.py | 6 +- .../_search_successive_halving.py | 264 +++++++++--------- .../tests/test_successive_halving.py | 80 +++--- 5 files changed, 238 insertions(+), 234 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 7c3c4224742bf..e04504c97ac06 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -183,18 +183,18 @@ the rate at which the number of candidate decreases (more details in :ref:`amount_of_resource_and_number_of_candidates`) -Choosing ``r_min`` and the number of candidates ------------------------------------------------ +Choosing ``min_resources`` and the number of candidates +------------------------------------------------------- Beside ``ratio``, the two main parameters that influence the behaviour of a -successive halving search are the ``r_min`` parameter, and the number of -candidates (or parameter combinations) that are evaluated. ``r_min`` is the -amount of resources allocated at the first iteration for each candidate. The -number of candidates is specified directly in -:class:`HalvingRandomSearchCV`, and is determined from the ``param_grid`` +successive halving search are the ``min_resources`` parameter, and the +number of candidates (or parameter combinations) that are evaluated. +``min_resources`` is the amount of resources allocated at the first +iteration for each candidate. The number of candidates is specified directly +in :class:`HalvingRandomSearchCV`, and is determined from the ``param_grid`` parameter of :class:`HalvingGridSearchCV`. -Consider a case where we have 1000 samples. With ``r_min=10`` and +Consider a case where we have 1000 samples. With ``min_resources=10`` and ``ratio=2`` we are able to run 7 iterations, with the following number of samples: ``[10, 20, 40, 80, 160, 320, 640]``. @@ -207,29 +207,30 @@ In the case of :class:`HalvingGridSearchCV`, the number of candidates is set by default such that the maximum amount of resources is used at the last iteration. -Changing the value of ``r_min`` will impact the number of possible +Changing the value of ``min_resources`` will impact the number of possible iterations, and as a result will also have an effect on the ideal number of candidates. -Another consideration when choosing ``r_min`` is whether or not it is easy -to discriminate between good and bad candidates with a small amount of -resources. 
For example, if you need a lot of samples to distinguish between -good and bad parameters, a high ``r_min`` (possibly with the use of -``aggressive_elimination=True``) is recommended. On the other hand if the -distinction is clear even with a small amount of samples, then a small -``r_min`` may be preferable since it would speed up the computation. +Another consideration when choosing ``min_resources`` is whether or not it +is easy to discriminate between good and bad candidates with a small amount +of resources. For example, if you need a lot of samples to distinguish +between good and bad parameters, a high ``min_resources`` (possibly with the +use of ``aggressive_elimination=True``) is recommended. On the other hand if +the distinction is clear even with a small amount of samples, then a small +``min_resources`` may be preferable since it would speed up the computation. -By default, ``r_min`` is set to a small value (see docstrings for details) -that depends on the number of folds, and the number of classes for +By default, ``min_resources`` is set to a small value (see docstrings for +details) that depends on the number of folds, and the number of classes for classification problems. Depending on the setting, the default valueof -``r_min`` might not be ideal. +``min_resources`` might not be ideal. .. note:: Notice in the example above that the last iteration does not use the maximum amount of resources available: 1000 samples are available, yet - only 640 are used. Using ``force_exhaust_budget=True`` will set ``r_min`` - to a specific value such that the last iteration uses as many samples as - possible. Please see :ref:`exhausting_the_budget` for details. + only 640 are used. Using ``force_exhaust_resources=True`` will set + ``min_resources`` to a specific value such that the last iteration uses as + many samples as possible. Please see :ref:`exhausting_the_resources` for + details. .. _amount_of_resource_and_number_of_candidates: @@ -238,30 +239,30 @@ Amount of resource and number of candidates at each iteration The amount of resources ``r_i`` (e.g. the number of samples) allocated for each candidate at iteration ``i`` is controlled by the parameters ``ratio`` -and ``r_min`` as follows:: +and ``min_resources`` as follows:: - r_i = ratio**i * r_min + r_i = ratio**i * min_resources -``r_min`` is the amount of resources used at the first iteration and +``min_resources`` is the amount of resources used at the first iteration and ``ratio`` defines the proportions of candidates that will be selected for the next iteration:: n_candidates_to_keep = n_candidates_at_i // ratio -So in the first iteration, we use ``r_min`` resources ``n_candidates`` -times. In the second iteration, we use ``r_min * ratio`` resources +So in the first iteration, we use ``min_resources`` resources ``n_candidates`` +times. In the second iteration, we use ``min_resources * ratio`` resources ``n_candidates // ratio`` times. The third again multiplies the resources per candidate and divides the number of candidates. This process stops when the maximum amount of resource per candidate is reached, or when less than ``ratio`` candidates are left. 
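
The halving schedule described in this paragraph, and tabulated in the example just below, can be sketched in plain Python. The snippet ignores the cap on the maximum amount of resources and is purely illustrative::

    min_resources, ratio = 3, 2
    n_candidates, r_i = 70, min_resources

    # keep halving while at least `ratio` candidates are left
    while n_candidates >= ratio:
        print(r_i, n_candidates)
        n_candidates //= ratio
        r_i *= ratio

    # prints, matching the table below:
    # 3 70
    # 6 35
    # 12 17
    # 24 8
    # 48 4
    # 96 2
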
-Here is an example with ``r_min=3`` and ``ratio=2``, starting with 70 +Here is an example with ``min_resources=3`` and ``ratio=2``, starting with 70 candidates: +-------------+-----------------------+ | ``r_i`` | ``n_candidates_at_i`` | +=============+=======================+ -| 3 (=r_min) | 70 (=n_candidates) | +| 3 (=min_resources) | 70 (=n_candidates) | +-------------+-----------------------+ | 3 * 2 = 6 | 70 // 2 = 35 | +-------------+-----------------------+ @@ -276,7 +277,7 @@ candidates: Ideally, at the last iteration, ``ratio`` candidates are evaluated, and we can pick the best one. Note that each ``r_i`` is a multiple of both -``ratio`` and ``r_min``. +``ratio`` and ``min_resources``. The amount of resource that is used at each iteration can be found using the `cv_results_` after converting it to a dataframe: @@ -288,7 +289,7 @@ Choosing a resource to budget By default, the budget is defined in terms of number of samples. That is, each iteration will use an increasing amount of samples to train on. You can however manually specify a parameter to use as the budget with the -``budget_on`` parameter. Here is an example where the budget is defined in +``resource`` parameter. Here is an example where the budget is defined in terms of the number of estimators of a random forest:: >>> from sklearn.datasets import make_classification @@ -301,21 +302,22 @@ terms of the number of estimators of a random forest:: >>> base_estimator = RandomForestClassifier(random_state=0) >>> X, y = make_classification(n_samples=1000, random_state=0) >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, budget_on='n_estimators', - ... max_budget=30, random_state=0).fit(X, y) + ... ratio=2, resource='n_estimators', + ... max_resources=30, random_state=0).fit(X, y) >>> sh.best_estimator_ RandomForestClassifier(max_depth=5, n_estimators=8, random_state=0) Note that it is not possible to budget on a parameter that is part of the parameter grid. -.. _exhausting_the_budget: +.. _exhausting_the_resources: -Exhausting the budget ---------------------- +Exhausting the available resources +---------------------------------- -As mentioned above, the first iteration uses ``r_min`` resources. If you have -a big budget, this may be a waste of resources:: +As mentioned above, the first iteration uses ``min_resources`` resources. If +you have a lots of resources available, some of them might be wasted (not +used):: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC @@ -335,13 +337,14 @@ a big budget, this may be a waste of resources:: 2 [80] Name: r_i, dtype: object -The search process will only use 80 resources at most, while our maximum budget -is ``n_samples=1000``. Note in this case that ``r_min = r_0 = 20``. In order -for the last iteration to use as many resources as possible, you can use the -``force_exhaust_budget`` parameter.:: +The search process will only use 80 resources at most, while our maximum +amount of available resources is ``n_samples=1000``. Note in this case that +``min_resources = r_0 = 20``. In order for the last iteration to use as many +resources as possible, you can use the ``force_exhaust_resources`` +parameter.:: >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, force_exhaust_budget=True, + ... ratio=2, force_exhaust_resources=True, ... 
).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) >>> results.groupby('iter')['r_i'].unique() @@ -351,18 +354,19 @@ for the last iteration to use as many resources as possible, you can use the 2 [1000] Name: r_i, dtype: object - -`r_min` was here automatically set to 250, which results in the last -iteration using all the budget. Since ``force_exhaust_budget`` chooses an -appropriate ``r_min`` to start with, ``r_min`` must be set to 'auto' (default). +`min_resources` was here automatically set to 250, which results in the last +iteration using all the resources. Since ``force_exhaust_resources`` chooses an +appropriate ``min_resources`` to start with, ``min_resources`` must be set +to 'auto' (default). Aggressive elimination of candidates ------------------------------------ Ideally, we want the last iteration to evaluate ``ratio`` candidates. We then -just have to pick the best one. When the budget is small with respect to -the number of candidates, the last iteration may have to evaluate more than -``ratio`` candidates:: +just have to pick the best one. When the number of available resources is +small with respect to the number of candidates, the last iteration may have +to evaluate more than ``ratio`` candidates:: + >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC >>> from sklearn.model_selection import HalvingGridSearchCV @@ -374,7 +378,7 @@ the number of candidates, the last iteration may have to evaluate more than >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, max_budget=40, + ... ratio=2, max_resources=40, ... aggressive_elimination=False).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) >>> results.groupby('iter').r_i.unique() @@ -388,17 +392,17 @@ the number of candidates, the last iteration may have to evaluate more than 1 3 Name: r_i, dtype: int64 -Since we cannot use more than ``max_budget=40`` resources, the process has to +Since we cannot use more than ``max_resources=40`` resources, the process has to stop at the second iteration which evaluates more than ``ratio=2`` candidates. Using the ``aggressive_elimination`` parameter, you can force the search process to end up with less than ``ratio`` candidates at the last iteration. To do this, the process will eliminate as many candidates as -necessary using ``r_min`` resources:: +necessary using ``min_resources`` resources:: >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, - ... max_budget=40, + ... max_resources=40, ... aggressive_elimination=True, ... ).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) @@ -416,8 +420,8 @@ necessary using ``r_min`` resources:: Name: r_i, dtype: int64 Notice that we end with 2 candidates at the last iteration since we have -eliminated enough candidates during the first iterations, using ``r_i = r_min = -20``. +eliminated enough candidates during the first iterations, using ``r_i = +min_resources = 20``. .. 
topic:: References: diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index efd800b58edf3..21eb00dd70ed3 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -26,9 +26,9 @@ gsh = HalvingGridSearchCV( estimator=clf, param_grid=param_grid, - budget_on='n_samples', # budget is the number of samples - max_budget='auto', # max_budget=n_samples - force_exhaust_budget=True, + resource='n_samples', + max_resources='auto', # max_resources=n_samples + force_exhaust_resources=True, cv=5, ratio=2, random_state=rng) diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index ffbfe0653988d..4e0fef699eeef 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -27,9 +27,9 @@ rsh = HalvingRandomSearchCV( estimator=clf, param_distributions=param_dist, - budget_on='n_samples', # budget is the number of samples - max_budget='auto', # max_budget=n_samples - n_candidates='auto', # choose n_cdts so that last iter exhausts budget + resource='n_samples', + max_resources='auto', # max_resources=n_samples + n_candidates='auto', # choose n_cdts so that last iter exhausts resources cv=5, ratio=2, random_state=rng) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index bf1d3e2748c3b..d3fd67a560629 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -39,9 +39,9 @@ def __init__(self, estimator, scoring=None, n_jobs=None, refit=True, cv=5, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=True, - max_budget='auto', budget_on='n_samples', ratio=3, - r_min='auto', aggressive_elimination=False, - force_exhaust_budget=False): + max_resources='auto', min_resources='auto', + resource='n_samples', ratio=3, aggressive_elimination=False, + force_exhaust_resources=False): refit = _refit_callable if refit else False super().__init__(estimator, scoring=scoring, @@ -51,12 +51,12 @@ def __init__(self, estimator, scoring=None, return_train_score=return_train_score) self.random_state = random_state - self.max_budget = max_budget - self.budget_on = budget_on + self.max_resources = max_resources + self.resource = resource self.ratio = ratio - self.r_min = r_min + self.min_resources = min_resources self.aggressive_elimination = aggressive_elimination - self.force_exhaust_budget = force_exhaust_budget + self.force_exhaust_resources = force_exhaust_resources def _check_input_parameters(self, X, y, groups): @@ -68,65 +68,65 @@ def _check_input_parameters(self, X, y, groups): raise ValueError('scoring parameter must be a string, ' 'a callable or None.') - if (self.budget_on != 'n_samples' - and self.budget_on not in self.estimator.get_params()): + if (self.resource != 'n_samples' + and self.resource not in self.estimator.get_params()): raise ValueError( - 'Cannot budget on parameter {} which is not supported ' - 'by estimator {}'.format(self.budget_on, + 'Cannot use resource={} which is not supported ' + 'by estimator {}'.format(self.resource, self.estimator.__class__.__name__)) - if isinstance(self.max_budget, str) and self.max_budget != 'auto': + if isinstance(self.max_resources, str) and 
self.max_resources != 'auto': raise ValueError( - "max_budget must be either 'auto' or a positive number" + "max_resources must be either 'auto' or a positive number" ) - if self.max_budget != 'auto' and self.max_budget <= 0: + if self.max_resources != 'auto' and self.max_resources <= 0: raise ValueError( - "max_budget must be either 'auto' or a positive number" + "max_resources must be either 'auto' or a positive number" ) - if isinstance(self.r_min, str) and self.r_min != 'auto': + if isinstance(self.min_resources, str) and self.min_resources != 'auto': raise ValueError( - "r_min must be either 'auto' or a positive number no greater " - "than max_budget." + "min_resources must be either 'auto' or a positive number no greater " + "than max_resources." ) - if self.r_min != 'auto' and self.r_min <= 0: + if self.min_resources != 'auto' and self.min_resources <= 0: raise ValueError( - "r_min must be either 'auto' or a positive number no greater " - "than max_budget." + "min_resources must be either 'auto' or a positive number no greater " + "than max_resources." ) - if self.force_exhaust_budget and self.r_min != 'auto': + if self.force_exhaust_resources and self.min_resources != 'auto': raise ValueError( - 'r_min must be set to auto if force_exhaust_budget is True.' + 'min_resources must be set to auto if force_exhaust_resources is True.' ) - self.r_min_ = self.r_min - if self.r_min_ == 'auto': - if self.budget_on == 'n_samples': + self.min_resources_ = self.min_resources + if self.min_resources_ == 'auto': + if self.resource == 'n_samples': cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) n_splits = cv.get_n_splits(X, y, groups) # please see https://gph.is/1KjihQe for a justification magic_factor = 2 - self.r_min_ = n_splits * magic_factor + self.min_resources_ = n_splits * magic_factor if is_classifier(self.estimator): n_classes = np.unique(y).shape[0] - self.r_min_ *= n_classes + self.min_resources_ *= n_classes else: - self.r_min_ = 1 + self.min_resources_ = 1 - self.max_budget_ = self.max_budget - if self.max_budget_ == 'auto': - if not self.budget_on == 'n_samples': + self.max_resources_ = self.max_resources + if self.max_resources_ == 'auto': + if not self.resource == 'n_samples': raise ValueError( - "max_budget can only be 'auto' if budget_on='n_samples'") - self.max_budget_ = _num_samples(X) + "max_resources can only be 'auto' if resource='n_samples'") + self.max_resources_ = _num_samples(X) - if self.r_min_ > self.max_budget_: + if self.min_resources_ > self.max_resources_: raise ValueError( - 'r_min_={} is greater than max_budget_={}.' - .format(self.r_min_, self.max_budget_) + 'min_resources_={} is greater than max_resources_={}.' 
+ .format(self.min_resources_, self.max_resources_) ) def fit(self, X, y=None, groups=None, **fit_params): @@ -169,32 +169,32 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): candidate_params = [dict(t) for t in candidate_params] self.n_candidates_ = len(candidate_params) - if self.budget_on != 'n_samples' and any( - self.budget_on in candidate for candidate in candidate_params): + if self.resource != 'n_samples' and any( + self.resource in candidate for candidate in candidate_params): # Can only check this now since we need the candidates list raise ValueError( "Cannot budget on parameter {} since it is part of " - "the searched parameters.".format(self.budget_on)) + "the searched parameters.".format(self.resource)) # n_required_iterations is the number of iterations needed so that the # last iterations evaluates less than `ratio` candidates. n_required_iterations = 1 + floor(log(self.n_candidates_, self.ratio)) - if self.force_exhaust_budget: - # To exhaust the budget, we want to start with the biggest r_min + if self.force_exhaust_resources: + # To exhaust the budget, we want to start with the biggest min_resources # possible so that the last (required) iteration uses as many # resources as possible - # We only force exhausting the budget if r_min wasn't specified by + # We only force exhausting the budget if min_resources wasn't specified by # the user. last_iteration = n_required_iterations - 1 - self.r_min_ = max(self.r_min_, - self.max_budget_ // self.ratio**last_iteration) + self.min_resources_ = max(self.min_resources_, + self.max_resources_ // self.ratio**last_iteration) # n_possible_iterations is the number of iterations that we can - # actually do starting from r_min and without exceeding the budget. + # actually do starting from min_resources and without exceeding the budget. # Depending on budget size the number of candidates, this may be higher # or smaller than n_required_iterations. 
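# Illustrative sketch (not part of this patch): the two iteration counts
# discussed in the comments above, computed standalone with numbers matching
# the user-guide example added later in this series (assumed: 6 candidates,
# ratio=2, min_resources_=20, max_resources_=1000).
from math import floor, log

n_candidates, ratio = 6, 2
min_resources_, max_resources_ = 20, 1000

# iterations needed so that the last one evaluates fewer than `ratio` candidates
n_required_iterations = 1 + floor(log(n_candidates, ratio))                       # 3
# iterations affordable starting from min_resources_ without exceeding max_resources_
n_possible_iterations = 1 + floor(log(max_resources_ // min_resources_, ratio))   # 6
# with force_exhaust_resources=True, min_resources_ would instead be raised to
min_resources_exhausted = max(
    min_resources_, max_resources_ // ratio ** (n_required_iterations - 1))       # 250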
- n_possible_iterations = 1 + floor(log(self.max_budget_ // self.r_min_, + n_possible_iterations = 1 + floor(log(self.max_resources_ // self.min_resources_, self.ratio)) if self.aggressive_elimination: @@ -206,11 +206,11 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): print('n_iterations: {}'.format(n_iterations)) print('n_required_iterations: {}'.format(n_required_iterations)) print('n_possible_iterations: {}'.format(n_possible_iterations)) - print('r_min_: {}'.format(self.r_min_)) - print('max_budget_: {}'.format(self.max_budget_)) + print('min_resources_: {}'.format(self.min_resources_)) + print('max_resources_: {}'.format(self.max_resources_)) print('aggressive_elimination: {}'.format( self.aggressive_elimination)) - print('force_exhaust_budget: {}'.format(self.force_exhaust_budget)) + print('force_exhaust_resources: {}'.format(self.force_exhaust_resources)) print('ratio: {}'.format(self.ratio)) self._r_i_list = [] # list of r_i for each iteration, used in tests @@ -228,8 +228,8 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): iter_i - n_required_iterations + n_possible_iterations ) - r_i = int(self.ratio**power * self.r_min_) - r_i = min(r_i, self.max_budget_) # guard, probably not needed + r_i = int(self.ratio**power * self.min_resources_) + r_i = min(r_i, self.max_resources_) # guard, probably not needed self._r_i_list.append(r_i) n_candidates = len(candidate_params) @@ -240,7 +240,7 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): print('n_candidates: {}'.format(n_candidates)) print('r_i: {}'.format(r_i)) - if self.budget_on == 'n_samples': + if self.resource == 'n_samples': # Subsample X and y as well as fit_params stratify = y if is_classifier(self.estimator) else None fit_params = OrderedDict(fit_params) @@ -255,7 +255,7 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): # Need copy so that r_i of next iteration does not overwrite candidate_params = [c.copy() for c in candidate_params] for candidate in candidate_params: - candidate[self.budget_on] = r_i + candidate[self.resource] = r_i X_iter, y_iter = X, y fit_params_iter = fit_params @@ -383,39 +383,39 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): expensive and is not strictly required to select the parameters that yield the best generalization performance. - max_budget : int, default='auto' + max_resources : int, default='auto' The maximum number of resources that any candidate is allowed to use for a given iteration. By default, this is set ``n_samples`` when - ``budget_on='n_samples'`` (default), else an error is raised. + ``resource='n_samples'`` (default), else an error is raised. - budget_on : ``'n_samples'`` or str, default='n_samples' - Defines the nature of the budget. By default, the budget is the number - of samples. It can also be set to any parameter of the base estimator - that accepts positive integer values, e.g. 'n_iterations' or - 'n_estimators' for a gradient boosting estimator. In this case - ``max_budget`` cannot be 'auto'. - - ratio : int or float, default=3 - The 'halving' parameter, which determines the proportion of candidates - that are selected for the next iteration. For example, ``ratio=3`` - means that only one third of the candidates are selected. - - r_min : int, default='auto' + min_resources : int, default='auto' The minimum amount of resource that any candidate is allowed to use for a given iteration. Equivalently, this defines the amount of resources that are allocated for each candidate at the first iteration. 
By default, this is set to: - - ``n_splits * 2`` when ``budget_on='n_samples'`` for a regression + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem - - ``n_classes * n_splits * 2`` when ``budget_on='n_samples'`` for a + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a regression problem - The highest possible value satisfying the constraint - ``force_exhaust_budget=True``. - - ``1`` when ``budget_on!='n_samples'`` + ``force_exhaust_resources=True``. + - ``1`` when ``resource!='n_samples'`` Note that the amount of resources used at each iteration is always a - multiple of ``r_min``. + multiple of ``min_resources``. + + resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 'n_iterations' or 'n_estimators' for a gradient + boosting estimator. In this case ``max_resources`` cannot be 'auto'. + + ratio : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for the next iteration. For example, ``ratio=3`` + means that only one third of the candidates are selected. aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough budget to @@ -425,11 +425,11 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): ``False`` by default, which means that the last iteration may evaluate more than ``ratio`` candidates. - force_exhaust_budget : bool, default=False - If True, then ``r_min`` is set to a specific value such that the + force_exhaust_resources : bool, default=False + If True, then ``min_resources`` is set to a specific value such that the last iteration uses as much budget as possible. Namely, the last - iteration uses the highest value smaller than ``max_budget`` that is a - multiple of both ``r_min`` and ``ratio``. + iteration uses the highest value smaller than ``max_resources`` that is a + multiple of both ``min_resources`` and ``ratio``. Attributes ---------- @@ -441,14 +441,14 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): The number of candidate parameters that are left after the last iteration. - max_budget_ : int + max_resources_ : int The maximum number of resources that any candidate is allowed to use for a given iteration. Note that since the number of resources used at - each iteration must be a multiple of ``r_min_``, the actual number of + each iteration must be a multiple of ``min_resources_``, the actual number of resources used at the last iteartion may be smaller than - ``max_budget_``. + ``max_resources_``. - r_min_ : int + min_resources_ : int The amount of resources that are allocated for each candidate at the first iteration. @@ -459,12 +459,12 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): n_required_iterations_)``. n_possible_iterations_ : int - The number of iterations that are possible starting with ``r_min_`` - resources and without exceeding ``max_budget_``. + The number of iterations that are possible starting with ``min_resources_`` + resources and without exceeding ``max_resources_``. n_required_iterations_ : int The number of iterations that are required to end up with less than - ``ratio`` candidates at the last iteration, starting with ``r_min_`` + ``ratio`` candidates at the last iteration, starting with ``min_resources_`` resources. 
This will be smaller than ``n_possible_iterations_`` when there isn't enough budget. @@ -560,7 +560,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): See Also -------- - :class:`RandomHalvingSearchCV`: + :class:`HalvingRandomSearchCV`: Random search over a set of parameters using successive halving. """ _required_parameters = ["estimator", "param_grid"] @@ -569,18 +569,18 @@ def __init__(self, estimator, param_grid, scoring=None, n_jobs=None, refit=True, verbose=0, cv=5, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=True, - max_budget='auto', budget_on='n_samples', ratio=3, - r_min='auto', aggressive_elimination=False, - force_exhaust_budget=False): + max_resources='auto', min_resources='auto', + resource='n_samples', ratio=3, aggressive_elimination=False, + force_exhaust_resources=False): super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, pre_dispatch=pre_dispatch, random_state=random_state, error_score=error_score, return_train_score=return_train_score, - max_budget=max_budget, budget_on=budget_on, - ratio=ratio, r_min=r_min, + max_resources=max_resources, resource=resource, + ratio=ratio, min_resources=min_resources, aggressive_elimination=aggressive_elimination, - force_exhaust_budget=force_exhaust_budget) + force_exhaust_resources=force_exhaust_resources) self.param_grid = param_grid _check_param_grid(self.param_grid) @@ -613,7 +613,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): n_candidates: int, default='auto' The number of candidate parameters to sample. By default this will sample enough candidates so that the last iteration uses as many - resources as possible. Note that ``force_exhaust_budget`` has no + resources as possible. Note that ``force_exhaust_resources`` has no effect in this case. scoring : string, callable, or None, default=None @@ -685,39 +685,39 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): expensive and is not strictly required to select the parameters that yield the best generalization performance. - max_budget : int, default='auto' + max_resources : int, default='auto' The maximum number of resources that any candidate is allowed to use for a given iteration. By default, this is set ``n_samples`` when - ``budget_on='n_samples'`` (default), else an error is raised. + ``resource='n_samples'`` (default), else an error is raised. - budget_on : ``'n_samples'`` or str, default='n_samples' - Defines the nature of the budget. By default, the budget is the number - of samples. It can also be set to any parameter of the base estimator - that accepts positive integer values, e.g. 'n_iterations' or - 'n_estimators' for a gradient boosting estimator. In this case - ``max_budget`` cannot be 'auto'. - - ratio : int or float, default=3 - The 'halving' parameter, which determines the proportion of candidates - that are selected for the next iteration. For example, ``ratio=3`` - means that only one third of the candidates are selected. - - r_min : int, default='auto' + min_resources : int, default='auto' The minimum amount of resource that any candidate is allowed to use for a given iteration. Equivalently, this defines the amount of resources that are allocated for each candidate at the first iteration. 
By default, this is set to: - - ``n_splits * 2`` when ``budget_on='n_samples'`` for a regression + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem - - ``n_classes * n_splits * 2`` when ``budget_on='n_samples'`` for a + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a regression problem - The highest possible value satisfying the constraint - ``force_exhaust_budget=True``. - - ``1`` when ``budget_on!='n_samples'`` + ``force_exhaust_resources=True``. + - ``1`` when ``resource!='n_samples'`` Note that the amount of resources used at each iteration is always a - multiple of ``r_min``. + multiple of ``min_resources``. + + resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 'n_iterations' or 'n_estimators' for a gradient + boosting estimator. In this case ``max_resources`` cannot be 'auto'. + + ratio : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for the next iteration. For example, ``ratio=3`` + means that only one third of the candidates are selected. aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough budget to @@ -727,11 +727,11 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): ``False`` by default, which means that the last iteration may evaluate more than ``ratio`` candidates. - force_exhaust_budget : bool, default=False - If True, then ``r_min`` is set to a specific value such that the + force_exhaust_resources : bool, default=False + If True, then ``min_resources`` is set to a specific value such that the last iteration uses as much budget as possible. Namely, the last - iteration uses the highest value smaller than ``max_budget`` that is a - multiple of both ``r_min`` and ``ratio``. + iteration uses the highest value smaller than ``max_resources`` that is a + multiple of both ``min_resources`` and ``ratio``. Attributes ---------- @@ -743,14 +743,14 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): The number of candidate parameters that are left after the last iteration. - max_budget_ : int + max_resources_ : int The maximum number of resources that any candidate is allowed to use for a given iteration. Note that since the number of resources used at - each iteration must be a multiple of ``r_min_``, the actual number of + each iteration must be a multiple of ``min_resources_``, the actual number of resources used at the last iteartion may be smaller than - ``max_budget_``. + ``max_resources_``. - r_min_ : int + min_resources_ : int The amount of resources that are allocated for each candidate at the first iteration. @@ -761,12 +761,12 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): n_required_iterations_)``. n_possible_iterations_ : int - The number of iterations that are possible starting with ``r_min_`` - resources and without exceeding ``max_budget_``. + The number of iterations that are possible starting with ``min_resources_`` + resources and without exceeding ``max_resources_``. n_required_iterations_ : int The number of iterations that are required to end up with less than - ``ratio`` candidates at the last iteration, starting with ``r_min_`` + ``ratio`` candidates at the last iteration, starting with ``min_resources_`` resources. 
This will be smaller than ``n_possible_iterations_`` when there isn't enough budget. @@ -862,7 +862,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): See Also -------- - :class:`GridHalvingSearchCV`: + :class:`HalvingGridSearchCV`: Search over a grid of parameters using successive halving. """ _required_parameters = ["estimator", "param_distributions"] @@ -871,17 +871,17 @@ def __init__(self, estimator, param_distributions, n_candidates='auto', scoring=None, n_jobs=None, refit=True, verbose=0, cv=5, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, - return_train_score=True, max_budget='auto', - budget_on='n_samples', ratio=3, r_min='auto', - aggressive_elimination=False, force_exhaust_budget=False): + return_train_score=True, max_resources='auto', + min_resources='auto', resource='n_samples', ratio=3, + aggressive_elimination=False, force_exhaust_resources=False): super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, random_state=random_state, error_score=error_score, return_train_score=return_train_score, - max_budget=max_budget, budget_on=budget_on, - ratio=ratio, r_min=r_min, + max_resources=max_resources, resource=resource, + ratio=ratio, min_resources=min_resources, aggressive_elimination=aggressive_elimination, - force_exhaust_budget=force_exhaust_budget) + force_exhaust_resources=force_exhaust_resources) self.param_distributions = param_distributions self.n_candidates = n_candidates @@ -890,6 +890,6 @@ def _generate_candidate_params(self): if n_candidates_ == 'auto': # This will generate enough candidate so that the last iteration # uses as much budget as possible - n_candidates_ = self.max_budget_ // self.r_min_ + n_candidates_ = self.max_resources_ // self.min_resources_ return ParameterSampler(self.param_distributions, n_candidates_, self.random_state) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 559f62981497c..af31640f80d57 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -36,13 +36,13 @@ def test_aggressive_elimination(): # aggressive_elimination is only really relevant when there is not enough # budget. 
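# Illustrative sketch (not part of this patch): why max_resources=180 is "not
# enough" in this test. With cv=5 on a binary make_classification problem,
# min_resources_ defaults to n_splits * 2 * n_classes = 20 (an assumption,
# but consistent with the [20, 60, 180] schedule asserted below), and ratio=3:
from math import floor, log

min_resources_, max_resources_, ratio = 20, 180, 3
n_possible_iterations = 1 + floor(log(max_resources_ // min_resources_, ratio))  # 3
# The grid needs 4 iterations to get below `ratio` candidates (the test asserts
# n_required_iterations_ == 4), so without aggressive_elimination the search
# stops after 3 iterations; with aggressive_elimination=True the first
# iteration is replayed so that n_iterations_ == n_required_iterations_ == 4.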
- max_budget = 180 + max_resources = 180 # aggressive_elimination=True - # In this case, the first iterations only use r_min_ resources + # In this case, the first iterations only use min_resources_ resources sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=True, - max_budget=max_budget, ratio=ratio) + max_resources=max_resources, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 4 assert sh.n_required_iterations_ == 4 @@ -54,7 +54,7 @@ def test_aggressive_elimination(): sh = HalvingRandomSearchCV(base_estimator, parameters, n_candidates=60, cv=5, aggressive_elimination=True, - max_budget=max_budget, ratio=ratio) + max_resources=max_resources, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 4 assert sh.n_required_iterations_ == 4 @@ -67,7 +67,7 @@ def test_aggressive_elimination(): # candidates at the last iteration sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=False, - max_budget=max_budget, ratio=ratio) + max_resources=max_resources, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 3 @@ -76,14 +76,14 @@ def test_aggressive_elimination(): assert sh._r_i_list == [20, 60, 180] assert sh.n_remaining_candidates_ == 3 - max_budget = n_samples + max_resources = n_samples # with enough budget, aggressive_elimination has no effect since it is not # needed # aggressive_elimination=True sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=True, - max_budget=max_budget, ratio=ratio) + max_resources=max_resources, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 4 @@ -95,7 +95,7 @@ def test_aggressive_elimination(): # aggressive_elimination=False sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, aggressive_elimination=False, - max_budget=max_budget, ratio=ratio) + max_resources=max_resources, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 4 @@ -105,10 +105,10 @@ def test_aggressive_elimination(): assert sh.n_remaining_candidates_ == 1 -def test_force_exhaust_budget_false(): - # Test the force_exhaust_budget parameter when it's false or ignored. +def test_force_exhaust_resources_false(): + # Test the force_exhaust_resources parameter when it's false or ignored. 
# This is the default case: we start at the beginning no matter what since - # we do not overwrite r_min_ + # we do not overwrite min_resources_ n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) @@ -118,17 +118,17 @@ def test_force_exhaust_budget_false(): # with enough budget sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - force_exhaust_budget=False, ratio=ratio) + force_exhaust_resources=False, ratio=ratio) sh.fit(X, y) assert sh.n_iterations_ == 2 assert sh.n_required_iterations_ == 2 assert sh.n_possible_iterations_ == 4 assert sh._r_i_list == [20, 60] - # with enough budget but r_min!='auto': ignored + # with enough budget but min_resources!='auto': ignored sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - force_exhaust_budget=False, ratio=ratio, - r_min=50) + force_exhaust_resources=False, ratio=ratio, + min_resources=50) sh.fit(X, y) assert sh.n_iterations_ == 2 assert sh.n_required_iterations_ == 2 @@ -137,8 +137,8 @@ def test_force_exhaust_budget_false(): # without enough budget (budget is exhausted anyway) sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - force_exhaust_budget=False, ratio=ratio, - max_budget=30) + force_exhaust_resources=False, ratio=ratio, + max_resources=30) sh.fit(X, y) assert sh.n_iterations_ == 1 assert sh.n_required_iterations_ == 2 @@ -146,7 +146,7 @@ def test_force_exhaust_budget_false(): assert sh._r_i_list == [20] -@pytest.mark.parametrize('max_budget, r_i_list', [ +@pytest.mark.parametrize('max_resources, r_i_list', [ ('auto', [333, 999]), (1000, [333, 999]), (999, [333, 999]), @@ -157,9 +157,9 @@ def test_force_exhaust_budget_false(): (50, [20]), (20, [20]), ]) -def test_force_exhaust_budget_true(max_budget, r_i_list): - # Test the force_exhaust_budget parameter when it's true - # in this case we need to change r_min so that the last iteration uses as +def test_force_exhaust_budget_true(max_resources, r_i_list): + # Test the force_exhaust_resources parameter when it's true + # in this case we need to change min_resources so that the last iteration uses as # much budget as possible n_samples = 1000 @@ -168,8 +168,8 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): base_estimator = FastClassifier() ratio = 3 sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - force_exhaust_budget=True, ratio=ratio, - max_budget=max_budget) + force_exhaust_resources=True, ratio=ratio, + max_resources=max_resources) sh.fit(X, y) assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list) @@ -177,8 +177,8 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): # Test same for randomized search sh = HalvingRandomSearchCV(base_estimator, parameters, n_candidates=6, - cv=5, force_exhaust_budget=True, - ratio=ratio, max_budget=max_budget) + cv=5, force_exhaust_resources=True, + ratio=ratio, max_resources=max_resources) sh.fit(X, y) assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list) @@ -186,7 +186,7 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): @pytest.mark.parametrize( - 'max_budget, n_iterations, n_possible_iterations', [ + 'max_resources, n_iterations, n_possible_iterations', [ ('auto', 5, 9), # whole budget is used (1024, 5, 9), (700, 5, 8), @@ -195,11 +195,11 @@ def test_force_exhaust_budget_true(max_budget, r_i_list): (32, 4, 4), (31, 3, 3), (16, 3, 3), - (4, 1, 1), # max_budget == r_min, only one iteration is possible + (4, 1, 1), # max_resources == min_resources, only one iteration is possible ]) -def test_n_iterations(max_budget, 
n_iterations, n_possible_iterations): +def test_n_iterations(max_resources, n_iterations, n_possible_iterations): # test the number of actual iterations that were run depending on - # max_budget + # max_resources n_samples = 1024 X, y = make_classification(n_samples=n_samples, random_state=1) @@ -208,22 +208,22 @@ def test_n_iterations(max_budget, n_iterations, n_possible_iterations): ratio = 2 sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, ratio=ratio, - max_budget=max_budget, r_min=4) + max_resources=max_resources, min_resources=4) sh.fit(X, y) assert sh.n_required_iterations_ == 5 assert sh.n_iterations_ == n_iterations assert sh.n_possible_iterations_ == n_possible_iterations -def test_budget_on(): - # Test the budget_on parameter +def test_resource_parameter(): + # Test the resource parameter n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, - budget_on='c', max_budget=10, ratio=3) + resource='c', max_resources=10, ratio=3) sh.fit(X, y) assert set(sh._r_i_list) == set([1, 3, 9]) for r_i, params, param_c in zip(sh.cv_results_['r_i'], @@ -233,9 +233,9 @@ def test_budget_on(): with pytest.raises( ValueError, - match='Cannot budget on parameter 1234 which is not supported '): + match='Cannot use resource=1234 which is not supported '): sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, - budget_on='1234', max_budget=10) + resource='1234', max_resources=10) sh.fit(X, y) with pytest.raises( @@ -244,19 +244,19 @@ def test_budget_on(): 'searched parameters.'): parameters = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]} sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, - budget_on='c', max_budget=10) + resource='c', max_resources=10) sh.fit(X, y) @pytest.mark.parametrize( - 'max_budget, n_candidates, expected_n_candidates_', [ + 'max_resources, n_candidates, expected_n_candidates_', [ (512, 'auto', 128), # generate exactly as much as needed (32, 'auto', 8), (32, 8, 8), (32, 7, 7), # ask for less than what we could (32, 9, 9), # ask for more than 'reasonable' ]) -def test_random_search(max_budget, n_candidates, expected_n_candidates_): +def test_random_search(max_resources, n_candidates, expected_n_candidates_): # Test random search and make sure the number of generated candidates is as # expected @@ -267,13 +267,13 @@ def test_random_search(max_budget, n_candidates, expected_n_candidates_): sh = HalvingRandomSearchCV(base_estimator, parameters, n_candidates=n_candidates, cv=2, - max_budget=max_budget, ratio=2, r_min=4) + max_resources=max_resources, ratio=2, min_resources=4) sh.fit(X, y) assert sh.n_candidates_ == expected_n_candidates_ if n_candidates == 'auto': # Make sure 'auto' makes the last iteration use as much budget as we # can - assert sh._r_i_list[-1] == max_budget + assert sh._r_i_list[-1] == max_resources def test_groups_not_supported(): From a91b119d727c9d9c06dbf07768fbdfb6620c1a49 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 31 Jul 2019 17:10:48 -0400 Subject: [PATCH 23/89] pep8 --- .../_search_successive_halving.py | 103 ++++++++++-------- .../tests/test_successive_halving.py | 13 ++- 2 files changed, 62 insertions(+), 54 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index d3fd67a560629..6bcdeb20500ca 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ 
b/sklearn/model_selection/_search_successive_halving.py @@ -75,7 +75,8 @@ def _check_input_parameters(self, X, y, groups): 'by estimator {}'.format(self.resource, self.estimator.__class__.__name__)) - if isinstance(self.max_resources, str) and self.max_resources != 'auto': + if (isinstance(self.max_resources, str) and + self.max_resources != 'auto'): raise ValueError( "max_resources must be either 'auto' or a positive number" ) @@ -84,20 +85,22 @@ def _check_input_parameters(self, X, y, groups): "max_resources must be either 'auto' or a positive number" ) - if isinstance(self.min_resources, str) and self.min_resources != 'auto': + if (isinstance(self.min_resources, str) and + self.min_resources != 'auto'): raise ValueError( - "min_resources must be either 'auto' or a positive number no greater " - "than max_resources." + "min_resources must be either 'auto' or a positive number " + "no greater than max_resources." ) if self.min_resources != 'auto' and self.min_resources <= 0: raise ValueError( - "min_resources must be either 'auto' or a positive number no greater " - "than max_resources." + "min_resources must be either 'auto' or a positive number " + "no greater than max_resources." ) if self.force_exhaust_resources and self.min_resources != 'auto': raise ValueError( - 'min_resources must be set to auto if force_exhaust_resources is True.' + 'min_resources must be set to auto if force_exhaust_resources' + ' is True.' ) self.min_resources_ = self.min_resources @@ -181,21 +184,23 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): n_required_iterations = 1 + floor(log(self.n_candidates_, self.ratio)) if self.force_exhaust_resources: - # To exhaust the budget, we want to start with the biggest min_resources - # possible so that the last (required) iteration uses as many - # resources as possible - # We only force exhausting the budget if min_resources wasn't specified by - # the user. + # To exhaust the budget, we want to start with the biggest + # min_resources possible so that the last (required) iteration + # uses as many resources as possible + # We only force exhausting the budget if min_resources wasn't + # specified by the user. last_iteration = n_required_iterations - 1 - self.min_resources_ = max(self.min_resources_, - self.max_resources_ // self.ratio**last_iteration) + self.min_resources_ = max( + self.min_resources_, + self.max_resources_ // self.ratio**last_iteration + ) # n_possible_iterations is the number of iterations that we can - # actually do starting from min_resources and without exceeding the budget. - # Depending on budget size the number of candidates, this may be higher - # or smaller than n_required_iterations. - n_possible_iterations = 1 + floor(log(self.max_resources_ // self.min_resources_, - self.ratio)) + # actually do starting from min_resources and without exceeding the + # budget. Depending on budget size the number of candidates, this may + # be higher or smaller than n_required_iterations. 
+ n_possible_iterations = 1 + floor(log( + self.max_resources_ // self.min_resources_, self.ratio)) if self.aggressive_elimination: n_iterations = n_required_iterations @@ -210,7 +215,8 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): print('max_resources_: {}'.format(self.max_resources_)) print('aggressive_elimination: {}'.format( self.aggressive_elimination)) - print('force_exhaust_resources: {}'.format(self.force_exhaust_resources)) + print('force_exhaust_resources: {}'.format( + self.force_exhaust_resources)) print('ratio: {}'.format(self.ratio)) self._r_i_list = [] # list of r_i for each iteration, used in tests @@ -219,8 +225,8 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): power = iter_i # default if self.aggressive_elimination: - # this will set r_i to the initial value (i.e. the value of r_i - # at the first iteration) for as many iterations as needed + # this will set r_i to the initial value (i.e. the value of + # r_i at the first iteration) for as many iterations as needed # (while candidates are being eliminated), and then go on as # usual. power = max( @@ -426,10 +432,10 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): more than ``ratio`` candidates. force_exhaust_resources : bool, default=False - If True, then ``min_resources`` is set to a specific value such that the - last iteration uses as much budget as possible. Namely, the last - iteration uses the highest value smaller than ``max_resources`` that is a - multiple of both ``min_resources`` and ``ratio``. + If True, then ``min_resources`` is set to a specific value such that + the last iteration uses as much budget as possible. Namely, the last + iteration uses the highest value smaller than ``max_resources`` that + is a multiple of both ``min_resources`` and ``ratio``. Attributes ---------- @@ -443,10 +449,10 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): max_resources_ : int The maximum number of resources that any candidate is allowed to use - for a given iteration. Note that since the number of resources used at - each iteration must be a multiple of ``min_resources_``, the actual number of - resources used at the last iteartion may be smaller than - ``max_resources_``. + for a given iteration. Note that since the number of resources used + at each iteration must be a multiple of ``min_resources_``, the + actual number of resources used at the last iteartion may be smaller + than ``max_resources_``. min_resources_ : int The amount of resources that are allocated for each candidate at the @@ -459,14 +465,15 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): n_required_iterations_)``. n_possible_iterations_ : int - The number of iterations that are possible starting with ``min_resources_`` - resources and without exceeding ``max_resources_``. + The number of iterations that are possible starting with + ``min_resources_`` resources and without exceeding + ``max_resources_``. n_required_iterations_ : int The number of iterations that are required to end up with less than - ``ratio`` candidates at the last iteration, starting with ``min_resources_`` - resources. This will be smaller than ``n_possible_iterations_`` when - there isn't enough budget. + ``ratio`` candidates at the last iteration, starting with + ``min_resources_`` resources. This will be smaller than + ``n_possible_iterations_`` when there isn't enough budget. 
cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be @@ -728,11 +735,10 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): more than ``ratio`` candidates. force_exhaust_resources : bool, default=False - If True, then ``min_resources`` is set to a specific value such that the - last iteration uses as much budget as possible. Namely, the last - iteration uses the highest value smaller than ``max_resources`` that is a - multiple of both ``min_resources`` and ``ratio``. - + If True, then ``min_resources`` is set to a specific value such that + the last iteration uses as much budget as possible. Namely, the last + iteration uses the highest value smaller than ``max_resources`` that + is a multiple of both ``min_resources`` and ``ratio``. Attributes ---------- n_candidates_ : int @@ -746,8 +752,8 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): max_resources_ : int The maximum number of resources that any candidate is allowed to use for a given iteration. Note that since the number of resources used at - each iteration must be a multiple of ``min_resources_``, the actual number of - resources used at the last iteartion may be smaller than + each iteration must be a multiple of ``min_resources_``, the actual + number of resources used at the last iteartion may be smaller than ``max_resources_``. min_resources_ : int @@ -761,14 +767,15 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): n_required_iterations_)``. n_possible_iterations_ : int - The number of iterations that are possible starting with ``min_resources_`` - resources and without exceeding ``max_resources_``. + The number of iterations that are possible starting with + ``min_resources_`` resources and without exceeding + ``max_resources_``. n_required_iterations_ : int The number of iterations that are required to end up with less than - ``ratio`` candidates at the last iteration, starting with ``min_resources_`` - resources. This will be smaller than ``n_possible_iterations_`` when - there isn't enough budget. + ``ratio`` candidates at the last iteration, starting with + ``min_resources_`` resources. This will be smaller than + ``n_possible_iterations_`` when there isn't enough budget. 
cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be @@ -872,7 +879,7 @@ def __init__(self, estimator, param_distributions, verbose=0, cv=5, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=True, max_resources='auto', - min_resources='auto', resource='n_samples', ratio=3, + min_resources='auto', resource='n_samples', ratio=3, aggressive_elimination=False, force_exhaust_resources=False): super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index af31640f80d57..0402508c045db 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -159,8 +159,8 @@ def test_force_exhaust_resources_false(): ]) def test_force_exhaust_budget_true(max_resources, r_i_list): # Test the force_exhaust_resources parameter when it's true - # in this case we need to change min_resources so that the last iteration uses as - # much budget as possible + # in this case we need to change min_resources so that the last iteration + # uses as much budget as possible n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) @@ -195,7 +195,8 @@ def test_force_exhaust_budget_true(max_resources, r_i_list): (32, 4, 4), (31, 3, 3), (16, 3, 3), - (4, 1, 1), # max_resources == min_resources, only one iteration is possible + (4, 1, 1), # max_resources == min_resources, only one iteration is + # possible ]) def test_n_iterations(max_resources, n_iterations, n_possible_iterations): # test the number of actual iterations that were run depending on @@ -265,9 +266,9 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates_): parameters = {'a': norm, 'b': norm} base_estimator = FastClassifier() sh = HalvingRandomSearchCV(base_estimator, parameters, - n_candidates=n_candidates, - cv=2, - max_resources=max_resources, ratio=2, min_resources=4) + n_candidates=n_candidates, cv=2, + max_resources=max_resources, ratio=2, + min_resources=4) sh.fit(X, y) assert sh.n_candidates_ == expected_n_candidates_ if n_candidates == 'auto': From 935525b0930017b1dcf1ec7624afe33b84f5499e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 31 Jul 2019 17:13:49 -0400 Subject: [PATCH 24/89] r_i -> resource_iter --- doc/modules/grid_search.rst | 36 +++++++++---------- .../_search_successive_halving.py | 30 +++++++++------- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index e04504c97ac06..1727607d097a6 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -237,11 +237,11 @@ classification problems. Depending on the setting, the default valueof Amount of resource and number of candidates at each iteration ------------------------------------------------------------- -The amount of resources ``r_i`` (e.g. the number of samples) allocated for +The amount of resources ``resource_iter`` (e.g. 
the number of samples) allocated for each candidate at iteration ``i`` is controlled by the parameters ``ratio`` and ``min_resources`` as follows:: - r_i = ratio**i * min_resources + resource_iter = ratio**i * min_resources ``min_resources`` is the amount of resources used at the first iteration and ``ratio`` defines the proportions of candidates that will be selected for @@ -260,7 +260,7 @@ Here is an example with ``min_resources=3`` and ``ratio=2``, starting with 70 candidates: +-------------+-----------------------+ -| ``r_i`` | ``n_candidates_at_i`` | +| ``resource_iter`` | ``n_candidates_at_i`` | +=============+=======================+ | 3 (=min_resources) | 70 (=n_candidates) | +-------------+-----------------------+ @@ -276,12 +276,12 @@ candidates: +-------------+-----------------------+ Ideally, at the last iteration, ``ratio`` candidates are evaluated, and we -can pick the best one. Note that each ``r_i`` is a multiple of both +can pick the best one. Note that each ``resource_iter`` is a multiple of both ``ratio`` and ``min_resources``. The amount of resource that is used at each iteration can be found using the `cv_results_` after converting it to a dataframe: -`results.groupby('iter')['r_i'].unique()` +`results.groupby('iter')['resource_iter'].unique()` Choosing a resource to budget ----------------------------- @@ -330,12 +330,12 @@ used):: >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2).fit(X, y) >>> results = pd.DataFrame(sh.cv_results_) - >>> results.groupby('iter')['r_i'].unique() + >>> results.groupby('iter')['resource_iter'].unique() iter 0 [20] 1 [40] 2 [80] - Name: r_i, dtype: object + Name: resource_iter, dtype: object The search process will only use 80 resources at most, while our maximum amount of available resources is ``n_samples=1000``. Note in this case that @@ -347,12 +347,12 @@ parameter.:: ... ratio=2, force_exhaust_resources=True, ... ).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) - >>> results.groupby('iter')['r_i'].unique() + >>> results.groupby('iter')['resource_iter'].unique() iter 0 [250] 1 [500] 2 [1000] - Name: r_i, dtype: object + Name: resource_iter, dtype: object `min_resources` was here automatically set to 250, which results in the last iteration using all the resources. Since ``force_exhaust_resources`` chooses an @@ -381,16 +381,16 @@ to evaluate more than ``ratio`` candidates:: ... ratio=2, max_resources=40, ... aggressive_elimination=False).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) - >>> results.groupby('iter').r_i.unique() + >>> results.groupby('iter').resource_iter.unique() iter 0 [20] 1 [40] - Name: r_i, dtype: object - >>> results.groupby('iter').r_i.count() # number of candidates used at each iteration + Name: resource_iter, dtype: object + >>> results.groupby('iter').resource_iter.count() # number of candidates used at each iteration iter 0 6 1 3 - Name: r_i, dtype: int64 + Name: resource_iter, dtype: int64 Since we cannot use more than ``max_resources=40`` resources, the process has to stop at the second iteration which evaluates more than ``ratio=2`` candidates. @@ -406,21 +406,21 @@ necessary using ``min_resources`` resources:: ... aggressive_elimination=True, ... 
).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) - >>> results.groupby('iter').r_i.unique() + >>> results.groupby('iter').resource_iter.unique() iter 0 [20] 1 [20] 2 [40] - Name: r_i, dtype: object - >>> results.groupby('iter').r_i.count() # number of candidates used at each iteration + Name: resource_iter, dtype: object + >>> results.groupby('iter').resource_iter.count() # number of candidates used at each iteration iter 0 6 1 3 2 2 - Name: r_i, dtype: int64 + Name: resource_iter, dtype: int64 Notice that we end with 2 candidates at the last iteration since we have -eliminated enough candidates during the first iterations, using ``r_i = +eliminated enough candidates during the first iterations, using ``resource_iter = min_resources = 20``. diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 6bcdeb20500ca..4b4b71545b26e 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -219,24 +219,26 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): self.force_exhaust_resources)) print('ratio: {}'.format(self.ratio)) - self._r_i_list = [] # list of r_i for each iteration, used in tests + # list of resource_iter for each iteration, used in tests + self._r_i_list = [] for iter_i in range(n_iterations): power = iter_i # default if self.aggressive_elimination: - # this will set r_i to the initial value (i.e. the value of - # r_i at the first iteration) for as many iterations as needed - # (while candidates are being eliminated), and then go on as - # usual. + # this will set resource_iter to the initial value (i.e. the + # value of resource_iter at the first iteration) for as many + # iterations as needed (while candidates are being + # eliminated), and then go on as usual. 
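# Illustrative sketch (not part of this patch): the clamped exponent described
# in the comment above, replayed standalone for the aggressive_elimination
# example from the user guide (min_resources_=20, max_resources_=40, ratio=2,
# so n_possible_iterations=2 while n_required_iterations=3):
ratio, min_resources_, max_resources_ = 2, 20, 40
n_required_iterations, n_possible_iterations = 3, 2
for iter_i in range(n_required_iterations):
    power = max(0, iter_i - n_required_iterations + n_possible_iterations)
    resource_iter = min(int(ratio ** power * min_resources_), max_resources_)
    print(iter_i, resource_iter)  # 0 20 / 1 20 / 2 40, matching the doctest above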
power = max( 0, iter_i - n_required_iterations + n_possible_iterations ) - r_i = int(self.ratio**power * self.min_resources_) - r_i = min(r_i, self.max_resources_) # guard, probably not needed - self._r_i_list.append(r_i) + resource_iter = int(self.ratio**power * self.min_resources_) + # guard, probably not needed + resource_iter = min(resource_iter, self.max_resources_) + self._r_i_list.append(resource_iter) n_candidates = len(candidate_params) @@ -244,7 +246,7 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): print('-' * 10) print('iter_i: {}'.format(iter_i)) print('n_candidates: {}'.format(n_candidates)) - print('r_i: {}'.format(r_i)) + print('resource_iter: {}'.format(resource_iter)) if self.resource == 'n_samples': # Subsample X and y as well as fit_params @@ -252,21 +254,23 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): fit_params = OrderedDict(fit_params) X_iter, y_iter, *fit_params_iter_list = resample( X, y, *fit_params.values(), replace=False, - random_state=rng, stratify=stratify, n_samples=r_i) + random_state=rng, stratify=stratify, + n_samples=resource_iter) fit_params_iter = { key: fit_params_iter_list[i] for (i, key) in enumerate(fit_params.keys()) } else: - # Need copy so that r_i of next iteration does not overwrite + # Need copy so that the resource_iter of next iteration does + # not overwrite candidate_params = [c.copy() for c in candidate_params] for candidate in candidate_params: - candidate[self.resource] = r_i + candidate[self.resource] = resource_iter X_iter, y_iter = X, y fit_params_iter = fit_params more_results = {'iter': [iter_i] * n_candidates, - 'r_i': [r_i] * n_candidates} + 'resource_iter': [resource_iter] * n_candidates} results = evaluate_candidates(candidate_params, X_iter, y_iter, more_results=more_results, **fit_params_iter) From 9ad17c605b724967efefc24336e70adf00425e7d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 1 Aug 2019 09:12:42 -0400 Subject: [PATCH 25/89] fixed r_i issues --- examples/model_selection/plot_successive_halving_iterations.py | 2 +- sklearn/model_selection/tests/test_successive_halving.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 4e0fef699eeef..2a3ee03c19185 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -41,7 +41,7 @@ values='mean_test_score') ax = mean_scores.plot(legend=False, alpha=.6) -r_i_list = results.groupby('iter').r_i.unique() +r_i_list = results.groupby('iter')['resource_iter'].unique() labels = ['{}\nn_samples={}'.format(i, r_i_list[i]) for i in range(rsh.n_iterations_)] ax.set_xticklabels(labels) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 0402508c045db..dc64d3578b573 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -227,7 +227,7 @@ def test_resource_parameter(): resource='c', max_resources=10, ratio=3) sh.fit(X, y) assert set(sh._r_i_list) == set([1, 3, 9]) - for r_i, params, param_c in zip(sh.cv_results_['r_i'], + for r_i, params, param_c in zip(sh.cv_results_['resource_iter'], sh.cv_results_['params'], sh.cv_results_['param_c']): assert r_i == params['c'] == param_c From 64bcc93c7daf833af47340525834b28c289a93a3 Mon Sep 17 00:00:00 2001 
From: Nicolas Hug Date: Fri, 23 Aug 2019 17:40:33 -0400 Subject: [PATCH 26/89] examples + removed use of word budget --- .../_search_successive_halving.py | 74 +++++++++++++++---- .../tests/test_successive_halving.py | 26 +++---- 2 files changed, 71 insertions(+), 29 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index cadbbee73409b..bfc58e9256845 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -175,7 +175,7 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): self.resource in candidate for candidate in candidate_params): # Can only check this now since we need the candidates list raise ValueError( - "Cannot budget on parameter {} since it is part of " + "Cannot use parameter {} as the resource since it is part of " "the searched parameters.".format(self.resource)) # n_required_iterations is the number of iterations needed so that the @@ -184,10 +184,10 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): self.ratio)) if self.force_exhaust_resources: - # To exhaust the budget, we want to start with the biggest + # To exhaust the resources, we want to start with the biggest # min_resources possible so that the last (required) iteration # uses as many resources as possible - # We only force exhausting the budget if min_resources wasn't + # We only force exhausting the resources if min_resources wasn't # specified by the user. last_iteration = n_required_iterations - 1 self.min_resources_ = max( @@ -197,8 +197,9 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): # n_possible_iterations is the number of iterations that we can # actually do starting from min_resources and without exceeding the - # budget. Depending on budget size the number of candidates, this may - # be higher or smaller than n_required_iterations. + # max_resources. Depending on max_resources and the number of + # candidates, this may be higher or smaller than + # n_required_iterations. n_possible_iterations = 1 + floor(log( self.max_resources_ // self.min_resources_, self.ratio)) @@ -430,7 +431,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): means that only one third of the candidates are selected. aggressive_elimination : bool, default=False - This is only relevant in cases where there isn't enough budget to + This is only relevant in cases where there isn't enough resources to eliminate enough candidates at the last iteration. If ``True``, then the search process will 'replay' the first iteration for as long as needed until the number of candidates is small enough. This is @@ -439,9 +440,9 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): force_exhaust_resources : bool, default=False If True, then ``min_resources`` is set to a specific value such that - the last iteration uses as much budget as possible. Namely, the last - iteration uses the highest value smaller than ``max_resources`` that - is a multiple of both ``min_resources`` and ``ratio``. + the last iteration uses as much resources as possible. Namely, the + last iteration uses the highest value smaller than ``max_resources`` + that is a multiple of both ``min_resources`` and ``ratio``. Attributes ---------- @@ -479,7 +480,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): The number of iterations that are required to end up with less than ``ratio`` candidates at the last iteration, starting with ``min_resources_`` resources. 
This will be smaller than - ``n_possible_iterations_`` when there isn't enough budget. + ``n_possible_iterations_`` when there isn't enough resources. cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be @@ -558,6 +559,25 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): This is present only if ``refit`` is not False. + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.model_selection import HalvingGridSearchCV + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + ... + >>> param_grid = {"max_depth": [3, None], + ... "min_samples_split": [5, 10]} + >>> search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators', + ... max_resources=10, + ... force_exhaust_resources=True, + ... random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + Notes ----- The parameters selected are those that maximize the score of the held-out @@ -733,7 +753,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): means that only one third of the candidates are selected. aggressive_elimination : bool, default=False - This is only relevant in cases where there isn't enough budget to + This is only relevant in cases where there isn't enough resources to eliminate enough candidates at the last iteration. If ``True``, then the search process will 'replay' the first iteration for as long as needed until the number of candidates is small enough. This is @@ -742,9 +762,9 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): force_exhaust_resources : bool, default=False If True, then ``min_resources`` is set to a specific value such that - the last iteration uses as much budget as possible. Namely, the last - iteration uses the highest value smaller than ``max_resources`` that - is a multiple of both ``min_resources`` and ``ratio``. + the last iteration uses as much resousrces as possible. Namely, the + last iteration uses the highest value smaller than ``max_resources`` + that is a multiple of both ``min_resources`` and ``ratio``. Attributes ---------- n_candidates_ : list of int @@ -781,7 +801,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): The number of iterations that are required to end up with less than ``ratio`` candidates at the last iteration, starting with ``min_resources_`` resources. This will be smaller than - ``n_possible_iterations_`` when there isn't enough budget. + ``n_possible_iterations_`` when there isn't enough resources. cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be @@ -860,6 +880,28 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): This is present only if ``refit`` is not False. + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.model_selection import HalvingRandomSearchCV + >>> from scipy.stats import randint + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + >>> np.random.seed(0) + ... + >>> param_distributions = {"max_depth": [3, None], + ... "min_samples_split": randint(2, 11)} + >>> search = HalvingRandomSearchCV(clf, param_distributions, + ... resource='n_estimators', + ... max_resources=10, + ... force_exhaust_resources=True, + ... 
random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + Notes ----- The parameters selected are those that maximize the score of the held-out @@ -902,7 +944,7 @@ def _generate_candidate_params(self): n_candidates_first_iter = self.n_candidates if n_candidates_first_iter == 'auto': # This will generate enough candidate so that the last iteration - # uses as much budget as possible + # uses as much resources as possible n_candidates_first_iter = ( self.max_resources_ // self.min_resources_) return ParameterSampler(self.param_distributions, diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index a35fa3f023efb..298db7b990935 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -35,7 +35,7 @@ def test_aggressive_elimination(): ratio = 3 # aggressive_elimination is only really relevant when there is not enough - # budget. + # resources. max_resources = 180 # aggressive_elimination=True @@ -77,8 +77,8 @@ def test_aggressive_elimination(): assert sh.n_remaining_candidates_ == 3 max_resources = n_samples - # with enough budget, aggressive_elimination has no effect since it is not - # needed + # with enough resources, aggressive_elimination has no effect since it is + # not needed # aggressive_elimination=True sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, @@ -116,7 +116,7 @@ def test_force_exhaust_resources_false(): base_estimator = FastClassifier() ratio = 3 - # with enough budget + # with enough resources sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, force_exhaust_resources=False, ratio=ratio) sh.fit(X, y) @@ -125,7 +125,7 @@ def test_force_exhaust_resources_false(): assert sh.n_possible_iterations_ == 4 assert sh._r_i_list == [20, 60] - # with enough budget but min_resources!='auto': ignored + # with enough resources but min_resources!='auto': ignored sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, force_exhaust_resources=False, ratio=ratio, min_resources=50) @@ -135,7 +135,7 @@ def test_force_exhaust_resources_false(): assert sh.n_possible_iterations_ == 3 assert sh._r_i_list == [50, 150] - # without enough budget (budget is exhausted anyway) + # without enough resources (resources are exhausted anyway) sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, force_exhaust_resources=False, ratio=ratio, max_resources=30) @@ -157,10 +157,10 @@ def test_force_exhaust_resources_false(): (50, [20]), (20, [20]), ]) -def test_force_exhaust_budget_true(max_resources, r_i_list): +def test_force_exhaust_resources_true(max_resources, r_i_list): # Test the force_exhaust_resources parameter when it's true # in this case we need to change min_resources so that the last iteration - # uses as much budget as possible + # uses as much resources as possible n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) @@ -187,7 +187,7 @@ def test_force_exhaust_budget_true(max_resources, r_i_list): @pytest.mark.parametrize( 'max_resources, n_iterations, n_possible_iterations', [ - ('auto', 5, 9), # whole budget is used + ('auto', 5, 9), # all resources are used (1024, 5, 9), (700, 5, 8), (512, 5, 8), @@ -241,8 +241,8 @@ def test_resource_parameter(): with pytest.raises( ValueError, - match='Cannot budget on parameter c since it is part of the ' - 'searched parameters.'): + match='Cannot use parameter c as the resource 
since it is part ' + 'of the searched parameters.'): parameters = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]} sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, resource='c', max_resources=10) @@ -272,8 +272,8 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates_): sh.fit(X, y) assert sh.n_candidates_[0] == expected_n_candidates_ if n_candidates == 'auto': - # Make sure 'auto' makes the last iteration use as much budget as we - # can + # Make sure 'auto' makes the last iteration use as much resources as + # we can assert sh._r_i_list[-1] == max_resources From a4388909fbd5df6f1f177defc4c989be377d5902 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 23 Aug 2019 18:25:41 -0400 Subject: [PATCH 27/89] Added inpute checking tests --- .../_search_successive_halving.py | 20 ++++++---- .../tests/test_successive_halving.py | 38 +++++++++++++++++++ 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index bfc58e9256845..0c0628cdad4a4 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -1,6 +1,7 @@ from math import ceil, floor, log from abc import abstractmethod from collections import OrderedDict +from numbers import Integral import numpy as np from ._search import _check_param_grid @@ -66,7 +67,8 @@ def _check_input_parameters(self, X, y, groups): if self.scoring is not None and not (isinstance(self.scoring, str) or callable(self.scoring)): raise ValueError('scoring parameter must be a string, ' - 'a callable or None.') + 'a callable or None. Multimetric scoring is not ' + 'supported.') if (self.resource != 'n_samples' and self.resource not in self.estimator.get_params()): @@ -78,22 +80,26 @@ def _check_input_parameters(self, X, y, groups): if (isinstance(self.max_resources, str) and self.max_resources != 'auto'): raise ValueError( - "max_resources must be either 'auto' or a positive number" + "max_resources must be either 'auto' or a positive integer" ) - if self.max_resources != 'auto' and self.max_resources <= 0: + if self.max_resources != 'auto' and ( + not isinstance(self.max_resources, Integral) or + self.max_resources <= 0): raise ValueError( - "max_resources must be either 'auto' or a positive number" + "max_resources must be either 'auto' or a positive integer" ) if (isinstance(self.min_resources, str) and self.min_resources != 'auto'): raise ValueError( - "min_resources must be either 'auto' or a positive number " + "min_resources must be either 'auto' or a positive integer " "no greater than max_resources." ) - if self.min_resources != 'auto' and self.min_resources <= 0: + if self.min_resources != 'auto' and ( + not isinstance(self.min_resources, Integral) or + self.min_resources <= 0): raise ValueError( - "min_resources must be either 'auto' or a positive number " + "min_resources must be either 'auto' or a positive integer " "no greater than max_resources." 
) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 298db7b990935..72a5301c0d015 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -286,3 +286,41 @@ def test_groups_not_supported(): with pytest.raises(ValueError, match="groups are not supported"): sh.fit(X, y, groups) + + +@pytest.mark.parametrize('klass', (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize('params, expected_error_message', [ + ({'scoring': {'accuracy', 'accuracy'}}, + 'Multimetric scoring is not supp'), + ({'resource': 'not_a_parameter'}, + 'Cannot use resource'), + ({'resource': 'a', 'max_resources': 100}, + 'Cannot use parameter a as the resource since it is part of'), + ({'max_resources': 'not_auto'}, + 'max_resources must be either'), + ({'max_resources': 100.5}, + 'max_resources must be either'), + ({'max_resources': -10}, + 'max_resources must be either'), + ({'min_resources': 'not_auto'}, + 'min_resources must be either'), + ({'min_resources': 0.5}, + 'min_resources must be either'), + ({'min_resources': -10}, + 'min_resources must be either'), + ({'force_exhaust_resources': True, 'min_resources': 5}, + 'min_resources must be set to auto if '), + ({'max_resources': 'auto', 'resource': 'b'}, + "max_resources can only be 'auto' if resource='n_samples'"), + ({'min_resources': 15, 'max_resources': 14}, + "min_resources_=15 is greater than max_resources_=14"), +]) +def test_input_errors(klass, params, expected_error_message): + base_estimator = FastClassifier() + param_grid = {'a': [1]} + X, y = make_classification(100) + + sh = klass(base_estimator, param_grid, **params) + + with pytest.raises(ValueError, match=expected_error_message): + sh.fit(X, y) From 98161b32431e6641fbffb85983bc7ad7a142b6b7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 23 Aug 2019 19:40:07 -0400 Subject: [PATCH 28/89] added cv_resutlts_ user guide --- doc/modules/grid_search.rst | 37 ++++++++ .../_search_successive_halving.py | 95 ++----------------- .../tests/test_successive_halving.py | 4 +- 3 files changed, 48 insertions(+), 88 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index bcec1b74e526f..944423fc4927f 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -423,6 +423,43 @@ Notice that we end with 2 candidates at the last iteration since we have eliminated enough candidates during the first iterations, using ``resource_iter = min_resources = 20``. +.. _successive_halving_cv_results: + +The cv_results_ attribute +------------------------- + +The ``cv_results_`` attribute contains useful information for analysing the +results of a search. It can be converted to a pandas dataframe with ``df = +pd.DataFrame(est.cv_results_)``. + +Here is an example with some of the columns of a (truncated) dataframe: + +==== ====== =============== ================= ======================================================================================= + .. 
iter resource_iter mean_test_score params +==== ====== =============== ================= ======================================================================================= + 0 0 125 0.983667 {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 5} + 1 0 125 0.983667 {'criterion': 'gini', 'max_depth': None, 'max_features': 8, 'min_samples_split': 7} + 2 0 125 0.983667 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 10} + 3 0 125 0.983667 {'criterion': 'entropy', 'max_depth': None, 'max_features': 6, 'min_samples_split': 6} + ... ... ... ... ... + 15 2 500 0.951958 {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 10} + 16 2 500 0.947958 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 10} + 17 2 500 0.951958 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 4} + 18 3 1000 0.961009 {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 10} + 19 3 1000 0.955989 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 4} +==== ====== =============== ================= ======================================================================================= + +Each row corresponds to a given parameter combination (a candidate) and a given +iteration. The iteration is given by the ``iter`` column. The ``resource_iter`` +column tells you how many resources were used. + +In the example above, the best parameter combination is ``{'criterion': +'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 10}`` +since it has reached the last iteration (3) with the highest score: +0.96. + +Please note that the ``cv_results_`` attributes has much more columns that what +is shown here. .. topic:: References: diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 0c0628cdad4a4..41c384ac31218 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -490,49 +490,11 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be - imported into a pandas ``DataFrame``. 
- - For instance the below given table - - +--------------+-------------+-------------------+---+---------------+ - | param_kernel | param_gamma | split0_test_score |...|rank_test_score| - +==============+=============+===================+===+===============+ - | 'rbf' | 0.1 | 0.80 |...| 2 | - +--------------+-------------+-------------------+---+---------------+ - | 'rbf' | 0.2 | 0.90 |...| 1 | - +--------------+-------------+-------------------+---+---------------+ - | 'rbf' | 0.3 | 0.70 |...| 1 | - +--------------+-------------+-------------------+---+---------------+ - - will be represented by a ``cv_results_`` dict of:: - - { - 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], - mask = False), - 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), - 'split0_test_score' : [0.80, 0.90, 0.70], - 'split1_test_score' : [0.82, 0.50, 0.70], - 'mean_test_score' : [0.81, 0.70, 0.70], - 'std_test_score' : [0.01, 0.20, 0.00], - 'rank_test_score' : [3, 1, 1], - 'split0_train_score' : [0.80, 0.92, 0.70], - 'split1_train_score' : [0.82, 0.55, 0.70], - 'mean_train_score' : [0.81, 0.74, 0.70], - 'std_train_score' : [0.01, 0.19, 0.00], - 'mean_fit_time' : [0.73, 0.63, 0.43], - 'std_fit_time' : [0.01, 0.02, 0.01], - 'mean_score_time' : [0.01, 0.06, 0.04], - 'std_score_time' : [0.00, 0.00, 0.00], - 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], - } - - NOTE - - The key ``'params'`` is used to store a list of parameter - settings dicts for all the parameter candidates. - - The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and - ``std_score_time`` are all in seconds. + imported into a pandas ``DataFrame``. It contains many informations for + analysing the results of a search. + Please refer to the :ref:`User guide` + for details. + best_estimator_ : estimator or dict Estimator that was chosen by the search, i.e. estimator @@ -811,49 +773,10 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be - imported into a pandas ``DataFrame``. 
- - For instance the below given table - - +--------------+-------------+-------------------+---+---------------+ - | param_kernel | param_gamma | split0_test_score |...|rank_test_score| - +==============+=============+===================+===+===============+ - | 'rbf' | 0.1 | 0.80 |...| 2 | - +--------------+-------------+-------------------+---+---------------+ - | 'rbf' | 0.2 | 0.90 |...| 1 | - +--------------+-------------+-------------------+---+---------------+ - | 'rbf' | 0.3 | 0.70 |...| 1 | - +--------------+-------------+-------------------+---+---------------+ - - will be represented by a ``cv_results_`` dict of:: - - { - 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], - mask = False), - 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), - 'split0_test_score' : [0.80, 0.90, 0.70], - 'split1_test_score' : [0.82, 0.50, 0.70], - 'mean_test_score' : [0.81, 0.70, 0.70], - 'std_test_score' : [0.01, 0.20, 0.00], - 'rank_test_score' : [3, 1, 1], - 'split0_train_score' : [0.80, 0.92, 0.70], - 'split1_train_score' : [0.82, 0.55, 0.70], - 'mean_train_score' : [0.81, 0.74, 0.70], - 'std_train_score' : [0.01, 0.19, 0.00], - 'mean_fit_time' : [0.73, 0.63, 0.43], - 'std_fit_time' : [0.01, 0.02, 0.01], - 'mean_score_time' : [0.01, 0.06, 0.04], - 'std_score_time' : [0.00, 0.00, 0.00], - 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], - } - - NOTE - - The key ``'params'`` is used to store a list of parameter - settings dicts for all the parameter candidates. - - The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and - ``std_score_time`` are all in seconds. + imported into a pandas ``DataFrame``. It contains many informations for + analysing the results of a search. + Please refer to the :ref:`User guide` + for details. best_estimator_ : estimator or dict Estimator that was chosen by the search, i.e. estimator diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 72a5301c0d015..ba50bca44d9ba 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -291,9 +291,9 @@ def test_groups_not_supported(): @pytest.mark.parametrize('klass', (HalvingGridSearchCV, HalvingRandomSearchCV)) @pytest.mark.parametrize('params, expected_error_message', [ ({'scoring': {'accuracy', 'accuracy'}}, - 'Multimetric scoring is not supp'), + 'Multimetric scoring is not supported'), ({'resource': 'not_a_parameter'}, - 'Cannot use resource'), + 'Cannot use resource=not_a_parameter which is not supported'), ({'resource': 'a', 'max_resources': 100}, 'Cannot use parameter a as the resource since it is part of'), ({'max_resources': 'not_auto'}, From 19243d6d32640d67c77ee3a4e3adef647aa9e8ae Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 23 Aug 2019 19:43:17 -0400 Subject: [PATCH 29/89] minor title change --- doc/modules/grid_search.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 944423fc4927f..b6f5cc5ea3d09 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -425,8 +425,8 @@ min_resources = 20``. .. _successive_halving_cv_results: -The cv_results_ attribute -------------------------- +Analysing results with the cv_results_ attribute +------------------------------------------------ The ``cv_results_`` attribute contains useful information for analysing the results of a search. 
It can be converted to a pandas dataframe with ``df = From 5203a304fbcf24158ee64764258b9c8e78405dbb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 20 Sep 2019 09:17:19 -0400 Subject: [PATCH 30/89] fixed doc layout --- sklearn/model_selection/_search_successive_halving.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 41c384ac31218..7ffc30691facf 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -733,6 +733,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): the last iteration uses as much resousrces as possible. Namely, the last iteration uses the highest value smaller than ``max_resources`` that is a multiple of both ``min_resources`` and ``ratio``. + Attributes ---------- n_candidates_ : list of int From 9f049ec3c41a2d6e71d7d8a536d4fdaded3ccec0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 11 Nov 2019 16:42:21 -0500 Subject: [PATCH 31/89] Addressed some comments --- doc/modules/grid_search.rst | 75 ++++--- .../plot_successive_halving_heatmap.py | 9 +- .../plot_successive_halving_iterations.py | 8 +- sklearn/model_selection/_search.py | 10 +- .../_search_successive_halving.py | 13 +- .../tests/test_successive_halving.py | 209 ++++++++---------- 6 files changed, 157 insertions(+), 167 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index a4008f2015473..c94c784f3314d 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -199,6 +199,10 @@ The ``ratio`` parameter controls the rate at which the resources grow, and the rate at which the number of candidate decreases (more details in :ref:`amount_of_resource_and_number_of_candidates`) +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_heatmap.py` + * :ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_iterations.py` Choosing ``min_resources`` and the number of candidates ------------------------------------------------------- @@ -238,7 +242,7 @@ the distinction is clear even with a small amount of samples, then a small By default, ``min_resources`` is set to a small value (see docstrings for details) that depends on the number of folds, and the number of classes for -classification problems. Depending on the setting, the default valueof +classification problems. Depending on the setting, the default value of ``min_resources`` might not be ideal. .. note:: @@ -254,27 +258,29 @@ classification problems. Depending on the setting, the default valueof Amount of resource and number of candidates at each iteration ------------------------------------------------------------- -The amount of resources ``resource_iter`` (e.g. the number of samples) allocated for -each candidate at iteration ``i`` is controlled by the parameters ``ratio`` -and ``min_resources`` as follows:: +The amount of resources ``resource_iter`` (e.g. 
the number of samples) +allocated for each candidate at iteration ``i`` is controlled by the +parameters ``ratio`` and ``min_resources`` as follows:: resource_iter = ratio**i * min_resources ``min_resources`` is the amount of resources used at the first iteration and ``ratio`` defines the proportions of candidates that will be selected for -the next iteration:: +the next iteration (``ratio`` must be greater than 1):: n_candidates_to_keep = n_candidates_at_i // ratio -So in the first iteration, we use ``min_resources`` resources ``n_candidates`` -times. In the second iteration, we use ``min_resources * ratio`` resources -``n_candidates // ratio`` times. The third again multiplies the resources -per candidate and divides the number of candidates. This process stops when -the maximum amount of resource per candidate is reached, or when less than -``ratio`` candidates are left. +So in the first iteration, we use ``min_resources`` resources +``n_candidates`` times. In the second iteration, we use ``min_resources * +ratio`` resources ``n_candidates // ratio`` times. The third again +multiplies the resources per candidate and divides the number of candidates. +This process stops when the maximum amount of resource per candidate is +reached, or when we have identified the best candidate. The best candidate +is identified at the iteration that is evaluating `ratio` or less candidates +(see below). -Here is an example with ``min_resources=3`` and ``ratio=2``, starting with 70 -candidates: +Here is an example with ``min_resources=3`` and ``ratio=2``, starting with +70 candidates: +-------------+-----------------------+ | ``resource_iter`` | ``n_candidates_at_i`` | @@ -292,13 +298,20 @@ candidates: | 48 * 2 = 96 | 4 // 2 = 2 | +-------------+-----------------------+ -Ideally, at the last iteration, ``ratio`` candidates are evaluated, and we -can pick the best one. Note that each ``resource_iter`` is a multiple of both -``ratio`` and ``min_resources``. +We can note that: + +- the process stops at the first iteration which evaluates `ratio=2` + candidates: the best candidate is the best out of these 2 candidates. It + is not necessary to run an additional iteration, since it would only + evaluate one candidate (namely the best one, which we have already + identified). +- each ``resource_iter`` is a multiple of both ``ratio`` and + ``min_resources`` (which is confirmed by its definition above). -The amount of resource that is used at each iteration can be found using the -`cv_results_` after converting it to a dataframe: -`results.groupby('iter')['resource_iter'].unique()` +The amount of resources that is used at each iteration can be found using +the `cv_results_` attribute after converting it to a dataframe: +`results.groupby('iter')['resource_iter'].unique()`, as done e.g. in +:ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_iterations.py` Choosing a resource to budget ----------------------------- @@ -333,7 +346,7 @@ Exhausting the available resources ---------------------------------- As mentioned above, the first iteration uses ``min_resources`` resources. If -you have a lots of resources available, some of them might be wasted (not +you have a lot of resources available, some of them might be wasted (not used):: >>> from sklearn.datasets import make_classification @@ -379,10 +392,11 @@ to 'auto' (default). Aggressive elimination of candidates ------------------------------------ -Ideally, we want the last iteration to evaluate ``ratio`` candidates. 
We then -just have to pick the best one. When the number of available resources is -small with respect to the number of candidates, the last iteration may have -to evaluate more than ``ratio`` candidates:: +Ideally, we want the last iteration to evaluate ``ratio`` candidates (see +:ref:`amount_of_resource_and_number_of_candidates`). We then just have to +pick the best one. When the number of available resources is small with +respect to the number of candidates, the last iteration may have to evaluate +more than ``ratio`` candidates:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC @@ -409,8 +423,9 @@ to evaluate more than ``ratio`` candidates:: 1 3 Name: resource_iter, dtype: int64 -Since we cannot use more than ``max_resources=40`` resources, the process has to -stop at the second iteration which evaluates more than ``ratio=2`` candidates. +Since we cannot use more than ``max_resources=40`` resources, the process +has to stop at the second iteration which evaluates more than ``ratio=2`` +candidates. Using the ``aggressive_elimination`` parameter, you can force the search process to end up with less than ``ratio`` candidates at the last @@ -447,7 +462,10 @@ Analysing results with the cv_results_ attribute The ``cv_results_`` attribute contains useful information for analysing the results of a search. It can be converted to a pandas dataframe with ``df = -pd.DataFrame(est.cv_results_)``. +pd.DataFrame(est.cv_results_)``. The ``cv_results_`` attribute of +:class:`HalvingGridSearchCV` and :class:`HalvingRandomSearchCV` is similar +to that of :class:`GridSearchCV` and :class:`RandomizedSearchCV`>, with +additional information related to the successive halving process. Here is an example with some of the columns of a (truncated) dataframe: @@ -475,9 +493,6 @@ In the example above, the best parameter combination is ``{'criterion': since it has reached the last iteration (3) with the highest score: 0.96. -Please note that the ``cv_results_`` attributes has much more columns that what -is shown here. - .. topic:: References: .. [1] K. Jamieson, A. Talwalkar, diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index 711014e9772aa..9f9f8d2edad19 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -14,11 +14,10 @@ the ones with ``5`` are the parameter combinations that are considered the best ones. -The -:class:`HalvingGridSearchCV ` -class is able to find parameter combinations that are just as accurate as -:class:`GridSearchCV `, in much less -time. +We can see that the :class:`HalvingGridSearchCV +` class is able to find +parameter combinations that are just as accurate as :class:`GridSearchCV +`, in much less time. """ from time import time diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 098e88cf9a818..02ab43d47ea43 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -2,11 +2,11 @@ Successive Halving Iterations ============================= -This example illustrates how +This example illustrates how a successive halving search ( :class:`HalvingGridSearchCV ` -and -:class:`HalvingRandomSearchCV ` -selectively choose the best parameter combination out of multiple candidates. 
+and :class:`HalvingRandomSearchCV +`) selectively chooses the +best parameter combination out of multiple candidates. At the first iteration, a small amount of resources is used. The resource here is the number of samples that the estimators are trained on. All candidates are diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 8070a799549ed..a1f1645a5cd6a 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -566,7 +566,7 @@ def classes_(self): self._check_is_fitted("classes_") return self.best_estimator_.classes_ - def _run_search(self, evaluate_candidates, X, y): + def _run_search(self, evaluate_candidates, X, y, **fit_params): """Repeatedly calls `evaluate_candidates` to conduct a search. This method, implemented in sub-classes, makes it possible to @@ -591,12 +591,14 @@ def _run_search(self, evaluate_candidates, X, y): :: - def _run_search(self, evaluate_candidates, X, y): + def _run_search(self, evaluate_candidates, X, y, **fit_params): 'Try C=0.1 only if C=1 is better than C=10' - all_results = evaluate_candidates([{'C': 1}, {'C': 10}], X, y) + all_results = evaluate_candidates([{'C': 1}, {'C': 10}], X, y, + **fit_params) score = all_results['mean_test_score'] if score[0] < score[1]: - evaluate_candidates([{'C': 0.1}], X, y) + evaluate_candidates([{'C': 0.1}], X, y, + **fit_params) """ raise NotImplementedError("_run_search not implemented.") diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 7ffc30691facf..5f6eb384d8636 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -596,6 +596,9 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): amount of resources and iteratively selects the best candidates, using more and more resources. + The candidates are sampled at random from the parameter space and the + number of sampled candidates is determined by ``n_candidates``. + Read more in the :ref:`User guide`. Parameters @@ -612,10 +615,10 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): If a list is given, it is sampled uniformly. n_candidates: int, default='auto' - The number of candidate parameters to sample. By default this will - sample enough candidates so that the last iteration uses as many - resources as possible. Note that ``force_exhaust_resources`` has no - effect in this case. + The number of candidate parameters to sample, at the first + iteration. By default this will sample enough candidates so that the + last iteration uses as many resources as possible. Note that + ``force_exhaust_resources`` has no effect in this case. scoring : string, callable, or None, default=None A single string (see :ref:`scoring_parameter`) or a callable @@ -730,7 +733,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): force_exhaust_resources : bool, default=False If True, then ``min_resources`` is set to a specific value such that - the last iteration uses as much resousrces as possible. Namely, the + the last iteration uses as much resources as possible. Namely, the last iteration uses the highest value smaller than ``max_resources`` that is a multiple of both ``min_resources`` and ``ratio``. 
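As a concrete illustration of the ``_run_search`` hook patched in ``_search.py`` above, here is a minimal sketch of a custom search built on it. It is illustrative only and not part of the patch series: it assumes the new ``(evaluate_candidates, X, y, **fit_params)`` signature, imports the private ``BaseSearchCV`` base class, and uses an estimator that exposes a ``C`` parameter; the class name and the two-stage logic are made up for the example::

    from sklearn.datasets import make_classification
    from sklearn.model_selection._search import BaseSearchCV
    from sklearn.svm import SVC

    class TwoStageSearchCV(BaseSearchCV):
        # Evaluate C=1 and C=10 first; only try C=0.1 when C=1 turns out
        # to be the better of the two.
        def __init__(self, estimator, **kwargs):
            super().__init__(estimator, **kwargs)

        def _run_search(self, evaluate_candidates, X, y, **fit_params):
            results = evaluate_candidates([{'C': 1}, {'C': 10}], X, y,
                                          **fit_params)
            scores = results['mean_test_score']
            if scores[0] > scores[1]:
                evaluate_candidates([{'C': 0.1}], X, y, **fit_params)

    X, y = make_classification(random_state=0)
    search = TwoStageSearchCV(SVC(), cv=3).fit(X, y)

Only the candidate-generation strategy is overridden here; splitting, refitting and the ``cv_results_`` bookkeeping are still handled by the base class, exactly as for the built-in searches.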
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index ba50bca44d9ba..7d955bd78852b 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -25,127 +25,100 @@ def get_params(self, deep=False): return params -def test_aggressive_elimination(): +@pytest.mark.parametrize('klass', (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + ('aggressive_elimination,' + 'max_resources,' + 'expected_n_iterations,' + 'expected_n_required_iterations,' + 'expected_n_possible_iterations,' + 'expected_n_remaining_candidates,' + 'expected_r_i_list,'), [ + # notice how it loops at the beginning + (True, 'small', 4, 4, 3, 1, [20, 20, 60, 180]), + # no aggressive elimination: we end up with less iterations and more + # candidates at the end + (False, 'small', 3, 4, 3, 3, [20, 60, 180]), + # When the amount of resource isn't limited, aggressive_elimination + # doesn't matter. + (True, 'high', 4, 4, 4, 1, [20, 60, 180, 540]), + (False, 'high', 4, 4, 4, 1, [20, 60, 180, 540]), + ] +) +def test_aggressive_elimination( + klass, aggressive_elimination, max_resources, expected_n_iterations, + expected_n_required_iterations, expected_n_possible_iterations, + expected_n_remaining_candidates, expected_r_i_list): # Test the aggressive_elimination parameter. n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': ('l1', 'l2'), 'b': list(range(30))} base_estimator = FastClassifier() - ratio = 3 - - # aggressive_elimination is only really relevant when there is not enough - # resources. - max_resources = 180 - # aggressive_elimination=True - # In this case, the first iterations only use min_resources_ resources - sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - aggressive_elimination=True, - max_resources=max_resources, ratio=ratio) - sh.fit(X, y) - assert sh.n_iterations_ == 4 - assert sh.n_required_iterations_ == 4 - assert sh.n_possible_iterations_ == 3 - assert sh._r_i_list == [20, 20, 60, 180] # see how it loops at the start - assert sh.n_remaining_candidates_ == 1 + if max_resources == 'small': + max_resources = 180 + else: + max_resources = n_samples - # Make sure we get the same results with randomized search - sh = HalvingRandomSearchCV(base_estimator, parameters, - n_candidates=60, cv=5, - aggressive_elimination=True, - max_resources=max_resources, ratio=ratio) - sh.fit(X, y) - assert sh.n_iterations_ == 4 - assert sh.n_required_iterations_ == 4 - assert sh.n_possible_iterations_ == 3 - assert sh._r_i_list == [20, 20, 60, 180] # see how it loops at the start - assert sh.n_remaining_candidates_ == 1 - - # aggressive_elimination=False - # In this case we don't loop at the start, and might end up with a lot of - # candidates at the last iteration - sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - aggressive_elimination=False, - max_resources=max_resources, ratio=ratio) - sh.fit(X, y) - - assert sh.n_iterations_ == 3 - assert sh.n_required_iterations_ == 4 - assert sh.n_possible_iterations_ == 3 - assert sh._r_i_list == [20, 60, 180] - assert sh.n_remaining_candidates_ == 3 - - max_resources = n_samples - # with enough resources, aggressive_elimination has no effect since it is - # not needed - - # aggressive_elimination=True - sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - aggressive_elimination=True, - max_resources=max_resources, ratio=ratio) - sh.fit(X, y) 
+ sh = klass(base_estimator, parameters, + aggressive_elimination=aggressive_elimination, + max_resources=max_resources, ratio=3) - assert sh.n_iterations_ == 4 - assert sh.n_required_iterations_ == 4 - assert sh.n_possible_iterations_ == 4 - assert sh._r_i_list == [20, 60, 180, 540] - assert sh.n_remaining_candidates_ == 1 + if klass is HalvingRandomSearchCV: + sh.set_params(n_candidates=2 * 30) # same number as with the grid - # aggressive_elimination=False - sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - aggressive_elimination=False, - max_resources=max_resources, ratio=ratio) sh.fit(X, y) - assert sh.n_iterations_ == 4 - assert sh.n_required_iterations_ == 4 - assert sh.n_possible_iterations_ == 4 - assert sh._r_i_list == [20, 60, 180, 540] - assert sh.n_remaining_candidates_ == 1 + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh._r_i_list == expected_r_i_list + assert sh.n_remaining_candidates_ == expected_n_remaining_candidates -def test_force_exhaust_resources_false(): +@pytest.mark.parametrize('klass', (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + ('min_resources,' + 'max_resources,' + 'expected_n_iterations,' + 'expected_n_required_iterations,' + 'expected_n_possible_iterations,' + 'expected_r_i_list,'), [ + # with enough resources + ('auto', 'auto', 2, 2, 4, [20, 60]), + # with enough resources but min_resources!='auto': ignored + (50, 'auto', 2, 2, 3, [50, 150]), + # without enough resources (resources are exhausted anyway) + ('auto', 30, 1, 2, 1, [20]), + ] +) +def test_force_exhaust_resources_false( + klass, min_resources, max_resources, expected_n_iterations, + expected_n_required_iterations, expected_n_possible_iterations, + expected_r_i_list): # Test the force_exhaust_resources parameter when it's false or ignored. 
# This is the default case: we start at the beginning no matter what since # we do not overwrite min_resources_ - n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': [1, 2], 'b': [1, 2, 3]} base_estimator = FastClassifier() - ratio = 3 - # with enough resources - sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - force_exhaust_resources=False, ratio=ratio) - sh.fit(X, y) - assert sh.n_iterations_ == 2 - assert sh.n_required_iterations_ == 2 - assert sh.n_possible_iterations_ == 4 - assert sh._r_i_list == [20, 60] - - # with enough resources but min_resources!='auto': ignored - sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - force_exhaust_resources=False, ratio=ratio, - min_resources=50) - sh.fit(X, y) - assert sh.n_iterations_ == 2 - assert sh.n_required_iterations_ == 2 - assert sh.n_possible_iterations_ == 3 - assert sh._r_i_list == [50, 150] - - # without enough resources (resources are exhausted anyway) - sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - force_exhaust_resources=False, ratio=ratio, - max_resources=30) + sh = klass(base_estimator, parameters, force_exhaust_resources=False, + ratio=3, min_resources=min_resources, + max_resources=max_resources) + if klass is HalvingRandomSearchCV: + sh.set_params(n_candidates=6) # same number as with the grid + sh.fit(X, y) - assert sh.n_iterations_ == 1 - assert sh.n_required_iterations_ == 2 - assert sh.n_possible_iterations_ == 1 - assert sh._r_i_list == [20] + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh._r_i_list == expected_r_i_list +@pytest.mark.parametrize('klass', (HalvingRandomSearchCV, HalvingGridSearchCV)) @pytest.mark.parametrize('max_resources, r_i_list', [ ('auto', [333, 999]), (1000, [333, 999]), @@ -157,7 +130,7 @@ def test_force_exhaust_resources_false(): (50, [20]), (20, [20]), ]) -def test_force_exhaust_resources_true(max_resources, r_i_list): +def test_force_exhaust_resources_true(klass, max_resources, r_i_list): # Test the force_exhaust_resources parameter when it's true # in this case we need to change min_resources so that the last iteration # uses as much resources as possible @@ -166,25 +139,18 @@ def test_force_exhaust_resources_true(max_resources, r_i_list): X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': [1, 2], 'b': [1, 2, 3]} base_estimator = FastClassifier() - ratio = 3 - sh = HalvingGridSearchCV(base_estimator, parameters, cv=5, - force_exhaust_resources=True, ratio=ratio, - max_resources=max_resources) - sh.fit(X, y) - - assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list) - assert sh._r_i_list == r_i_list - # Test same for randomized search - sh = HalvingRandomSearchCV(base_estimator, parameters, n_candidates=6, - cv=5, force_exhaust_resources=True, - ratio=ratio, max_resources=max_resources) + sh = klass(base_estimator, parameters, force_exhaust_resources=True, + ratio=3, max_resources=max_resources) + if klass is HalvingRandomSearchCV: + sh.set_params(n_candidates=6) # same as for HalvingGridSearchCV sh.fit(X, y) assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list) assert sh._r_i_list == r_i_list +@pytest.mark.parametrize('klass', (HalvingRandomSearchCV, HalvingGridSearchCV)) @pytest.mark.parametrize( 'max_resources, n_iterations, n_possible_iterations', [ ('auto', 5, 9), # all resources are used 
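# (Sketch, not part of the patch: a back-of-the-envelope check of where the
# expected values in these parametrized tests come from.  Each iteration
# multiplies the per-candidate resources by ``ratio``, starting from
# ``min_resources`` and staying within ``max_resources`` -- the
# ``resource_iter = ratio**i * min_resources`` schedule described in the
# user guide above.  The helper name is made up for the illustration.)
from math import floor, log

def possible_resource_schedule(min_resources, max_resources, ratio):
    # Largest number of iterations whose per-candidate resources still fit
    # within max_resources.
    n_possible_iterations = floor(log(max_resources / min_resources, ratio)) + 1
    return [min_resources * ratio ** i for i in range(n_possible_iterations)]

# possible_resource_schedule(20, 1000, 3) == [20, 60, 180, 540], matching the
# expected_r_i_list of the 'high' max_resources cases above, and
# possible_resource_schedule(4, 1024, 2) has 9 entries, matching
# n_possible_iterations == 9 for max_resources='auto' in test_n_iterations
# below (which uses min_resources=4 and ratio=2 on 1024 samples).  Fewer
# iterations may actually run, since the search also stops once it is down
# to ``ratio`` candidates or fewer.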
@@ -198,7 +164,8 @@ def test_force_exhaust_resources_true(max_resources, r_i_list): (4, 1, 1), # max_resources == min_resources, only one iteration is # possible ]) -def test_n_iterations(max_resources, n_iterations, n_possible_iterations): +def test_n_iterations(klass, max_resources, n_iterations, + n_possible_iterations): # test the number of actual iterations that were run depending on # max_resources @@ -208,23 +175,26 @@ def test_n_iterations(max_resources, n_iterations, n_possible_iterations): base_estimator = FastClassifier() ratio = 2 - sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, ratio=ratio, - max_resources=max_resources, min_resources=4) + sh = klass(base_estimator, parameters, cv=2, ratio=ratio, + max_resources=max_resources, min_resources=4) + if klass is HalvingRandomSearchCV: + sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV sh.fit(X, y) assert sh.n_required_iterations_ == 5 assert sh.n_iterations_ == n_iterations assert sh.n_possible_iterations_ == n_possible_iterations -def test_resource_parameter(): +@pytest.mark.parametrize('klass', (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_resource_parameter(klass): # Test the resource parameter n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() - sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, - resource='c', max_resources=10, ratio=3) + sh = klass(base_estimator, parameters, cv=2, resource='c', + max_resources=10, ratio=3) sh.fit(X, y) assert set(sh._r_i_list) == set([1, 3, 9]) for r_i, params, param_c in zip(sh.cv_results_['resource_iter'], @@ -258,8 +228,8 @@ def test_resource_parameter(): (32, 9, 9), # ask for more than 'reasonable' ]) def test_random_search(max_resources, n_candidates, expected_n_candidates_): - # Test random search and make sure the number of generated candidates is as - # expected + # Test random search and make sure the number of generated candidates is + # as expected n_samples = 1024 X, y = make_classification(n_samples=n_samples, random_state=0) @@ -277,10 +247,11 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates_): assert sh._r_i_list[-1] == max_resources -def test_groups_not_supported(): +@pytest.mark.parametrize('klass', (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_groups_not_supported(klass): base_estimator = FastClassifier() param_grid = {'a': [1]} - sh = HalvingRandomSearchCV(base_estimator, param_grid) + sh = klass(base_estimator, param_grid) X, y = make_classification(n_samples=10) groups = [0] * 10 From b02c53e08e0e12567c25ce4fa5045b8331b5a897 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 11 Nov 2019 17:23:27 -0500 Subject: [PATCH 32/89] properly pass down fit_params --- sklearn/model_selection/_search.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index a1f1645a5cd6a..2bf6c67c47ab8 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -662,7 +662,6 @@ def fit(self, X, y=None, groups=None, **fit_params): pre_dispatch=self.pre_dispatch) fit_and_score_kwargs = dict(scorer=scorers, - fit_params=fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, return_times=True, @@ -686,6 +685,7 @@ def evaluate_candidates(candidate_params, X, y, " totalling {2} fits".format( n_splits, n_candidates, n_candidates * n_splits)) + 
fit_and_score_kwargs['fit_params'] = fit_params out = parallel(delayed(_fit_and_score)(clone(base_estimator), X, y, train=train, test=test, @@ -1161,7 +1161,7 @@ def __init__(self, estimator, param_grid, scoring=None, def _run_search(self, evaluate_candidates, X, y, **fit_params): """Search all candidates in param_grid""" - evaluate_candidates(ParameterGrid(self.param_grid), X, y) + evaluate_candidates(ParameterGrid(self.param_grid), X, y, **fit_params) class RandomizedSearchCV(BaseSearchCV): @@ -1494,4 +1494,4 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): """Search n_iter candidates from param_distributions""" evaluate_candidates(ParameterSampler( self.param_distributions, self.n_iter, - random_state=self.random_state), X, y) + random_state=self.random_state), X, y, **fit_params) From 866c08ed973a8a566c6cfbad7d606e03d8a23511 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 11 Feb 2020 14:59:17 -0500 Subject: [PATCH 33/89] change default value of force_exhaust_resources and update doc --- doc/modules/grid_search.rst | 119 +++++++++--------- .../_search_successive_halving.py | 46 ++++--- .../tests/test_successive_halving.py | 16 ++- 3 files changed, 99 insertions(+), 82 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index c94c784f3314d..0d60414833501 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -182,9 +182,9 @@ search a parameter space using successive halving [1]_ [2]_. Successive halving is an iterative selection process where all candidates (the parameter combinations) are evaluated with a small amount of resources at the first iteration. Only some of these candidates are selected for the next -iteration, which will be allocated more resources. What defines a resource -is typically the number of samples to train on, or the number of iterations -in iterative algorithms like gradient boosting. +iteration, which will be allocated more resources. What defines a resource is +typically the number of samples to train on, but it can also be an arbitrary +numeric parameter such as `n_estimators` in a random forest. As illustrated in the figure below, only a small subset of candidates 'survive' until the last iteration. These are the candidates that have consistently been @@ -215,16 +215,17 @@ iteration for each candidate. The number of candidates is specified directly in :class:`HalvingRandomSearchCV`, and is determined from the ``param_grid`` parameter of :class:`HalvingGridSearchCV`. -Consider a case where we have 1000 samples. With ``min_resources=10`` and -``ratio=2`` we are able to run 7 iterations, with the following number of -samples: ``[10, 20, 40, 80, 160, 320, 640]``. +Consider a case where the resource is the number of samples, and where we +have 1000 samples. With ``min_resources=10`` and ``ratio=2`` we are able to +run 7 iterations, with the following number of samples: ``[10, 20, 40, 80, +160, 320, 640]``. If we start with a high number of candidates, we might end up with a lot of candidates at the last iteration. On the other hand if we start with a small number of candidates, the last iteration might use less than 640 samples which is a waste of resources. -In the case of :class:`HalvingGridSearchCV`, the number of candidates is set +In the case of :class:`HalvingRandomSearchCV`, the number of candidates is set by default such that the maximum amount of resources is used at the last iteration. @@ -235,23 +236,16 @@ candidates. 
Another consideration when choosing ``min_resources`` is whether or not it is easy to discriminate between good and bad candidates with a small amount of resources. For example, if you need a lot of samples to distinguish -between good and bad parameters, a high ``min_resources`` (possibly with the -use of ``aggressive_elimination=True``) is recommended. On the other hand if -the distinction is clear even with a small amount of samples, then a small -``min_resources`` may be preferable since it would speed up the computation. - -By default, ``min_resources`` is set to a small value (see docstrings for -details) that depends on the number of folds, and the number of classes for -classification problems. Depending on the setting, the default value of -``min_resources`` might not be ideal. - -.. note:: - Notice in the example above that the last iteration does not use the - maximum amount of resources available: 1000 samples are available, yet - only 640 are used. Using ``force_exhaust_resources=True`` will set - ``min_resources`` to a specific value such that the last iteration uses as - many samples as possible. Please see :ref:`exhausting_the_resources` for - details. +between good and bad parameters, a high ``min_resources`` is recommended. On +the other hand if the distinction is clear even with a small amount of +samples, then a small ``min_resources`` may be preferable since it would +speed up the computation. + +Notice in the example above that the last iteration does not use the maximum +amount of resources available: 1000 samples are available, yet only 640 are +used, at most. By default, ``min_resources`` is set to a specific value such +that the last iteration uses as many samples as possible. Please see +:ref:`exhausting_the_resources` for details. .. _amount_of_resource_and_number_of_candidates: @@ -282,21 +276,21 @@ is identified at the iteration that is evaluating `ratio` or less candidates Here is an example with ``min_resources=3`` and ``ratio=2``, starting with 70 candidates: -+-------------+-----------------------+ ++-----------------------+-----------------------+ | ``resource_iter`` | ``n_candidates_at_i`` | -+=============+=======================+ -| 3 (=min_resources) | 70 (=n_candidates) | -+-------------+-----------------------+ -| 3 * 2 = 6 | 70 // 2 = 35 | -+-------------+-----------------------+ -| 6 * 2 = 12 | 35 // 2 = 17 | -+-------------+-----------------------+ -| 12 * 2 = 24 | 17 // 2 = 8 | -+-------------+-----------------------+ -| 24 * 2 = 48 | 8 // 2 = 4 | -+-------------+-----------------------+ -| 48 * 2 = 96 | 4 // 2 = 2 | -+-------------+-----------------------+ ++=======================+=======================+ +| 3 (=min_resources) | 70 (=n_candidates) | ++-----------------------+-----------------------+ +| 3 * 2 = 6 | 70 // 2 = 35 | ++-----------------------+-----------------------+ +| 6 * 2 = 12 | 35 // 2 = 17 | ++-----------------------+-----------------------+ +| 12 * 2 = 24 | 17 // 2 = 8 | ++-----------------------+-----------------------+ +| 24 * 2 = 48 | 8 // 2 = 4 | ++-----------------------+-----------------------+ +| 48 * 2 = 96 | 4 // 2 = 2 | ++-----------------------+-----------------------+ We can note that: @@ -304,7 +298,8 @@ We can note that: candidates: the best candidate is the best out of these 2 candidates. It is not necessary to run an additional iteration, since it would only evaluate one candidate (namely the best one, which we have already - identified). + identified). 
For this reason, in general, we want the last iteration to run + at most `ratio` candidates. - each ``resource_iter`` is a multiple of both ``ratio`` and ``min_resources`` (which is confirmed by its definition above). @@ -313,13 +308,13 @@ the `cv_results_` attribute after converting it to a dataframe: `results.groupby('iter')['resource_iter'].unique()`, as done e.g. in :ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_iterations.py` -Choosing a resource to budget ------------------------------ +Choosing a resource +------------------- -By default, the budget is defined in terms of number of samples. That is, +By default, the resource is defined in terms of number of samples. That is, each iteration will use an increasing amount of samples to train on. You can -however manually specify a parameter to use as the budget with the -``resource`` parameter. Here is an example where the budget is defined in +however manually specify a parameter to use as the resource with the +``resource`` parameter. Here is an example where the resource is defined in terms of the number of estimators of a random forest:: >>> from sklearn.datasets import make_classification @@ -335,7 +330,7 @@ terms of the number of estimators of a random forest:: ... ratio=2, resource='n_estimators', ... max_resources=30, random_state=0).fit(X, y) >>> sh.best_estimator_ - RandomForestClassifier(max_depth=5, n_estimators=8, random_state=0) + RandomForestClassifier(max_depth=5, n_estimators=24, random_state=0) Note that it is not possible to budget on a parameter that is part of the parameter grid. @@ -345,9 +340,10 @@ parameter grid. Exhausting the available resources ---------------------------------- -As mentioned above, the first iteration uses ``min_resources`` resources. If -you have a lot of resources available, some of them might be wasted (not -used):: +As mentioned above, the number of resources that is used at each iteration +depends on the `min_resources` parameter. +If you have a lot of resources available but start with a low number of +resources, some of them might be wasted (i.e. not used):: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC @@ -358,7 +354,8 @@ used):: >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2).fit(X, y) + ... ratio=2, min_resources=20, + ... force_exhaust_resources=False).fit(X, y) >>> results = pd.DataFrame(sh.cv_results_) >>> results.groupby('iter')['resource_iter'].unique() iter @@ -368,14 +365,18 @@ used):: Name: resource_iter, dtype: object The search process will only use 80 resources at most, while our maximum -amount of available resources is ``n_samples=1000``. Note in this case that -``min_resources = r_0 = 20``. In order for the last iteration to use as many -resources as possible, you can use the ``force_exhaust_resources`` -parameter.:: +amount of available resources is ``n_samples=1000``. Here, we have +``min_resources = r_0 = 20``. + +By default, the `force_exhaust_resources` parameter is True and the +`min_resources` parameter is 'auto'. This means that `min_resources` is +automatically set such that the last iteration can use as many resources as +possible, within the `max_resources` limit:: >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, force_exhaust_resources=True, - ... ).fit(X, y) + ... ratio=2, min_resources='auto', + ... force_exhaust_resources=True + ... 
).fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) >>> results.groupby('iter')['resource_iter'].unique() iter @@ -385,9 +386,11 @@ parameter.:: Name: resource_iter, dtype: object `min_resources` was here automatically set to 250, which results in the last -iteration using all the resources. Since ``force_exhaust_resources`` chooses an -appropriate ``min_resources`` to start with, ``min_resources`` must be set -to 'auto' (default). +iteration using all the resources. In general, this leads to a better final +candidate parameter, and is slightly more time-intensive. + +Since ``force_exhaust_resources`` chooses an appropriate ``min_resources`` to +start with, ``min_resources`` must be set to 'auto' (which is the default). Aggressive elimination of candidates ------------------------------------ diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 5f6eb384d8636..aab4f9cf599d7 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -411,14 +411,14 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): The minimum amount of resource that any candidate is allowed to use for a given iteration. Equivalently, this defines the amount of resources that are allocated for each candidate at the first iteration. By - default, this is set to: + default, this is set to the highest possible value + satisfying the constraint `force_exhaust_resources=True` (which is + the default). Otherwise this is set to: - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a regression problem - - The highest possible value satisfying the constraint - ``force_exhaust_resources=True``. - ``1`` when ``resource!='n_samples'`` Note that the amount of resources used at each iteration is always a @@ -444,11 +444,16 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): ``False`` by default, which means that the last iteration may evaluate more than ``ratio`` candidates. - force_exhaust_resources : bool, default=False - If True, then ``min_resources`` is set to a specific value such that - the last iteration uses as much resources as possible. Namely, the - last iteration uses the highest value smaller than ``max_resources`` - that is a multiple of both ``min_resources`` and ``ratio``. + force_exhaust_resources : bool, default=True + When True, ``min_resources`` (which must be 'auto') is set to a + specific value such that the last iteration uses as much resources as + possible. Namely, the last iteration uses the highest value smaller + than ``max_resources`` that is a multiple of both ``min_resources`` + and ``ratio``. When False, the last iteration may not exhaust the + total number of resources, since the first iteration will rely on the + value passed as the `min_resource` parameter. In general, + `force_exhaust_resources=True` leads to a more accurate estimator, + but is slightly more time consuming. 
Attributes ---------- @@ -572,7 +577,7 @@ def __init__(self, estimator, param_grid, scoring=None, error_score=np.nan, return_train_score=True, max_resources='auto', min_resources='auto', resource='n_samples', ratio=3, aggressive_elimination=False, - force_exhaust_resources=False): + force_exhaust_resources=True): super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, pre_dispatch=pre_dispatch, @@ -698,14 +703,14 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): The minimum amount of resource that any candidate is allowed to use for a given iteration. Equivalently, this defines the amount of resources that are allocated for each candidate at the first iteration. By - default, this is set to: + default, this is set to the highest possible value + satisfying the constraint `force_exhaust_resources=True` (which is + the default). Otherwise this is set to: - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a regression problem - - The highest possible value satisfying the constraint - ``force_exhaust_resources=True``. - ``1`` when ``resource!='n_samples'`` Note that the amount of resources used at each iteration is always a @@ -731,11 +736,16 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): ``False`` by default, which means that the last iteration may evaluate more than ``ratio`` candidates. - force_exhaust_resources : bool, default=False - If True, then ``min_resources`` is set to a specific value such that - the last iteration uses as much resources as possible. Namely, the - last iteration uses the highest value smaller than ``max_resources`` - that is a multiple of both ``min_resources`` and ``ratio``. + force_exhaust_resources : bool, default=True + When True, ``min_resources`` (which must be 'auto') is set to a + specific value such that the last iteration uses as much resources as + possible. Namely, the last iteration uses the highest value smaller + than ``max_resources`` that is a multiple of both ``min_resources`` + and ``ratio``. When False, the last iteration may not exhaust the + total number of resources, since the first iteration will rely on the + value passed as the `min_resource` parameter. In general, + `force_exhaust_resources=True` leads to a more accurate estimator, + but is slightly more time consuming. 
Attributes ---------- @@ -861,7 +871,7 @@ def __init__(self, estimator, param_distributions, random_state=None, error_score=np.nan, return_train_score=True, max_resources='auto', min_resources='auto', resource='n_samples', ratio=3, - aggressive_elimination=False, force_exhaust_resources=False): + aggressive_elimination=False, force_exhaust_resources=True): super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, random_state=random_state, error_score=error_score, diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 7d955bd78852b..6b39f73501433 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -63,7 +63,8 @@ def test_aggressive_elimination( sh = klass(base_estimator, parameters, aggressive_elimination=aggressive_elimination, - max_resources=max_resources, ratio=3) + max_resources=max_resources, ratio=3, + force_exhaust_resources=False) if klass is HalvingRandomSearchCV: sh.set_params(n_candidates=2 * 30) # same number as with the grid @@ -98,8 +99,8 @@ def test_force_exhaust_resources_false( expected_n_required_iterations, expected_n_possible_iterations, expected_r_i_list): # Test the force_exhaust_resources parameter when it's false or ignored. - # This is the default case: we start at the beginning no matter what since - # we do not overwrite min_resources_ + # We start at the beginning no matter what since we do not overwrite + # min_resources_ n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': [1, 2], 'b': [1, 2, 3]} @@ -176,7 +177,8 @@ def test_n_iterations(klass, max_resources, n_iterations, ratio = 2 sh = klass(base_estimator, parameters, cv=2, ratio=ratio, - max_resources=max_resources, min_resources=4) + max_resources=max_resources, min_resources=4, + force_exhaust_resources=False) if klass is HalvingRandomSearchCV: sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV sh.fit(X, y) @@ -238,7 +240,8 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates_): sh = HalvingRandomSearchCV(base_estimator, parameters, n_candidates=n_candidates, cv=2, max_resources=max_resources, ratio=2, - min_resources=4) + min_resources=4, + force_exhaust_resources=False) sh.fit(X, y) assert sh.n_candidates_[0] == expected_n_candidates_ if n_candidates == 'auto': @@ -283,7 +286,8 @@ def test_groups_not_supported(klass): 'min_resources must be set to auto if '), ({'max_resources': 'auto', 'resource': 'b'}, "max_resources can only be 'auto' if resource='n_samples'"), - ({'min_resources': 15, 'max_resources': 14}, + ({'min_resources': 15, 'max_resources': 14, + 'force_exhaust_resources': False}, "min_resources_=15 is greater than max_resources_=14"), ]) def test_input_errors(klass, params, expected_error_message): From d7c4fd805df5450b92b10037d7508ca6721a26a7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 11 Feb 2020 15:28:18 -0500 Subject: [PATCH 34/89] should fix doc --- doc/modules/grid_search.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 0d60414833501..999bfe729cd86 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -191,7 +191,7 @@ until the last iteration. These are the candidates that have consistently been part of the best candidates across all iterations. 
Each iteration is allocated an increasing amount of resources, here the number of samples. -.. figure:: ../auto_examples/svm/images/sphx_glr_plot_successive_halving_iterations_001.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_successive_halving_iterations_001.png :target: ../auto_examples/model_selection/plot_successive_halving_iterations.html :align: center @@ -460,8 +460,8 @@ min_resources = 20``. .. _successive_halving_cv_results: -Analysing results with the cv_results_ attribute ------------------------------------------------- +Analysing results with the `cv_results_` attribute +-------------------------------------------------- The ``cv_results_`` attribute contains useful information for analysing the results of a search. It can be converted to a pandas dataframe with ``df = From 3d6d9528b22be9859470fd3f3b18b5c532124486 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 12 Feb 2020 09:58:08 -0500 Subject: [PATCH 35/89] Used check_fit_params --- .../_search_successive_halving.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index aab4f9cf599d7..dbe95759d1d9c 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -1,14 +1,13 @@ from math import ceil, floor, log from abc import abstractmethod -from collections import OrderedDict from numbers import Integral import numpy as np from ._search import _check_param_grid from ._search import BaseSearchCV from . import ParameterGrid, ParameterSampler -from ..utils import check_random_state -from ..utils.validation import _num_samples +from ..utils import check_random_state, _safe_indexing +from ..utils.validation import _num_samples, _check_fit_params from ..base import is_classifier from ._split import check_cv from ..utils import resample @@ -260,15 +259,13 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): if self.resource == 'n_samples': # Subsample X and y as well as fit_params stratify = y if is_classifier(self.estimator) else None - fit_params = OrderedDict(fit_params) - X_iter, y_iter, *fit_params_iter_list = resample( - X, y, *fit_params.values(), replace=False, - random_state=rng, stratify=stratify, - n_samples=resource_iter) - fit_params_iter = { - key: fit_params_iter_list[i] - for (i, key) in enumerate(fit_params.keys()) - } + indices = resample(np.arange(X.shape[0]), replace=False, + random_state=rng, stratify=stratify, + n_samples=resource_iter) + X_iter = _safe_indexing(X, indices) + y_iter = _safe_indexing(y, indices) + fit_params_iter = _check_fit_params(X, fit_params, indices) + else: # Need copy so that the resource_iter of next iteration does # not overwrite From cabef661ead1d37e0cd25b5cda66acf03dd35aea Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 12 Mar 2020 14:57:19 -0400 Subject: [PATCH 36/89] Update section about min_resources and number of candidates --- doc/modules/grid_search.rst | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 999bfe729cd86..68fd6c026a845 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -216,22 +216,28 @@ in :class:`HalvingRandomSearchCV`, and is determined from the ``param_grid`` parameter of :class:`HalvingGridSearchCV`. 
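As a side note, the candidate count of the grid variant is simply the size of the grid, which can be checked with the long-standing public :class:`~sklearn.model_selection.ParameterGrid` helper (shown purely as an illustration)::

    from sklearn.model_selection import ParameterGrid

    param_grid = {'kernel': ('linear', 'rbf'), 'C': [1, 10, 100]}
    # 2 kernels x 3 values of C = 6 candidates at the first iteration
    print(len(ParameterGrid(param_grid)))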
Consider a case where the resource is the number of samples, and where we -have 1000 samples. With ``min_resources=10`` and ``ratio=2`` we are able to -run 7 iterations, with the following number of samples: ``[10, 20, 40, 80, -160, 320, 640]``. - -If we start with a high number of candidates, we might end up with a lot of -candidates at the last iteration. On the other hand if we start with a small -number of candidates, the last iteration might use less than 640 samples -which is a waste of resources. +have 1000 samples. In theroy, with ``min_resources=10`` and ``ratio=2``, we +are able to run **at most** 7 iterations with the following number of +samples: ``[10, 20, 40, 80, 160, 320, 640]``. + +But depending on the number of candidates, we might run less than 7 +iterations: if we start with a **small** number of candidates, the last +iteration might use less than 640 samples, which is a waste of resources. For +example if we start with 5 candidates, we only need 2 iterations: 5 +candidates for the first iteration, then `5 // 2 = 2` candidates at the +second iteration, after which we know which candidate performs the best (so +we don't need a third one). We would only be using at most 20 samples which +is a waste since we have 1000 samples at our disposal. +On the other hand, if we start with a **high** number of candidates, we might +end up with a lot of candidates at the last iteration, which is not always +ideal. In the case of :class:`HalvingRandomSearchCV`, the number of candidates is set by default such that the maximum amount of resources is used at the last -iteration. - -Changing the value of ``min_resources`` will impact the number of possible -iterations, and as a result will also have an effect on the ideal number of -candidates. +iteration. For :class:`HalvingGridSearchCV`, the number of candidates is +determined by the `param_grid` parameter. Changing the value of +``min_resources`` will impact the number of possible iterations, and as a +result will also have an effect on the ideal number of candidates. Another consideration when choosing ``min_resources`` is whether or not it is easy to discriminate between good and bad candidates with a small amount From 9d9a5d6244878eaeff23c8b71c921ecbe1974538 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 12 Mar 2020 15:13:20 -0400 Subject: [PATCH 37/89] Clarified ratio section --- doc/modules/grid_search.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 68fd6c026a845..a715edf4ef8db 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -258,17 +258,18 @@ that the last iteration uses as many samples as possible. Please see Amount of resource and number of candidates at each iteration ------------------------------------------------------------- -The amount of resources ``resource_iter`` (e.g. the number of samples) -allocated for each candidate at iteration ``i`` is controlled by the -parameters ``ratio`` and ``min_resources`` as follows:: +At any iteration `i`, each candidate is allocated a given amount of resources +which we denote `resource_iter`. 
This quantity is controlled by the +parameters ``ratio`` and ``min_resources`` as follows (`ratio` is strictly +greater than 1):: - resource_iter = ratio**i * min_resources + resource_iter = ratio**i * min_resources, -``min_resources`` is the amount of resources used at the first iteration and -``ratio`` defines the proportions of candidates that will be selected for -the next iteration (``ratio`` must be greater than 1):: +where ``min_resources`` is the amount of resources used at the first +iteration, and ``ratio (> 1)`` defines the proportions of candidates that +will be selected for the next iteration:: - n_candidates_to_keep = n_candidates_at_i // ratio + n_candidates_at_i+1 = n_candidates_at_i // ratio So in the first iteration, we use ``min_resources`` resources ``n_candidates`` times. In the second iteration, we use ``min_resources * @@ -277,7 +278,7 @@ multiplies the resources per candidate and divides the number of candidates. This process stops when the maximum amount of resource per candidate is reached, or when we have identified the best candidate. The best candidate is identified at the iteration that is evaluating `ratio` or less candidates -(see below). +(see just below for an explanation). Here is an example with ``min_resources=3`` and ``ratio=2``, starting with 70 candidates: @@ -304,8 +305,8 @@ We can note that: candidates: the best candidate is the best out of these 2 candidates. It is not necessary to run an additional iteration, since it would only evaluate one candidate (namely the best one, which we have already - identified). For this reason, in general, we want the last iteration to run - at most `ratio` candidates. + identified). For this reason, **in general, we want the last iteration to + run at most `ratio` candidates**. - each ``resource_iter`` is a multiple of both ``ratio`` and ``min_resources`` (which is confirmed by its definition above). From 0eace47134f612db4fb4ef6e799adee5e528845c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 12 Mar 2020 15:28:14 -0400 Subject: [PATCH 38/89] Use ~ to refer to classes --- .../plot_successive_halving_heatmap.py | 13 ++++++------- .../plot_successive_halving_iterations.py | 7 +++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index 9f9f8d2edad19..86c4c994f8de2 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -3,21 +3,20 @@ ===================================================== This example compares the parameter search performed by -:class:`HalvingGridSearchCV ` and -:class:`GridSearchCV `. +:class:`~sklearn.model_selection.HalvingGridSearchCV` and +:class:`~sklearn.model_selection.GridSearchCV`. The heatmap shows the mean test score of the parameter combinations for an SVC instance. The -:class:`HalvingGridSearchCV ` +:class:`~sklearn.model_selection.HalvingGridSearchCV` also shows the iteration at which the combinations where last used. The combinations marked as ``0`` were only evaluated at the first iteration, while the ones with ``5`` are the parameter combinations that are considered the best ones. -We can see that the :class:`HalvingGridSearchCV -` class is able to find -parameter combinations that are just as accurate as :class:`GridSearchCV -`, in much less time. 
+We can see that the :class:`~sklearn.model_selection.HalvingGridSearchCV` +class is able to find parameter combinations that are just as accurate as +:class:`~sklearn.model_selection.GridSearchCV`, in much less time. """ from time import time diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 02ab43d47ea43..6e770dc2973ad 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -3,10 +3,9 @@ ============================= This example illustrates how a successive halving search ( -:class:`HalvingGridSearchCV ` -and :class:`HalvingRandomSearchCV -`) selectively chooses the -best parameter combination out of multiple candidates. +:class:`~sklearn.model_selection.HalvingGridSearchCV` and +:class:`~sklearn.model_selection.HalvingRandomSearchCV`) selectively chooses +the best parameter combination out of multiple candidates. At the first iteration, a small amount of resources is used. The resource here is the number of samples that the estimators are trained on. All candidates are From 39bf2e2a8845a7d48a8e54e9a2ced760727826de Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 8 Jul 2020 12:27:43 -0400 Subject: [PATCH 39/89] fixed doc checks --- sklearn/model_selection/_search_successive_halving.py | 6 +++--- sklearn/tests/test_docstring_parameters.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 37b783d6d8624..147714dc7a6dd 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -35,7 +35,7 @@ class BaseSuccessiveHalving(BaseSearchCV): Almost optimal exploration in multi-armed bandits, ICML 13 Zohar Karnin, Tomer Koren, Oren Somekh """ - def __init__(self, estimator, scoring=None, + def __init__(self, estimator, *, scoring=None, n_jobs=None, refit=True, cv=5, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=True, @@ -568,7 +568,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): """ _required_parameters = ["estimator", "param_grid"] - def __init__(self, estimator, param_grid, scoring=None, + def __init__(self, estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, verbose=0, cv=5, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=True, @@ -862,7 +862,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): """ _required_parameters = ["estimator", "param_distributions"] - def __init__(self, estimator, param_distributions, + def __init__(self, estimator, param_distributions, *, n_candidates='auto', scoring=None, n_jobs=None, refit=True, verbose=0, cv=5, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index b0799232a0e0f..9b0749dc62571 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -186,7 +186,8 @@ def test_fit_docstring_attributes(name, Estimator): 'SelectFromModel', 'SparseCoder', 'SparseRandomProjection', 'SpectralBiclustering', 'StackingClassifier', 'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier', - 'VotingRegressor'} + 'VotingRegressor', 'HalvingGridSearchCV', + 'HalvingRandomSearchCV'} if Estimator.__name__ in IGNORED or 
Estimator.__name__.startswith('_'): pytest.skip("Estimator cannot be fit easily to test fit attributes") From 1a0808e8c5eed75769da57a5b5498a7d857883a1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 9 Jul 2020 09:31:27 -0400 Subject: [PATCH 40/89] Apply suggestions from code review Co-authored-by: Joel Nothman --- sklearn/model_selection/_search_successive_halving.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 147714dc7a6dd..23dbd06d8d145 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -314,7 +314,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): amount of resources and iteratively selects the best candidates, using more and more resources. - Read more in the :ref:`User guide`. + Read more in the :ref:`User guide `. Parameters ---------- @@ -400,8 +400,8 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): yield the best generalization performance. max_resources : int, default='auto' - The maximum number of resources that any candidate is allowed to use - for a given iteration. By default, this is set ``n_samples`` when + The maximum amount of resource that any candidate is allowed to use + for a given iteration. By default, this is set to ``n_samples`` when ``resource='n_samples'`` (default), else an error is raised. min_resources : int, default='auto' @@ -416,7 +416,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): problem - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a regression problem - - ``1`` when ``resource!='n_samples'`` + - ``1`` when ``resource != 'n_samples'`` Note that the amount of resources used at each iteration is always a multiple of ``min_resources``. @@ -448,7 +448,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): than ``max_resources`` that is a multiple of both ``min_resources`` and ``ratio``. When False, the last iteration may not exhaust the total number of resources, since the first iteration will rely on the - value passed as the `min_resource` parameter. In general, + value passed as the `min_resources` parameter. In general, `force_exhaust_resources=True` leads to a more accurate estimator, but is slightly more time consuming. From d4d7d10ca714f950eb55ce53046983efca419043 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 9 Jul 2020 09:39:00 -0400 Subject: [PATCH 41/89] Addressed easy comments from Joel --- doc/modules/grid_search.rst | 3 ++- sklearn/model_selection/_search.py | 4 ++-- .../_search_successive_halving.py | 21 +++++++++++-------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index a715edf4ef8db..00cbfaca7a18e 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -179,7 +179,8 @@ Searching optimal parameters with successive halving Scikit-learn also provides the :class:`HalvingGridSearchCV` and :class:`HalvingRandomSearchCV` estimators that can be used to search a parameter space using successive halving [1]_ [2]_. Successive -halving is an iterative selection process where all candidates (the +halving (SH) is a sort of tournament between candidate parameter combinations. +SH is an iterative selection process where all candidates (the parameter combinations) are evaluated with a small amount of resources at the first iteration. 
Only some of these candidates are selected for the next iteration, which will be allocated more resources. What defines a resource is diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 8ddee110817a2..c6da0a4245630 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -813,11 +813,11 @@ def evaluate_candidates(candidate_params, X, y, return self def _format_results(self, candidate_params, scorers, n_splits, out, - more_results={}): + more_results=None): n_candidates = len(candidate_params) out = _aggregate_score_dicts(out) - results = dict(more_results) + results = dict(more_results or {}) def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 147714dc7a6dd..c9f69e8df66ad 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -426,12 +426,13 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): the resource is the number of samples. It can also be set to any parameter of the base estimator that accepts positive integer values, e.g. 'n_iterations' or 'n_estimators' for a gradient - boosting estimator. In this case ``max_resources`` cannot be 'auto'. + boosting estimator. In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. ratio : int or float, default=3 The 'halving' parameter, which determines the proportion of candidates - that are selected for the next iteration. For example, ``ratio=3`` - means that only one third of the candidates are selected. + that are selected for each subsequent iteration. For example, + ``ratio=3`` means that only one third of the candidates are selected. aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough resources to @@ -439,7 +440,8 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): the search process will 'replay' the first iteration for as long as needed until the number of candidates is small enough. This is ``False`` by default, which means that the last iteration may evaluate - more than ``ratio`` candidates. + more than ``ratio`` candidates. See :ref:aggressive_elimination` for + more details. force_exhaust_resources : bool, default=True When True, ``min_resources`` (which must be 'auto') is set to a @@ -497,7 +499,6 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): Please refer to the :ref:`User guide` for details. - best_estimator_ : estimator or dict Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) @@ -718,12 +719,13 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): the resource is the number of samples. It can also be set to any parameter of the base estimator that accepts positive integer values, e.g. 'n_iterations' or 'n_estimators' for a gradient - boosting estimator. In this case ``max_resources`` cannot be 'auto'. + boosting estimator. In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. ratio : int or float, default=3 The 'halving' parameter, which determines the proportion of candidates - that are selected for the next iteration. For example, ``ratio=3`` - means that only one third of the candidates are selected. + that are selected for each subsequent iteration. 
For example, + ``ratio=3`` means that only one third of the candidates are selected. aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough resources to @@ -731,7 +733,8 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): the search process will 'replay' the first iteration for as long as needed until the number of candidates is small enough. This is ``False`` by default, which means that the last iteration may evaluate - more than ``ratio`` candidates. + more than ``ratio`` candidates. See :ref:aggressive_elimination` for + more details. force_exhaust_resources : bool, default=True When True, ``min_resources`` (which must be 'auto') is set to a From 2cffdc378d31f9dd83fd38dcd5400122fabee9e8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 9 Jul 2020 10:09:13 -0400 Subject: [PATCH 42/89] missed some --- doc/modules/grid_search.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 00cbfaca7a18e..fbade951c522d 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -400,6 +400,8 @@ candidate parameter, and is slightly more time-intensive. Since ``force_exhaust_resources`` chooses an appropriate ``min_resources`` to start with, ``min_resources`` must be set to 'auto' (which is the default). +.. _aggressive_elimination: + Aggressive elimination of candidates ------------------------------------ From 1403dfaecad33602e0f13f57b50141aa28a84a30 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 20 Jul 2020 09:58:40 -0400 Subject: [PATCH 43/89] updated docstring of run_search --- sklearn/model_selection/_search.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index c6da0a4245630..89b298031ac9e 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -621,13 +621,26 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): collected evaluation results. This makes it possible to implement Bayesian optimization or more generally sequential model-based optimization by deriving from the BaseSearchCV abstract base class. + For example, Succesive Halving is implemented by calling + `evaluate_candidates` multiples times (once per iteration of the SH + process), each time passing a different set of candidates with `X` + and `y` of increasing sizes. Parameters ---------- evaluate_candidates : callable - This callback accepts a list of candidates, where each candidate is - a dict of parameter settings. It returns a dict of all results so - far, formatted like ``cv_results_``. + This callback accepts: + - a list of candidates, where each candidate is a dict of + parameter settings. + - the samples `X` + - the targets `y` + - an optional `more_results` dict. Each key will be added to + the `cv_results_` attribute. Values should be lists of + length `n_candidates` + - a **fit_params keyword + + It returns a dict of all results so far, formatted like + ``cv_results_``. 
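For readers unfamiliar with the layout of such a dict, here is a minimal standalone illustration using the stable public :class:`~sklearn.model_selection.GridSearchCV`; the callback described above returns the same kind of mapping, with any ``more_results`` entries appearing as extra keys::

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=100, random_state=0)
    search = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=3).fit(X, y)
    # Every key maps to a sequence with one entry per candidate.
    df = pd.DataFrame(search.cv_results_)
    print(df[['params', 'mean_test_score', 'rank_test_score']])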
Examples -------- From 446666cfdc754c7b5f25087996b32426881efb70 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 20 Jul 2020 10:08:44 -0400 Subject: [PATCH 44/89] Used f strings instead of format --- .../_search_successive_halving.py | 41 +++++++++---------- .../tests/test_successive_halving.py | 3 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index bf613862b998d..8f9da06b7718c 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -72,9 +72,9 @@ def _check_input_parameters(self, X, y, groups): if (self.resource != 'n_samples' and self.resource not in self.estimator.get_params()): raise ValueError( - 'Cannot use resource={} which is not supported ' - 'by estimator {}'.format(self.resource, - self.estimator.__class__.__name__)) + f'Cannot use resource={self.resource} which is not supported ' + f'by estimator {self.estimator.__class__.__name__}' + ) if (isinstance(self.max_resources, str) and self.max_resources != 'auto'): @@ -133,8 +133,8 @@ def _check_input_parameters(self, X, y, groups): if self.min_resources_ > self.max_resources_: raise ValueError( - 'min_resources_={} is greater than max_resources_={}.' - .format(self.min_resources_, self.max_resources_) + f'min_resources_={self.min_resources_} is greater ' + f'than max_resources_={self.max_resources_}.' ) def fit(self, X, y=None, groups=None, **fit_params): @@ -180,8 +180,9 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): self.resource in candidate for candidate in candidate_params): # Can only check this now since we need the candidates list raise ValueError( - "Cannot use parameter {} as the resource since it is part of " - "the searched parameters.".format(self.resource)) + f"Cannot use parameter {self.resource} as the resource since " + "it is part of the searched parameters." + ) # n_required_iterations is the number of iterations needed so that the # last iterations evaluates less than `ratio` candidates. 
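The interplay between the number of candidates, ``min_resources``, ``max_resources`` and ``ratio`` that the comment above refers to can be pictured with a short standalone sketch. It is a simplification written only for this note (it ignores ``aggressive_elimination`` and the 'auto' heuristics) and is not the code of this patch::

    def schedule(n_candidates, min_resources, max_resources, ratio):
        # Yield (n_candidates, resources) per iteration, stopping once at most
        # `ratio` candidates remain or growing the budget would exceed
        # max_resources.
        resources = min_resources
        while True:
            yield n_candidates, resources
            if n_candidates <= ratio or resources * ratio > max_resources:
                return
            n_candidates //= ratio
            resources *= ratio

    # 70 candidates, min_resources=3, ratio=2 (the worked example from the
    # user guide): 70, 35, 17, 8, 4, 2 candidates evaluated on 3, 6, 12, 24,
    # 48, 96 resources respectively.
    print(list(schedule(70, min_resources=3, max_resources=1000, ratio=2)))

    # Only 5 candidates with min_resources=10: two iterations, and at most 20
    # of the 1000 available samples are ever used.
    print(list(schedule(5, min_resources=10, max_resources=1000, ratio=2)))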
@@ -214,16 +215,14 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): n_iterations = min(n_possible_iterations, n_required_iterations) if self.verbose: - print('n_iterations: {}'.format(n_iterations)) - print('n_required_iterations: {}'.format(n_required_iterations)) - print('n_possible_iterations: {}'.format(n_possible_iterations)) - print('min_resources_: {}'.format(self.min_resources_)) - print('max_resources_: {}'.format(self.max_resources_)) - print('aggressive_elimination: {}'.format( - self.aggressive_elimination)) - print('force_exhaust_resources: {}'.format( - self.force_exhaust_resources)) - print('ratio: {}'.format(self.ratio)) + print(f'n_iterations: {n_iterations}') + print(f'n_required_iterations: {n_required_iterations}') + print(f'n_possible_iterations: {n_possible_iterations}') + print(f'min_resources_: {self.min_resources_}') + print(f'max_resources_: {self.max_resources_}') + print(f'aggressive_elimination: {self.aggressive_elimination}') + print(f'force_exhaust_resources: {self.force_exhaust_resources}') + print(f'ratio: {self.ratio}') # list of resource_iter for each iteration, used in tests self._r_i_list = [] @@ -252,9 +251,9 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): if self.verbose: print('-' * 10) - print('iter_i: {}'.format(iter_i)) - print('n_candidates: {}'.format(n_candidates)) - print('resource_iter: {}'.format(resource_iter)) + print(f'iter_i: {iter_i}') + print(f'n_candidates: {n_candidates}') + print(f'resource_iter: {resource_iter}') if self.resource == 'n_samples': # Subsample X and y as well as fit_params @@ -617,7 +616,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. - n_candidates: int, default='auto' + n_candidates : int, default='auto' The number of candidate parameters to sample, at the first iteration. By default this will sample enough candidates so that the last iteration uses as many resources as possible. 
Note that diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 6b39f73501433..791ee681d0b58 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -64,7 +64,8 @@ def test_aggressive_elimination( sh = klass(base_estimator, parameters, aggressive_elimination=aggressive_elimination, max_resources=max_resources, ratio=3, - force_exhaust_resources=False) + force_exhaust_resources=False, + verbose=True) # just for test coverage if klass is HalvingRandomSearchCV: sh.set_params(n_candidates=2 * 30) # same number as with the grid From ed4f86ddcc8de61db9cfdba29e07acc729cb0d03 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 20 Jul 2020 10:32:42 -0400 Subject: [PATCH 45/89] remove candidate duplication checks --- sklearn/model_selection/_search_successive_halving.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 8f9da06b7718c..e8246f7197853 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -162,19 +162,19 @@ def fit(self, X, y=None, groups=None, **fit_params): y=y, groups=groups, ) + super().fit(X, y=y, groups=None, **fit_params) + # Set best_score_: BaseSearchCV does not set it, as refit is a callable self.best_score_ = ( self.cv_results_['mean_test_score'][self.best_index_]) + return self def _run_search(self, evaluate_candidates, X, y, **fit_params): rng = check_random_state(self.random_state) candidate_params = self._generate_candidate_params() - # Remove duplicates (may happen with random sampling) - candidate_params = set(tuple(d.items()) for d in candidate_params) - candidate_params = [dict(t) for t in candidate_params] if self.resource != 'n_samples' and any( self.resource in candidate for candidate in candidate_params): From c86be6d57dd12362dece053acf33635e028f868c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 20 Jul 2020 11:16:40 -0400 Subject: [PATCH 46/89] fix example --- examples/model_selection/plot_successive_halving_iterations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 6e770dc2973ad..8b181e2bdf3eb 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -56,6 +56,7 @@ results = pd.DataFrame(rsh.cv_results_) results['params_str'] = results.params.apply(str) +results.drop_duplicates(subset=('params_str', 'iter'), inplace=True) mean_scores = results.pivot(index='iter', columns='params_str', values='mean_test_score') ax = mean_scores.plot(legend=False, alpha=.6) From 907ed9a7c1456a78d2edfe49b97d6f42b2527d6c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Jul 2020 09:57:23 -0400 Subject: [PATCH 47/89] Addressed easy comments --- doc/modules/grid_search.rst | 29 +++++++++++-------- .../plot_successive_halving_heatmap.py | 2 +- .../plot_successive_halving_iterations.py | 4 +-- .../_search_successive_halving.py | 26 ++++++++--------- 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index fbade951c522d..616b3bafedf82 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ 
-173,13 +173,13 @@ variable that is log-uniformly distributed between ``1e0`` and ``1e3``:: .. _successive_halving_user_guide: -Searching optimal parameters with successive halving -==================================================== +Searching for optimal parameters with successive halving +======================================================== Scikit-learn also provides the :class:`HalvingGridSearchCV` and :class:`HalvingRandomSearchCV` estimators that can be used to search a parameter space using successive halving [1]_ [2]_. Successive -halving (SH) is a sort of tournament between candidate parameter combinations. +halving (SH) is like a tournament among candidate parameter combinations. SH is an iterative selection process where all candidates (the parameter combinations) are evaluated with a small amount of resources at the first iteration. Only some of these candidates are selected for the next @@ -187,18 +187,19 @@ iteration, which will be allocated more resources. What defines a resource is typically the number of samples to train on, but it can also be an arbitrary numeric parameter such as `n_estimators` in a random forest. -As illustrated in the figure below, only a small subset of candidates 'survive' -until the last iteration. These are the candidates that have consistently been -part of the best candidates across all iterations. Each iteration is allocated -an increasing amount of resources, here the number of samples. +As illustrated in the figure below, only a small subset of candidates +'survive' until the last iteration. These are the candidates that have +consistently ranked among the best candidates across all iterations. Each +iteration is allocated an increasing amount of resources per candidate, here +the number of samples. .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_successive_halving_iterations_001.png :target: ../auto_examples/model_selection/plot_successive_halving_iterations.html :align: center The ``ratio`` parameter controls the rate at which the resources grow, and -the rate at which the number of candidate decreases (more details in -:ref:`amount_of_resource_and_number_of_candidates`) +the rate at which the number of candidates decreases. More details in +:ref:`amount_of_resource_and_number_of_candidates`. .. topic:: Examples: @@ -217,7 +218,7 @@ in :class:`HalvingRandomSearchCV`, and is determined from the ``param_grid`` parameter of :class:`HalvingGridSearchCV`. Consider a case where the resource is the number of samples, and where we -have 1000 samples. In theroy, with ``min_resources=10`` and ``ratio=2``, we +have 1000 samples. In theory, with ``min_resources=10`` and ``ratio=2``, we are able to run **at most** 7 iterations with the following number of samples: ``[10, 20, 40, 80, 160, 320, 640]``. @@ -267,9 +268,13 @@ greater than 1):: resource_iter = ratio**i * min_resources, where ``min_resources`` is the amount of resources used at the first -iteration, and ``ratio (> 1)`` defines the proportions of candidates that +iteration. ``ratio (> 1)`` also defines the proportions of candidates that will be selected for the next iteration:: + n_candidates_iter = n_candidates // (ratio ** i) + +or equivalently:: + n_candidates_at_i+1 = n_candidates_at_i // ratio So in the first iteration, we use ``min_resources`` resources @@ -477,7 +482,7 @@ The ``cv_results_`` attribute contains useful information for analysing the results of a search. It can be converted to a pandas dataframe with ``df = pd.DataFrame(est.cv_results_)``. 
The ``cv_results_`` attribute of :class:`HalvingGridSearchCV` and :class:`HalvingRandomSearchCV` is similar -to that of :class:`GridSearchCV` and :class:`RandomizedSearchCV`>, with +to that of :class:`GridSearchCV` and :class:`RandomizedSearchCV`, with additional information related to the successive halving process. Here is an example with some of the columns of a (truncated) dataframe: diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index 86c4c994f8de2..292810681e1b0 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -104,7 +104,7 @@ def make_heatmap(ax, gs, show_iter=False, make_cbar=False): # Plot heatmaps and colorbar -fig, axes = plt.subplots(ncols=2) +fig, axes = plt.subplots(ncols=2, sharey=True) ax1, ax2 = axes make_heatmap(ax1, gsh, show_iter=True) diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 8b181e2bdf3eb..603e5f6b2263f 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -1,10 +1,10 @@ """ -Successive Halving Iterations +Successive halving Iterations ============================= This example illustrates how a successive halving search ( :class:`~sklearn.model_selection.HalvingGridSearchCV` and -:class:`~sklearn.model_selection.HalvingRandomSearchCV`) selectively chooses +:class:`~sklearn.model_selection.HalvingRandomSearchCV`) iteratively chooses the best parameter combination out of multiple candidates. At the first iteration, a small amount of resources is used. The resource here diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index e8246f7197853..2c6a7317f54eb 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -435,12 +435,12 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough resources to - eliminate enough candidates at the last iteration. If ``True``, then - the search process will 'replay' the first iteration for as long as - needed until the number of candidates is small enough. This is - ``False`` by default, which means that the last iteration may evaluate - more than ``ratio`` candidates. See :ref:aggressive_elimination` for - more details. + reduce the candidates to at most `ratio` in the last iteration. If + ``True``, then the search process will 'replay' the first iteration + for as long as needed until the number of candidates is small enough. + This is ``False`` by default, which means that the last iteration may + evaluate more than ``ratio`` candidates. See + :ref:`aggressive_elimination` for more details. force_exhaust_resources : bool, default=True When True, ``min_resources`` (which must be 'auto') is set to a @@ -728,12 +728,12 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough resources to - eliminate enough candidates at the last iteration. If ``True``, then - the search process will 'replay' the first iteration for as long as - needed until the number of candidates is small enough. 
This is - ``False`` by default, which means that the last iteration may evaluate - more than ``ratio`` candidates. See :ref:aggressive_elimination` for - more details. + reduce the candidates to at most `ratio` in the last iteration. If + ``True``, then the search process will 'replay' the first iteration + for as long as needed until the number of candidates is small enough. + This is ``False`` by default, which means that the last iteration may + evaluate more than ``ratio`` candidates. See + :ref:`aggressive_elimination` for more details. force_exhaust_resources : bool, default=True When True, ``min_resources`` (which must be 'auto') is set to a @@ -742,7 +742,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): than ``max_resources`` that is a multiple of both ``min_resources`` and ``ratio``. When False, the last iteration may not exhaust the total number of resources, since the first iteration will rely on the - value passed as the `min_resource` parameter. In general, + value passed as the `min_resources` parameter. In general, `force_exhaust_resources=True` leads to a more accurate estimator, but is slightly more time consuming. From dcb7f467063488d85cf786080e1a76f10f7be140 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Jul 2020 10:44:23 -0400 Subject: [PATCH 48/89] rotate ticks labels --- .../plot_successive_halving_iterations.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 603e5f6b2263f..dc403a462b934 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -62,10 +62,12 @@ ax = mean_scores.plot(legend=False, alpha=.6) r_i_list = results.groupby('iter')['resource_iter'].unique() -labels = ['{}\nn_samples={}\nn_candidates={}' - .format(i, r_i_list[i][0], rsh.n_candidates_[i]) - for i in range(rsh.n_iterations_)] -ax.set_xticklabels(labels) +labels = [ + f'iter={i}\nn_samples={r_i_list[i][0]}\n' + f'n_candidates={rsh.n_candidates_[i]}' + for i in range(rsh.n_iterations_) +] +ax.set_xticklabels(labels, rotation=45, multialignment ='left') ax.set_title('Scores of candidates over iterations') ax.set_ylabel('mean test score', fontsize=15) ax.set_xlabel('iterations', fontsize=15) From ac2368345f8532de3276b59e77d7f94ce6d87521 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Jul 2020 10:52:05 -0400 Subject: [PATCH 49/89] Added discussion in the intro as suggested by Joel --- doc/modules/grid_search.rst | 14 ++++++++++++-- .../plot_successive_halving_iterations.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 616b3bafedf82..b16d7d8af4cfb 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -198,8 +198,18 @@ the number of samples. :align: center The ``ratio`` parameter controls the rate at which the resources grow, and -the rate at which the number of candidates decreases. More details in -:ref:`amount_of_resource_and_number_of_candidates`. +the rate at which the number of candidates decreases. In each iteration, the +number of resources per candidate is multiplied by ``ratio`` and the number +of candidates is divided by the same ratio. Along with ``resource`` and +``max_resources``, ``ratio`` is the most important parameter to control the +search in our implementation. 
``ratio`` effectively controls the number of +iterations in :class:`HalvingGridSearchCV` and the number of candidates (if +'auto') and iterations in :class:`HalvingRandomSearchCV`. +``aggressive_elimination=True`` can also be used if the number of resources +is small but each evaluation on a large number of resources is expensive. +More control is available through tuning the ``min_resources`` parameter. +Each parameter and their interactions are described in more details below. + .. topic:: Examples: diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index dc403a462b934..f6b911d8c8a74 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -67,7 +67,7 @@ f'n_candidates={rsh.n_candidates_[i]}' for i in range(rsh.n_iterations_) ] -ax.set_xticklabels(labels, rotation=45, multialignment ='left') +ax.set_xticklabels(labels, rotation=45, multialignment='left') ax.set_title('Scores of candidates over iterations') ax.set_ylabel('mean test score', fontsize=15) ax.set_xlabel('iterations', fontsize=15) From 33b60d7aec1dc780515895d3ecb6bcf2c2627271 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Jul 2020 11:17:39 -0400 Subject: [PATCH 50/89] Split examples into sections --- doc/modules/grid_search.rst | 1 - .../plot_successive_halving_heatmap.py | 35 +++++++++++-------- .../plot_successive_halving_iterations.py | 35 +++++++++++++------ 3 files changed, 45 insertions(+), 26 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index b16d7d8af4cfb..3cad4412d13c6 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -210,7 +210,6 @@ is small but each evaluation on a large number of resources is expensive. More control is available through tuning the ``min_resources`` parameter. Each parameter and their interactions are described in more details below. - .. topic:: Examples: * :ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_heatmap.py` diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index 292810681e1b0..32bc010ae3599 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -6,17 +6,6 @@ :class:`~sklearn.model_selection.HalvingGridSearchCV` and :class:`~sklearn.model_selection.GridSearchCV`. -The heatmap shows the mean test score of the parameter combinations for an -SVC instance. The -:class:`~sklearn.model_selection.HalvingGridSearchCV` -also shows the iteration at which the combinations where last used. The -combinations marked as ``0`` were only evaluated at the first iteration, while -the ones with ``5`` are the parameter combinations that are considered the -best ones. - -We can see that the :class:`~sklearn.model_selection.HalvingGridSearchCV` -class is able to find parameter combinations that are just as accurate as -:class:`~sklearn.model_selection.GridSearchCV`, in much less time. 
""" from time import time @@ -32,6 +21,12 @@ class is able to find parameter combinations that are just as accurate as print(__doc__) +# %% +# We first define the parameter space for an :class:`~sklearn.svm.SVC` +# estimator, and compute the time required to train a +# :class:`~sklearn.model_selection.HalvingGridSearchCV` instance, as well as a +# :class:`~sklearn.model_selection.GridSearchCV` instance. + rng = np.random.RandomState(0) X, y = datasets.make_classification(n_samples=1000, random_state=rng) @@ -41,7 +36,6 @@ class is able to find parameter combinations that are just as accurate as clf = SVC(random_state=rng) -# run HalvingGridSearchCV tic = time() gsh = HalvingGridSearchCV( estimator=clf, @@ -55,7 +49,6 @@ class is able to find parameter combinations that are just as accurate as gsh.fit(X, y) gsh_time = time() - tic -# run GridSearchCV tic = time() gs = GridSearchCV( estimator=clf, @@ -64,6 +57,9 @@ class is able to find parameter combinations that are just as accurate as gs.fit(X, y) gs_time = time() - tic +# %% +# We now plot heatmaps for both search estimators. + def make_heatmap(ax, gs, show_iter=False, make_cbar=False): """Helper to make a heatmap.""" @@ -103,7 +99,6 @@ def make_heatmap(ax, gs, show_iter=False, make_cbar=False): fontsize=15) -# Plot heatmaps and colorbar fig, axes = plt.subplots(ncols=2, sharey=True) ax1, ax2 = axes @@ -115,3 +110,15 @@ def make_heatmap(ax, gs, show_iter=False, make_cbar=False): ax2.set_title('GridSearch (time = {:.3f}s)'.format(gs_time), fontsize=15) plt.show() + +# %% +# The heatmaps show the mean test score of the parameter combinations for an +# :class:`~sklearn.svm.SVC` instance. The +# :class:`~sklearn.model_selection.HalvingGridSearchCV` also shows the +# iteration at which the combinations where last used. The combinations marked +# as ``0`` were only evaluated at the first iteration, while the ones with +# ``5`` are the parameter combinations that are considered the best ones. +# +# We can see that the :class:`~sklearn.model_selection.HalvingGridSearchCV` +# class is able to find parameter combinations that are just as accurate as +# :class:`~sklearn.model_selection.GridSearchCV`, in much less time. diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index f6b911d8c8a74..da4ce66f934ab 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -7,17 +7,6 @@ :class:`~sklearn.model_selection.HalvingRandomSearchCV`) iteratively chooses the best parameter combination out of multiple candidates. -At the first iteration, a small amount of resources is used. The resource here -is the number of samples that the estimators are trained on. All candidates are -evaluated. - -At the second iteration, only the best half of the candidates is evaluated. -The number of allocated resources is doubled: candidates are evaluated on -twice as many samples. - -This process is repeated until the last iteration, where only 2 candidates -are left. The best candidate is the candidate that has the best score at the -last iteration. """ import pandas as pd from sklearn import datasets @@ -31,6 +20,10 @@ print(__doc__) +# %% +# We first define the parameter space and train a +# :class:`~sklearn.model_selection.HalvingRandomSearchCV` instance. 
+ rng = np.random.RandomState(0) X, y = datasets.make_classification(n_samples=700, random_state=rng) @@ -54,6 +47,10 @@ random_state=rng) rsh.fit(X, y) +# %% +# We can now use the `cv_results_` attribute of the search estimator to inspect +# and plot the evolution of the search. + results = pd.DataFrame(rsh.cv_results_) results['params_str'] = results.params.apply(str) results.drop_duplicates(subset=('params_str', 'iter'), inplace=True) @@ -72,3 +69,19 @@ ax.set_ylabel('mean test score', fontsize=15) ax.set_xlabel('iterations', fontsize=15) plt.show() + +# %% +# Number of candidates and amount of resource at each iteration +# ------------------------------------------------------------- +# +# At the first iteration, a small amount of resources is used. The resource +# here is the number of samples that the estimators are trained on. All +# candidates are evaluated. +# +# At the second iteration, only the best half of the candidates is evaluated. +# The number of allocated resources is doubled: candidates are evaluated on +# twice as many samples. +# +# This process is repeated until the last iteration, where only 2 candidates +# are left. The best candidate is the candidate that has the best score at the +# last iteration. From 762c889d99b27b71d099b7fb18324a7b075229f5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Jul 2020 14:00:51 -0400 Subject: [PATCH 51/89] minor changes --- .../plot_successive_halving_heatmap.py | 4 +- .../tests/test_successive_halving.py | 50 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index 32bc010ae3599..280a226fbd273 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -105,9 +105,9 @@ def make_heatmap(ax, gs, show_iter=False, make_cbar=False): make_heatmap(ax1, gsh, show_iter=True) make_heatmap(ax2, gs, make_cbar=True) -ax1.set_title('Successive Halving (time = {:.3f}s)'.format(gsh_time), +ax1.set_title('Successive Halving\ntime = {:.3f}s'.format(gsh_time), fontsize=15) -ax2.set_title('GridSearch (time = {:.3f}s)'.format(gs_time), fontsize=15) +ax2.set_title('GridSearch\ntime = {:.3f}s'.format(gs_time), fontsize=15) plt.show() diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 791ee681d0b58..bfa8344dc42e6 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -25,7 +25,7 @@ def get_params(self, deep=False): return params -@pytest.mark.parametrize('klass', (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) @pytest.mark.parametrize( ('aggressive_elimination,' 'max_resources,' @@ -46,7 +46,7 @@ def get_params(self, deep=False): ] ) def test_aggressive_elimination( - klass, aggressive_elimination, max_resources, expected_n_iterations, + Est, aggressive_elimination, max_resources, expected_n_iterations, expected_n_required_iterations, expected_n_possible_iterations, expected_n_remaining_candidates, expected_r_i_list): # Test the aggressive_elimination parameter. 
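The resource schedules exercised by ``test_aggressive_elimination`` can also be pictured with a small standalone sketch. It only mirrors the behaviour described in the docstrings (replay the smallest budget until enough candidates have been eliminated) and is an approximation for illustration, not the code of this patch::

    def resource_schedule(n_candidates, min_resources, max_resources, ratio,
                          aggressive_elimination=False):
        # Iterations needed to reduce the candidates to at most `ratio`.
        n_required, remaining = 1, n_candidates
        while remaining > ratio:
            remaining //= ratio
            n_required += 1
        # Iterations possible before the per-candidate budget would exceed
        # max_resources.
        n_possible, budget = 1, min_resources
        while budget * ratio <= max_resources:
            budget *= ratio
            n_possible += 1
        if aggressive_elimination:
            n_iterations = n_required
            # Replay the smallest budget for the first `shift` iterations.
            shift = max(0, n_required - n_possible)
        else:
            n_iterations = min(n_required, n_possible)
            shift = 0
        return [min_resources * ratio ** max(0, it - shift)
                for it in range(n_iterations)]

    # 60 candidates, budgets starting at 20 and capped at 180, ratio=3:
    print(resource_schedule(60, 20, 180, 3, aggressive_elimination=True))
    # -> [20, 20, 60, 180]: the last iteration evaluates at most `ratio`
    # candidates, at the cost of two extra passes on the smallest budget.
    print(resource_schedule(60, 20, 180, 3, aggressive_elimination=False))
    # -> [20, 60, 180]: one fewer iteration, but more than `ratio` candidates
    # are still in the race at the end.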
@@ -61,13 +61,13 @@ def test_aggressive_elimination( else: max_resources = n_samples - sh = klass(base_estimator, parameters, + sh = Est(base_estimator, parameters, aggressive_elimination=aggressive_elimination, max_resources=max_resources, ratio=3, force_exhaust_resources=False, verbose=True) # just for test coverage - if klass is HalvingRandomSearchCV: + if Est is HalvingRandomSearchCV: sh.set_params(n_candidates=2 * 30) # same number as with the grid sh.fit(X, y) @@ -79,7 +79,7 @@ def test_aggressive_elimination( assert sh.n_remaining_candidates_ == expected_n_remaining_candidates -@pytest.mark.parametrize('klass', (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) @pytest.mark.parametrize( ('min_resources,' 'max_resources,' @@ -96,7 +96,7 @@ def test_aggressive_elimination( ] ) def test_force_exhaust_resources_false( - klass, min_resources, max_resources, expected_n_iterations, + Est, min_resources, max_resources, expected_n_iterations, expected_n_required_iterations, expected_n_possible_iterations, expected_r_i_list): # Test the force_exhaust_resources parameter when it's false or ignored. @@ -107,10 +107,10 @@ def test_force_exhaust_resources_false( parameters = {'a': [1, 2], 'b': [1, 2, 3]} base_estimator = FastClassifier() - sh = klass(base_estimator, parameters, force_exhaust_resources=False, + sh = Est(base_estimator, parameters, force_exhaust_resources=False, ratio=3, min_resources=min_resources, max_resources=max_resources) - if klass is HalvingRandomSearchCV: + if Est is HalvingRandomSearchCV: sh.set_params(n_candidates=6) # same number as with the grid sh.fit(X, y) @@ -120,7 +120,7 @@ def test_force_exhaust_resources_false( assert sh._r_i_list == expected_r_i_list -@pytest.mark.parametrize('klass', (HalvingRandomSearchCV, HalvingGridSearchCV)) +@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) @pytest.mark.parametrize('max_resources, r_i_list', [ ('auto', [333, 999]), (1000, [333, 999]), @@ -132,7 +132,7 @@ def test_force_exhaust_resources_false( (50, [20]), (20, [20]), ]) -def test_force_exhaust_resources_true(klass, max_resources, r_i_list): +def test_force_exhaust_resources_true(Est, max_resources, r_i_list): # Test the force_exhaust_resources parameter when it's true # in this case we need to change min_resources so that the last iteration # uses as much resources as possible @@ -142,9 +142,9 @@ def test_force_exhaust_resources_true(klass, max_resources, r_i_list): parameters = {'a': [1, 2], 'b': [1, 2, 3]} base_estimator = FastClassifier() - sh = klass(base_estimator, parameters, force_exhaust_resources=True, + sh = Est(base_estimator, parameters, force_exhaust_resources=True, ratio=3, max_resources=max_resources) - if klass is HalvingRandomSearchCV: + if Est is HalvingRandomSearchCV: sh.set_params(n_candidates=6) # same as for HalvingGridSearchCV sh.fit(X, y) @@ -152,7 +152,7 @@ def test_force_exhaust_resources_true(klass, max_resources, r_i_list): assert sh._r_i_list == r_i_list -@pytest.mark.parametrize('klass', (HalvingRandomSearchCV, HalvingGridSearchCV)) +@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) @pytest.mark.parametrize( 'max_resources, n_iterations, n_possible_iterations', [ ('auto', 5, 9), # all resources are used @@ -166,7 +166,7 @@ def test_force_exhaust_resources_true(klass, max_resources, r_i_list): (4, 1, 1), # max_resources == min_resources, only one iteration is # possible ]) -def test_n_iterations(klass, max_resources, 
n_iterations, +def test_n_iterations(Est, max_resources, n_iterations, n_possible_iterations): # test the number of actual iterations that were run depending on # max_resources @@ -177,10 +177,10 @@ def test_n_iterations(klass, max_resources, n_iterations, base_estimator = FastClassifier() ratio = 2 - sh = klass(base_estimator, parameters, cv=2, ratio=ratio, + sh = Est(base_estimator, parameters, cv=2, ratio=ratio, max_resources=max_resources, min_resources=4, force_exhaust_resources=False) - if klass is HalvingRandomSearchCV: + if Est is HalvingRandomSearchCV: sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV sh.fit(X, y) assert sh.n_required_iterations_ == 5 @@ -188,15 +188,15 @@ def test_n_iterations(klass, max_resources, n_iterations, assert sh.n_possible_iterations_ == n_possible_iterations -@pytest.mark.parametrize('klass', (HalvingRandomSearchCV, HalvingGridSearchCV)) -def test_resource_parameter(klass): +@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_resource_parameter(Est): # Test the resource parameter n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() - sh = klass(base_estimator, parameters, cv=2, resource='c', + sh = Est(base_estimator, parameters, cv=2, resource='c', max_resources=10, ratio=3) sh.fit(X, y) assert set(sh._r_i_list) == set([1, 3, 9]) @@ -251,11 +251,11 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates_): assert sh._r_i_list[-1] == max_resources -@pytest.mark.parametrize('klass', (HalvingRandomSearchCV, HalvingGridSearchCV)) -def test_groups_not_supported(klass): +@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_groups_not_supported(Est): base_estimator = FastClassifier() param_grid = {'a': [1]} - sh = klass(base_estimator, param_grid) + sh = Est(base_estimator, param_grid) X, y = make_classification(n_samples=10) groups = [0] * 10 @@ -263,7 +263,7 @@ def test_groups_not_supported(klass): sh.fit(X, y, groups) -@pytest.mark.parametrize('klass', (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) @pytest.mark.parametrize('params, expected_error_message', [ ({'scoring': {'accuracy', 'accuracy'}}, 'Multimetric scoring is not supported'), @@ -291,12 +291,12 @@ def test_groups_not_supported(klass): 'force_exhaust_resources': False}, "min_resources_=15 is greater than max_resources_=14"), ]) -def test_input_errors(klass, params, expected_error_message): +def test_input_errors(Est, params, expected_error_message): base_estimator = FastClassifier() param_grid = {'a': [1]} X, y = make_classification(100) - sh = klass(base_estimator, param_grid, **params) + sh = Est(base_estimator, param_grid, **params) with pytest.raises(ValueError, match=expected_error_message): sh.fit(X, y) From f218a9c02611734136c4bdcb86a7d0b91b5ecd41 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Jul 2020 17:19:24 -0400 Subject: [PATCH 52/89] remove force_exhaust_budget and introduce min_resources=exhaust --- doc/modules/grid_search.rst | 48 +++--- .../plot_successive_halving_heatmap.py | 1 - .../_search_successive_halving.py | 148 ++++++++---------- .../tests/test_successive_halving.py | 132 ++++++++-------- 4 files changed, 153 insertions(+), 176 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 3cad4412d13c6..06eb9a3dc39c9 100644 --- 
a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -201,14 +201,14 @@ The ``ratio`` parameter controls the rate at which the resources grow, and the rate at which the number of candidates decreases. In each iteration, the number of resources per candidate is multiplied by ``ratio`` and the number of candidates is divided by the same ratio. Along with ``resource`` and -``max_resources``, ``ratio`` is the most important parameter to control the +``min_resources``, ``ratio`` is the most important parameter to control the search in our implementation. ``ratio`` effectively controls the number of iterations in :class:`HalvingGridSearchCV` and the number of candidates (if 'auto') and iterations in :class:`HalvingRandomSearchCV`. -``aggressive_elimination=True`` can also be used if the number of resources -is small but each evaluation on a large number of resources is expensive. -More control is available through tuning the ``min_resources`` parameter. -Each parameter and their interactions are described in more details below. +``aggressive_elimination=True`` can also be used if the number of available +resources is small. More control is available through tuning the +``min_resources`` parameter. Each parameter and their interactions are +described in more details below. .. topic:: Examples: @@ -277,7 +277,7 @@ greater than 1):: resource_iter = ratio**i * min_resources, where ``min_resources`` is the amount of resources used at the first -iteration. ``ratio (> 1)`` also defines the proportions of candidates that +iteration. ``ratio`` also defines the proportions of candidates that will be selected for the next iteration:: n_candidates_iter = n_candidates // (ratio ** i) @@ -321,7 +321,7 @@ We can note that: is not necessary to run an additional iteration, since it would only evaluate one candidate (namely the best one, which we have already identified). For this reason, **in general, we want the last iteration to - run at most `ratio` candidates**. + run at most ``ratio`` candidates**. - each ``resource_iter`` is a multiple of both ``ratio`` and ``min_resources`` (which is confirmed by its definition above). @@ -376,8 +376,7 @@ resources, some of them might be wasted (i.e. not used):: >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, min_resources=20, - ... force_exhaust_resources=False).fit(X, y) + ... ratio=2, min_resources=20).fit(X, y) >>> results = pd.DataFrame(sh.cv_results_) >>> results.groupby('iter')['resource_iter'].unique() iter @@ -390,15 +389,13 @@ The search process will only use 80 resources at most, while our maximum amount of available resources is ``n_samples=1000``. Here, we have ``min_resources = r_0 = 20``. -By default, the `force_exhaust_resources` parameter is True and the -`min_resources` parameter is 'auto'. This means that `min_resources` is -automatically set such that the last iteration can use as many resources as -possible, within the `max_resources` limit:: +For :class:`HalvingGridSearchCV`, by default, the `min_resources` parameter +is set to 'exhaust'. This means that `min_resources` is automatically set +such that the last iteration can use as many resources as possible, within +the `max_resources` limit:: >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, min_resources='auto', - ... force_exhaust_resources=True - ... ).fit(X, y) + ... 
ratio=2, min_resources='exhaust').fit(X, y) >>> results = pd.DataFrame.from_dict(sh.cv_results_) >>> results.groupby('iter')['resource_iter'].unique() iter 0 [250] 1 [500] 2 [1000] Name: resource_iter, dtype: object `min_resources` was here automatically set to 250, which results in the last -iteration using all the resources. In general, this leads to a better final -candidate parameter, and is slightly more time-intensive. +iteration using all the resources. The exact value that is used depends on +the number of candidate parameters, on `max_resources` and on `ratio`. + +For :class:`HalvingRandomSearchCV`, exhausting the resources can be done in 2 +ways: + +- by setting `min_resources='exhaust'`, just like for + :class:`HalvingGridSearchCV`; +- by setting `n_candidates='exhaust'`. -Since ``force_exhaust_resources`` chooses an appropriate ``min_resources`` to -start with, ``min_resources`` must be set to 'auto' (which is the default). +Both options are mutually exclusive: using `min_resources='exhaust'` requires +knowing the number of candidates, and symmetrically `n_candidates='exhaust'` +requires knowing `min_resources`. + +In general, exhausting the total number of resources leads to a better final +candidate parameter, and is slightly more time-intensive. .. _aggressive_elimination: diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index 280a226fbd273..c46fdcc6f3197 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -42,7 +42,6 @@ param_grid=param_grid, resource='n_samples', max_resources='auto', # max_resources=n_samples - force_exhaust_resources=True, cv=5, ratio=2, random_state=rng) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 2c6a7317f54eb..bf474f37dc993 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -39,9 +39,8 @@ def __init__(self, estimator, *, scoring=None, n_jobs=None, refit=True, cv=5, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=True, - max_resources='auto', min_resources='auto', - resource='n_samples', ratio=3, aggressive_elimination=False, - force_exhaust_resources=False): + max_resources='auto', min_resources='exhaust', + resource='n_samples', ratio=3, aggressive_elimination=False): refit = _refit_callable if refit else False super().__init__(estimator, scoring=scoring, @@ -56,7 +55,6 @@ def __init__(self, estimator, *, scoring=None, self.ratio = ratio self.min_resources = min_resources self.aggressive_elimination = aggressive_elimination - self.force_exhaust_resources = force_exhaust_resources def _check_input_parameters(self, X, y, groups): @@ -88,28 +86,27 @@ def _check_input_parameters(self, X, y, groups): "max_resources must be either 'auto' or a positive integer" ) - if (isinstance(self.min_resources, str) and - self.min_resources != 'auto'): - raise ValueError( - "min_resources must be either 'auto' or a positive integer " - "no greater than max_resources."
- ) - if self.min_resources != 'auto' and ( + if self.min_resources not in ('smallest', 'exhaust') and ( not isinstance(self.min_resources, Integral) or self.min_resources <= 0): raise ValueError( - "min_resources must be either 'auto' or a positive integer " + "min_resources must be either 'smallest', 'exhaust', " + "or a positive integer " "no greater than max_resources." ) - if self.force_exhaust_resources and self.min_resources != 'auto': + if (isinstance(self, HalvingRandomSearchCV) and + self.min_resources == self.n_candidates == 'exhaust'): + # for n_candidates=exhaust to work, we need to know what + # min_resources is. Similarly min_resources=exhaust needs to know + # the actual number of candidates. raise ValueError( - 'min_resources must be set to auto if force_exhaust_resources' - ' is True.' + "n_candidates and min_resources cannot be both set to " + "'exhaust'." ) self.min_resources_ = self.min_resources - if self.min_resources_ == 'auto': + if self.min_resources_ in ('smallest', 'exhaust'): if self.resource == 'n_samples': cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) @@ -123,6 +120,8 @@ def _check_input_parameters(self, X, y, groups): self.min_resources_ *= n_classes else: self.min_resources_ = 1 + # if 'exhaust', min_resources_ might be set to a higher value later + # in _run_search self.max_resources_ = self.max_resources if self.max_resources_ == 'auto': @@ -189,12 +188,10 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): n_required_iterations = 1 + floor(log(len(candidate_params), self.ratio)) - if self.force_exhaust_resources: + if self.min_resources == 'exhaust': # To exhaust the resources, we want to start with the biggest # min_resources possible so that the last (required) iteration # uses as many resources as possible - # We only force exhausting the resources if min_resources wasn't - # specified by the user. last_iteration = n_required_iterations - 1 self.min_resources_ = max( self.min_resources_, @@ -221,7 +218,6 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): print(f'min_resources_: {self.min_resources_}') print(f'max_resources_: {self.max_resources_}') print(f'aggressive_elimination: {self.aggressive_elimination}') - print(f'force_exhaust_resources: {self.force_exhaust_resources}') print(f'ratio: {self.ratio}') # list of resource_iter for each iteration, used in tests @@ -403,19 +399,24 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): for a given iteration. By default, this is set to ``n_samples`` when ``resource='n_samples'`` (default), else an error is raised. - min_resources : int, default='auto' - The minimum amount of resource that any candidate is allowed to use for - a given iteration. Equivalently, this defines the amount of resources - that are allocated for each candidate at the first iteration. By - default, this is set to the highest possible value - satisfying the constraint `force_exhaust_resources=True` (which is - the default). Otherwise this is set to: + min_resources : {'exhaust', 'smallest'} or int, default='exhaust' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. 
- - ``n_splits * 2`` when ``resource='n_samples'`` for a regression - problem - - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a - regression problem - - ``1`` when ``resource != 'n_samples'`` + - 'smallest' is a heuristic that sets `r0` to a small value: + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression + problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + classification problem + - ``1`` when ``resource != 'n_samples'`` + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``ratio``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. Note that the amount of resources used at each iteration is always a multiple of ``min_resources``. @@ -442,17 +443,6 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): evaluate more than ``ratio`` candidates. See :ref:`aggressive_elimination` for more details. - force_exhaust_resources : bool, default=True - When True, ``min_resources`` (which must be 'auto') is set to a - specific value such that the last iteration uses as much resources as - possible. Namely, the last iteration uses the highest value smaller - than ``max_resources`` that is a multiple of both ``min_resources`` - and ``ratio``. When False, the last iteration may not exhaust the - total number of resources, since the first iteration will rely on the - value passed as the `min_resources` parameter. In general, - `force_exhaust_resources=True` leads to a more accurate estimator, - but is slightly more time consuming. - Attributes ---------- n_candidates_ : list of int @@ -543,7 +533,6 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): ... "min_samples_split": [5, 10]} >>> search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators', ... max_resources=10, - ... force_exhaust_resources=True, ... random_state=0).fit(X, y) >>> search.best_params_ # doctest: +SKIP {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} @@ -572,9 +561,8 @@ def __init__(self, estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, verbose=0, cv=5, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=True, - max_resources='auto', min_resources='auto', - resource='n_samples', ratio=3, aggressive_elimination=False, - force_exhaust_resources=True): + max_resources='auto', min_resources='exhaust', + resource='n_samples', ratio=3, aggressive_elimination=False): super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, pre_dispatch=pre_dispatch, @@ -582,8 +570,7 @@ def __init__(self, estimator, param_grid, *, scoring=None, return_train_score=return_train_score, max_resources=max_resources, resource=resource, ratio=ratio, min_resources=min_resources, - aggressive_elimination=aggressive_elimination, - force_exhaust_resources=force_exhaust_resources) + aggressive_elimination=aggressive_elimination) self.param_grid = param_grid _check_param_grid(self.param_grid) @@ -616,11 +603,12 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. - n_candidates : int, default='auto' + n_candidates : int, default='exhaust' The number of candidate parameters to sample, at the first - iteration.
By default this will sample enough candidates so that the - last iteration uses as many resources as possible. Note that - ``force_exhaust_resources`` has no effect in this case. + iteration. Using 'exhaust' will sample enough candidates so that the + last iteration uses as many resources as possible, based on + `min_resources`, `max_resources` and `ratio`. In this case, + `min_resources` cannot be 'exhaust'. scoring : string, callable, or None, default=None A single string (see :ref:`scoring_parameter`) or a callable @@ -696,19 +684,24 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): for a given iteration. By default, this is set ``n_samples`` when ``resource='n_samples'`` (default), else an error is raised. - min_resources : int, default='auto' - The minimum amount of resource that any candidate is allowed to use for - a given iteration. Equivalently, this defines the amount of resources - that are allocated for each candidate at the first iteration. By - default, this is set to the highest possible value - satisfying the constraint `force_exhaust_resources=True` (which is - the default). Otherwise this is set to: + min_resources : {'exhaust', 'smallest'} or int, default='smallest' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. - - ``n_splits * 2`` when ``resource='n_samples'`` for a regression - problem - - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a - regression problem - - ``1`` when ``resource!='n_samples'`` + - 'smallest' is a heuristic that sets `r0` to a small value: + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression + problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + classification problem + - ``1`` when ``resource != 'n_samples'`` + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``ratio``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. 'exhaust' isn't available when `n_candidates='exhaust'`. Note that the amount of resources used at each iteration is always a multiple of ``min_resources``. @@ -735,17 +728,6 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): evaluate more than ``ratio`` candidates. See :ref:`aggressive_elimination` for more details. - force_exhaust_resources : bool, default=True - When True, ``min_resources`` (which must be 'auto') is set to a - specific value such that the last iteration uses as much resources as - possible. Namely, the last iteration uses the highest value smaller - than ``max_resources`` that is a multiple of both ``min_resources`` - and ``ratio``. When False, the last iteration may not exhaust the - total number of resources, since the first iteration will rely on the - value passed as the `min_resources` parameter. In general, - `force_exhaust_resources=True` leads to a more accurate estimator, - but is slightly more time consuming. - Attributes ---------- n_candidates_ : list of int @@ -839,7 +821,6 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): >>> search = HalvingRandomSearchCV(clf, param_distributions, ... resource='n_estimators', ... max_resources=10, - ... force_exhaust_resources=True, ...
random_state=0).fit(X, y) >>> search.best_params_ # doctest: +SKIP {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} @@ -865,26 +846,25 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): _required_parameters = ["estimator", "param_distributions"] def __init__(self, estimator, param_distributions, *, - n_candidates='auto', scoring=None, n_jobs=None, refit=True, - verbose=0, cv=5, pre_dispatch='2*n_jobs', + n_candidates='exhaust', scoring=None, n_jobs=None, + refit=True, verbose=0, cv=5, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=True, max_resources='auto', - min_resources='auto', resource='n_samples', ratio=3, - aggressive_elimination=False, force_exhaust_resources=True): + min_resources='smallest', resource='n_samples', ratio=3, + aggressive_elimination=False): super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, random_state=random_state, error_score=error_score, return_train_score=return_train_score, max_resources=max_resources, resource=resource, ratio=ratio, min_resources=min_resources, - aggressive_elimination=aggressive_elimination, - force_exhaust_resources=force_exhaust_resources) + aggressive_elimination=aggressive_elimination) self.param_distributions = param_distributions self.n_candidates = n_candidates def _generate_candidate_params(self): n_candidates_first_iter = self.n_candidates - if n_candidates_first_iter == 'auto': + if n_candidates_first_iter == 'exhaust': # This will generate enough candidate so that the last iteration # uses as much resources as possible n_candidates_first_iter = ( diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index bfa8344dc42e6..aee6f8684f4c8 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -35,14 +35,15 @@ def get_params(self, deep=False): 'expected_n_remaining_candidates,' 'expected_r_i_list,'), [ # notice how it loops at the beginning - (True, 'small', 4, 4, 3, 1, [20, 20, 60, 180]), + (True, 'limited', 4, 4, 3, 1, [20, 20, 60, 180]), # no aggressive elimination: we end up with less iterations and more # candidates at the end - (False, 'small', 3, 4, 3, 3, [20, 60, 180]), + (False, 'limited', 3, 4, 3, 3, [20, 60, 180]), # When the amount of resource isn't limited, aggressive_elimination - # doesn't matter. - (True, 'high', 4, 4, 4, 1, [20, 60, 180, 540]), - (False, 'high', 4, 4, 4, 1, [20, 60, 180, 540]), + # has no effect. Here the default min_resources='exhaust' will take + # over. 
+ (True, 'unlimited', 4, 4, 4, 1, [37, 111, 333, 999]), + (False, 'unlimited', 4, 4, 4, 1, [37, 111, 333, 999]), ] ) def test_aggressive_elimination( @@ -56,19 +57,19 @@ def test_aggressive_elimination( parameters = {'a': ('l1', 'l2'), 'b': list(range(30))} base_estimator = FastClassifier() - if max_resources == 'small': + if max_resources == 'limited': max_resources = 180 else: max_resources = n_samples sh = Est(base_estimator, parameters, - aggressive_elimination=aggressive_elimination, - max_resources=max_resources, ratio=3, - force_exhaust_resources=False, - verbose=True) # just for test coverage + aggressive_elimination=aggressive_elimination, + max_resources=max_resources, ratio=3, + verbose=True) # just for test coverage if Est is HalvingRandomSearchCV: - sh.set_params(n_candidates=2 * 30) # same number as with the grid + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources='exhaust') sh.fit(X, y) @@ -84,72 +85,52 @@ def test_aggressive_elimination( ('min_resources,' 'max_resources,' 'expected_n_iterations,' - 'expected_n_required_iterations,' 'expected_n_possible_iterations,' 'expected_r_i_list,'), [ # with enough resources - ('auto', 'auto', 2, 2, 4, [20, 60]), - # with enough resources but min_resources!='auto': ignored - (50, 'auto', 2, 2, 3, [50, 150]), - # without enough resources (resources are exhausted anyway) - ('auto', 30, 1, 2, 1, [20]), + ('smallest', 'auto', 2, 4, [20, 60]), + # with enough resources but min_resources set manually + (50, 'auto', 2, 3, [50, 150]), + # without enough resources, only one iteration can be done + ('smallest', 30, 1, 1, [20]), + # with exhaust: use as much resources as possible at the last iter + ('exhaust', 'auto', 2, 2, [333, 999]), + ('exhaust', 1000, 2, 2, [333, 999]), + ('exhaust', 999, 2, 2, [333, 999]), + ('exhaust', 600, 2, 2, [200, 600]), + ('exhaust', 599, 2, 2, [199, 597]), + ('exhaust', 300, 2, 2, [100, 300]), + ('exhaust', 60, 2, 2, [20, 60]), + ('exhaust', 50, 1, 1, [20]), + ('exhaust', 20, 1, 1, [20]), ] ) -def test_force_exhaust_resources_false( +def test_min_max_resources( Est, min_resources, max_resources, expected_n_iterations, - expected_n_required_iterations, expected_n_possible_iterations, + expected_n_possible_iterations, expected_r_i_list): - # Test the force_exhaust_resources parameter when it's false or ignored. 
- # We start at the beginning no matter what since we do not overwrite - # min_resources_ + # Test the min_resources and max_resources parameters, and how they affect + # the number of resources used at each iteration n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) parameters = {'a': [1, 2], 'b': [1, 2, 3]} base_estimator = FastClassifier() - sh = Est(base_estimator, parameters, force_exhaust_resources=False, - ratio=3, min_resources=min_resources, - max_resources=max_resources) + sh = Est(base_estimator, parameters, ratio=3, min_resources=min_resources, + max_resources=max_resources) if Est is HalvingRandomSearchCV: sh.set_params(n_candidates=6) # same number as with the grid sh.fit(X, y) + + expected_n_required_iterations = 2 # given 6 combinations and ratio = 3 assert sh.n_iterations_ == expected_n_iterations assert sh.n_required_iterations_ == expected_n_required_iterations assert sh.n_possible_iterations_ == expected_n_possible_iterations assert sh._r_i_list == expected_r_i_list - - -@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) -@pytest.mark.parametrize('max_resources, r_i_list', [ - ('auto', [333, 999]), - (1000, [333, 999]), - (999, [333, 999]), - (600, [200, 600]), - (599, [199, 597]), - (300, [100, 300]), - (60, [20, 60]), - (50, [20]), - (20, [20]), -]) -def test_force_exhaust_resources_true(Est, max_resources, r_i_list): - # Test the force_exhaust_resources parameter when it's true - # in this case we need to change min_resources so that the last iteration - # uses as much resources as possible - - n_samples = 1000 - X, y = make_classification(n_samples=n_samples, random_state=0) - parameters = {'a': [1, 2], 'b': [1, 2, 3]} - base_estimator = FastClassifier() - - sh = Est(base_estimator, parameters, force_exhaust_resources=True, - ratio=3, max_resources=max_resources) - if Est is HalvingRandomSearchCV: - sh.set_params(n_candidates=6) # same as for HalvingGridSearchCV - sh.fit(X, y) - - assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list) - assert sh._r_i_list == r_i_list + if min_resources == 'exhaust': + assert (sh.n_possible_iterations_ == sh.n_iterations_ == + len(sh._r_i_list)) @pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) @@ -178,8 +159,7 @@ def test_n_iterations(Est, max_resources, n_iterations, ratio = 2 sh = Est(base_estimator, parameters, cv=2, ratio=ratio, - max_resources=max_resources, min_resources=4, - force_exhaust_resources=False) + max_resources=max_resources, min_resources=4) if Est is HalvingRandomSearchCV: sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV sh.fit(X, y) @@ -197,7 +177,7 @@ def test_resource_parameter(Est): parameters = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() sh = Est(base_estimator, parameters, cv=2, resource='c', - max_resources=10, ratio=3) + max_resources=10, ratio=3) sh.fit(X, y) assert set(sh._r_i_list) == set([1, 3, 9]) for r_i, params, param_c in zip(sh.cv_results_['resource_iter'], @@ -224,8 +204,8 @@ def test_resource_parameter(Est): @pytest.mark.parametrize( 'max_resources, n_candidates, expected_n_candidates_', [ - (512, 'auto', 128), # generate exactly as much as needed - (32, 'auto', 8), + (512, 'exhaust', 128), # generate exactly as much as needed + (32, 'exhaust', 8), (32, 8, 8), (32, 7, 7), # ask for less than what we could (32, 9, 9), # ask for more than 'reasonable' @@ -241,12 +221,11 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates_): sh 
= HalvingRandomSearchCV(base_estimator, parameters, n_candidates=n_candidates, cv=2, max_resources=max_resources, ratio=2, - min_resources=4, - force_exhaust_resources=False) + min_resources=4) sh.fit(X, y) assert sh.n_candidates_[0] == expected_n_candidates_ - if n_candidates == 'auto': - # Make sure 'auto' makes the last iteration use as much resources as + if n_candidates == 'exhaust': + # Make sure 'exhaust' makes the last iteration use as much resources as # we can assert sh._r_i_list[-1] == max_resources @@ -277,18 +256,15 @@ def test_groups_not_supported(Est): 'max_resources must be either'), ({'max_resources': -10}, 'max_resources must be either'), - ({'min_resources': 'not_auto'}, + ({'min_resources': 'bad str'}, 'min_resources must be either'), ({'min_resources': 0.5}, 'min_resources must be either'), ({'min_resources': -10}, 'min_resources must be either'), - ({'force_exhaust_resources': True, 'min_resources': 5}, - 'min_resources must be set to auto if '), ({'max_resources': 'auto', 'resource': 'b'}, "max_resources can only be 'auto' if resource='n_samples'"), - ({'min_resources': 15, 'max_resources': 14, - 'force_exhaust_resources': False}, + ({'min_resources': 15, 'max_resources': 14}, "min_resources_=15 is greater than max_resources_=14"), ]) def test_input_errors(Est, params, expected_error_message): @@ -300,3 +276,17 @@ def test_input_errors(Est, params, expected_error_message): with pytest.raises(ValueError, match=expected_error_message): sh.fit(X, y) + + +def test_n_candidates_min_resources_exhaust(): + # Make sure n_candidates and min_resources cannot be both exhaust + + base_estimator = FastClassifier() + param_grid = {'a': [1]} + X, y = make_classification(100) + + sh = HalvingRandomSearchCV(base_estimator, param_grid, + n_candidates='exhaust', min_resources='exhaust') + + with pytest.raises(ValueError, match="cannot be both set to 'exhaust'"): + sh.fit(X, y) From c19f989993b2474ce532532f022ff9fc1c823bf0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Jul 2020 17:43:05 -0400 Subject: [PATCH 53/89] some minor validation --- .../plot_successive_halving_iterations.py | 2 +- .../_search_successive_halving.py | 25 ++++++++++++------- .../tests/test_successive_halving.py | 15 +++++++---- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index da4ce66f934ab..ae76df4ca6aca 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -41,7 +41,7 @@ param_distributions=param_dist, resource='n_samples', max_resources='auto', # max_resources=n_samples - n_candidates='auto', # choose n_cdts so that last iter exhausts resources + n_candidates='exhaust', cv=5, ratio=2, random_state=rng) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index bf474f37dc993..acef3384082d4 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -95,15 +95,22 @@ def _check_input_parameters(self, X, y, groups): "no greater than max_resources." ) - if (isinstance(self, HalvingRandomSearchCV) and - self.min_resources == self.n_candidates == 'exhaust'): - # for n_candidates=exhaust to work, we need to know what - # min_resources is. Similarly min_resources=exhaust needs to know - # the actual number of candidates. 
- raise ValueError( - "n_candidates and min_resources cannot be both set to " - "'exhaust'." - ) + if isinstance(self, HalvingRandomSearchCV): + if self.min_resources == self.n_candidates == 'exhaust': + # for n_candidates=exhaust to work, we need to know what + # min_resources is. Similarly min_resources=exhaust needs to + # know the actual number of candidates. + raise ValueError( + "n_candidates and min_resources cannot be both set to " + "'exhaust'." + ) + if self.n_candidates != 'exhaust' and ( + not isinstance(self.n_candidates, Integral) or + self.n_candidates<= 0): + raise ValueError( + "n_candidates must be either 'exhaust' " + "or a positive integer" + ) self.min_resources_ = self.min_resources if self.min_resources_ in ('smallest', 'exhaust'): diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index aee6f8684f4c8..64e9be6cf6e11 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -278,15 +278,20 @@ def test_input_errors(Est, params, expected_error_message): sh.fit(X, y) -def test_n_candidates_min_resources_exhaust(): - # Make sure n_candidates and min_resources cannot be both exhaust +@pytest.mark.parametrize('params, expected_error_message', [ + ({'n_candidates': 'exhaust', 'min_resources': 'exhaust'}, + "cannot be both set to 'exhaust'"), + ({'n_candidates': 'bad'}, "either 'exhaust' or a positive integer"), + ({'n_candidates': 0}, "either 'exhaust' or a positive integer"), +]) +def test_input_errors_randomized(params, expected_error_message): + # tests specific to HalvingRandomSearchCV base_estimator = FastClassifier() param_grid = {'a': [1]} X, y = make_classification(100) - sh = HalvingRandomSearchCV(base_estimator, param_grid, - n_candidates='exhaust', min_resources='exhaust') + sh = HalvingRandomSearchCV(base_estimator, param_grid, **params) - with pytest.raises(ValueError, match="cannot be both set to 'exhaust'"): + with pytest.raises(ValueError, match=expected_error_message): sh.fit(X, y) From a49acc370b6fe67e50a19493bc31dc56e00189e7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Jul 2020 17:57:20 -0400 Subject: [PATCH 54/89] Added a n_resources_ attribute --- doc/modules/grid_search.rst | 56 +++++-------------- .../plot_successive_halving_iterations.py | 3 +- .../_search_successive_halving.py | 13 +++-- .../tests/test_successive_halving.py | 25 ++++----- 4 files changed, 36 insertions(+), 61 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 06eb9a3dc39c9..4d71d7059f7f8 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -325,10 +325,8 @@ We can note that: - each ``resource_iter`` is a multiple of both ``ratio`` and ``min_resources`` (which is confirmed by its definition above). -The amount of resources that is used at each iteration can be found using -the `cv_results_` attribute after converting it to a dataframe: -`results.groupby('iter')['resource_iter'].unique()`, as done e.g. in -:ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_iterations.py` +The amount of resources that is used at each iteration can be found in the +`n_resources_` attribute. Choosing a resource ------------------- @@ -377,13 +375,8 @@ resources, some of them might be wasted (i.e. not used):: >>> X, y = make_classification(n_samples=1000) >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... 
ratio=2, min_resources=20).fit(X, y) - >>> results = pd.DataFrame(sh.cv_results_) - >>> results.groupby('iter')['resource_iter'].unique() - iter - 0 [20] - 1 [40] - 2 [80] - Name: resource_iter, dtype: object + >>> sh.n_resources_ + [20, 40, 80] The search process will only use 80 resources at most, while our maximum amount of available resources is ``n_samples=1000``. Here, we have @@ -396,13 +389,8 @@ the `max_resources` limit:: >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, min_resources='exhaust').fit(X, y) - >>> results = pd.DataFrame.from_dict(sh.cv_results_) - >>> results.groupby('iter')['resource_iter'].unique() - iter - 0 [250] - 1 [500] - 2 [1000] - Name: resource_iter, dtype: object + >>> sh.n_resources_ + [250, 500, 1000] `min_resources` was here automatically set to 250, which results in the last iteration using all the resources. The exact value that is used depends on @@ -446,17 +434,10 @@ more than ``ratio`` candidates:: >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, max_resources=40, ... aggressive_elimination=False).fit(X, y) - >>> results = pd.DataFrame.from_dict(sh.cv_results_) - >>> results.groupby('iter').resource_iter.unique() - iter - 0 [20] - 1 [40] - Name: resource_iter, dtype: object - >>> results.groupby('iter').resource_iter.count() # number of candidates used at each iteration - iter - 0 6 - 1 3 - Name: resource_iter, dtype: int64 + >>> sh.n_resources_ + [20, 40] + >>> sh.n_candidates_ + [6, 3] Since we cannot use more than ``max_resources=40`` resources, the process has to stop at the second iteration which evaluates more than ``ratio=2`` @@ -472,19 +453,10 @@ necessary using ``min_resources`` resources:: ... max_resources=40, ... aggressive_elimination=True, ... 
).fit(X, y) - >>> results = pd.DataFrame.from_dict(sh.cv_results_) - >>> results.groupby('iter').resource_iter.unique() - iter - 0 [20] - 1 [20] - 2 [40] - Name: resource_iter, dtype: object - >>> results.groupby('iter').resource_iter.count() # number of candidates used at each iteration - iter - 0 6 - 1 3 - 2 2 - Name: resource_iter, dtype: int64 + >>> sh.n_resources_ + [20, 20, 40] + >>> sh.n_candidates_ + [6, 3, 2] Notice that we end with 2 candidates at the last iteration since we have eliminated enough candidates during the first iterations, using ``resource_iter = diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index ae76df4ca6aca..638eabc434131 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -58,9 +58,8 @@ values='mean_test_score') ax = mean_scores.plot(legend=False, alpha=.6) -r_i_list = results.groupby('iter')['resource_iter'].unique() labels = [ - f'iter={i}\nn_samples={r_i_list[i][0]}\n' + f'iter={i}\nn_samples={rsh.n_resources_[i]}\n' f'n_candidates={rsh.n_candidates_[i]}' for i in range(rsh.n_iterations_) ] diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index acef3384082d4..e4f3e6be9772b 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -106,7 +106,7 @@ def _check_input_parameters(self, X, y, groups): ) if self.n_candidates != 'exhaust' and ( not isinstance(self.n_candidates, Integral) or - self.n_candidates<= 0): + self.n_candidates <= 0): raise ValueError( "n_candidates must be either 'exhaust' " "or a positive integer" @@ -227,8 +227,7 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): print(f'aggressive_elimination: {self.aggressive_elimination}') print(f'ratio: {self.ratio}') - # list of resource_iter for each iteration, used in tests - self._r_i_list = [] + self.n_resources_ = [] self.n_candidates_ = [] for iter_i in range(n_iterations): @@ -247,7 +246,7 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): resource_iter = int(self.ratio**power * self.min_resources_) # guard, probably not needed resource_iter = min(resource_iter, self.max_resources_) - self._r_i_list.append(resource_iter) + self.n_resources_.append(resource_iter) n_candidates = len(candidate_params) self.n_candidates_.append(n_candidates) @@ -456,6 +455,9 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): The number of candidate parameters that were evaluated at each iteration. + n_resources_ : list of int + The amount of resources used at each iteration. + n_remaining_candidates_ : int The number of candidate parameters that are left after the last iteration. @@ -741,6 +743,9 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): The number of candidate parameters that were evaluated at each iteration. + n_resources_ : list of int + The amount of resources used at each iteration. + n_remaining_candidates_ : int The number of candidate parameters that are left after the last iteration. 
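As a usage sketch (illustrative only, not part of the diffs in this series), the `n_resources_` and `n_candidates_` attributes documented above are meant to be read directly on the fitted estimator. The snippet below assumes `HalvingGridSearchCV` is importable from `sklearn.model_selection` at this point in the series; the parameter grid is an assumption chosen to give six candidates, mirroring the documentation example, and the printed values are indicative::

    from sklearn.datasets import make_classification
    from sklearn.model_selection import HalvingGridSearchCV
    from sklearn.svm import SVC

    param_grid = {'kernel': ('linear', 'rbf'), 'C': [1, 10, 100]}  # 6 candidates
    base_estimator = SVC(gamma='scale')
    X, y = make_classification(n_samples=1000)

    sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5,
                             ratio=2, min_resources=20).fit(X, y)

    # Per-iteration bookkeeping is exposed directly on the estimator,
    # replacing the former cv_results_ idiom
    # results.groupby('iter')['resource_iter'].unique().
    print(sh.n_resources_)   # [20, 40, 80], as in the doc example above
    print(sh.n_candidates_)  # e.g. [6, 3, 2]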
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 64e9be6cf6e11..c285b9bdf1a7c 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -33,7 +33,7 @@ def get_params(self, deep=False): 'expected_n_required_iterations,' 'expected_n_possible_iterations,' 'expected_n_remaining_candidates,' - 'expected_r_i_list,'), [ + 'expected_n_resources,'), [ # notice how it loops at the beginning (True, 'limited', 4, 4, 3, 1, [20, 20, 60, 180]), # no aggressive elimination: we end up with less iterations and more @@ -49,7 +49,7 @@ def get_params(self, deep=False): def test_aggressive_elimination( Est, aggressive_elimination, max_resources, expected_n_iterations, expected_n_required_iterations, expected_n_possible_iterations, - expected_n_remaining_candidates, expected_r_i_list): + expected_n_remaining_candidates, expected_n_resources): # Test the aggressive_elimination parameter. n_samples = 1000 @@ -76,7 +76,7 @@ def test_aggressive_elimination( assert sh.n_iterations_ == expected_n_iterations assert sh.n_required_iterations_ == expected_n_required_iterations assert sh.n_possible_iterations_ == expected_n_possible_iterations - assert sh._r_i_list == expected_r_i_list + assert sh.n_resources_ == expected_n_resources assert sh.n_remaining_candidates_ == expected_n_remaining_candidates @@ -86,7 +86,7 @@ def test_aggressive_elimination( 'max_resources,' 'expected_n_iterations,' 'expected_n_possible_iterations,' - 'expected_r_i_list,'), [ + 'expected_n_resources,'), [ # with enough resources ('smallest', 'auto', 2, 4, [20, 60]), # with enough resources but min_resources set manually @@ -108,7 +108,7 @@ def test_aggressive_elimination( def test_min_max_resources( Est, min_resources, max_resources, expected_n_iterations, expected_n_possible_iterations, - expected_r_i_list): + expected_n_resources): # Test the min_resources and max_resources parameters, and how they affect # the number of resources used at each iteration n_samples = 1000 @@ -127,10 +127,10 @@ def test_min_max_resources( assert sh.n_iterations_ == expected_n_iterations assert sh.n_required_iterations_ == expected_n_required_iterations assert sh.n_possible_iterations_ == expected_n_possible_iterations - assert sh._r_i_list == expected_r_i_list + assert sh.n_resources_ == expected_n_resources if min_resources == 'exhaust': assert (sh.n_possible_iterations_ == sh.n_iterations_ == - len(sh._r_i_list)) + len(sh.n_resources_)) @pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) @@ -147,8 +147,7 @@ def test_min_max_resources( (4, 1, 1), # max_resources == min_resources, only one iteration is # possible ]) -def test_n_iterations(Est, max_resources, n_iterations, - n_possible_iterations): +def test_n_iterations(Est, max_resources, n_iterations, n_possible_iterations): # test the number of actual iterations that were run depending on # max_resources @@ -179,7 +178,7 @@ def test_resource_parameter(Est): sh = Est(base_estimator, parameters, cv=2, resource='c', max_resources=10, ratio=3) sh.fit(X, y) - assert set(sh._r_i_list) == set([1, 3, 9]) + assert set(sh.n_resources_) == set([1, 3, 9]) for r_i, params, param_c in zip(sh.cv_results_['resource_iter'], sh.cv_results_['params'], sh.cv_results_['param_c']): @@ -227,7 +226,7 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates_): if n_candidates == 'exhaust': # Make sure 'exhaust' makes the last 
iteration use as much resources as # we can - assert sh._r_i_list[-1] == max_resources + assert sh.n_resources_[-1] == max_resources @pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) @@ -281,8 +280,8 @@ def test_input_errors(Est, params, expected_error_message): @pytest.mark.parametrize('params, expected_error_message', [ ({'n_candidates': 'exhaust', 'min_resources': 'exhaust'}, "cannot be both set to 'exhaust'"), - ({'n_candidates': 'bad'}, "either 'exhaust' or a positive integer"), - ({'n_candidates': 0}, "either 'exhaust' or a positive integer"), + ({'n_candidates': 'bad'}, "either 'exhaust' or a positive integer"), + ({'n_candidates': 0}, "either 'exhaust' or a positive integer"), ]) def test_input_errors_randomized(params, expected_error_message): # tests specific to HalvingRandomSearchCV From 08dd96e49e8796152579bc21e2714181885bdff3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 30 Jul 2020 09:08:58 -0400 Subject: [PATCH 55/89] update examples --- .../plot_successive_halving_heatmap.py | 24 ++++++++++++------- .../plot_successive_halving_iterations.py | 1 + 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index c46fdcc6f3197..6f43ff367a6a2 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -60,13 +60,21 @@ # We now plot heatmaps for both search estimators. -def make_heatmap(ax, gs, show_iter=False, make_cbar=False): +def make_heatmap(ax, gs, is_sh=False, make_cbar=False): """Helper to make a heatmap.""" results = pd.DataFrame.from_dict(gs.cv_results_) results['params_str'] = results.params.apply(str) - # Take max but there's only one value anyway - scores = results.groupby(['param_gamma', 'param_C']).mean_test_score.max() - scores_matrix = scores.values.reshape(len(gammas), len(Cs)) + if is_sh: + # SH dataframe: get mean_test_score values for the highest iter + scores_matrix = ( + results.sort_values('iter').groupby(['param_gamma', 'param_C']) + .last()['mean_test_score'].unstack() + ) + else: + scores_matrix = ( + results.set_index(['param_gamma', 'param_C'])['mean_test_score'] + .unstack() + ) im = ax.imshow(scores_matrix) @@ -82,9 +90,9 @@ def make_heatmap(ax, gs, show_iter=False, make_cbar=False): plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") - if show_iter: - iterations = results.groupby(['param_gamma', 'param_C']).iter.max() - iterations_matrix = iterations.values.reshape(len(gammas), len(Cs)) + if is_sh: + iterations = results.groupby(['param_gamma', 'param_C'])['iter'].max() + iterations_matrix = iterations.unstack().values for i in range(len(gammas)): for j in range(len(Cs)): ax.text(j, i, iterations_matrix[i, j], @@ -101,7 +109,7 @@ def make_heatmap(ax, gs, show_iter=False, make_cbar=False): fig, axes = plt.subplots(ncols=2, sharey=True) ax1, ax2 = axes -make_heatmap(ax1, gsh, show_iter=True) +make_heatmap(ax1, gsh, is_sh=True) make_heatmap(ax2, gs, make_cbar=True) ax1.set_title('Successive Halving\ntime = {:.3f}s'.format(gsh_time), diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 638eabc434131..35d78206953e8 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -67,6 +67,7 @@ 
ax.set_title('Scores of candidates over iterations') ax.set_ylabel('mean test score', fontsize=15) ax.set_xlabel('iterations', fontsize=15) +plt.tight_layout() plt.show() # %% From c3ee547f5436cb1cd38c533e033ff6c4d7b3b429 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 2 Aug 2020 10:02:23 -0400 Subject: [PATCH 56/89] Addressed comments --- .../_search_successive_halving.py | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index e4f3e6be9772b..4e9f50c9f1bad 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -22,10 +22,9 @@ def _refit_callable(results): # would return the best candidate out of all iterations. last_iter = np.max(results['iter']) - sorted_indices = np.argsort(results['mean_test_score'])[::-1] - best_index = next(i for i in sorted_indices - if results['iter'][i] == last_iter) - return best_index + last_iter_indices = np.flatnonzero(results['iter'] == last_iter) + best_idx = np.argmax(results['mean_test_score'][last_iter_indices]) + return last_iter_indices[best_idx] class BaseSuccessiveHalving(BaseSearchCV): @@ -528,6 +527,24 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): This is present only if ``refit`` is not False. + See Also + -------- + :class:`HalvingRandomSearchCV`: + Random search over a set of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + If `n_jobs` was set to a value higher than one, the data is copied for each + parameter setting(and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + Examples -------- @@ -545,24 +562,6 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): ... random_state=0).fit(X, y) >>> search.best_params_ # doctest: +SKIP {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} - - Notes - ----- - The parameters selected are those that maximize the score of the held-out - data, according to the scoring parameter. - - If `n_jobs` was set to a value higher than one, the data is copied for each - parameter setting(and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - -------- - :class:`HalvingRandomSearchCV`: - Random search over a set of parameters using successive halving. """ _required_parameters = ["estimator", "param_grid"] @@ -816,6 +815,24 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): This is present only if ``refit`` is not False. + See Also + -------- + :class:`HalvingGridSearchCV`: + Search over a grid of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. 
+ + If `n_jobs` was set to a value higher than one, the data is copied for each + parameter setting(and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + Examples -------- @@ -836,24 +853,6 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): ... random_state=0).fit(X, y) >>> search.best_params_ # doctest: +SKIP {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} - - Notes - ----- - The parameters selected are those that maximize the score of the held-out - data, according to the scoring parameter. - - If `n_jobs` was set to a value higher than one, the data is copied for each - parameter setting(and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - -------- - :class:`HalvingGridSearchCV`: - Search over a grid of parameters using successive halving. """ _required_parameters = ["estimator", "param_distributions"] From 31d8195c9eec4523a4f85098e9a789fe5bfe88c1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 10 Aug 2020 12:43:46 -0400 Subject: [PATCH 57/89] passing CV instead of X,y --- sklearn/model_selection/_search.py | 24 ++++---- .../_search_successive_halving.py | 55 ++++++++++++------- sklearn/model_selection/tests/test_search.py | 7 +-- .../tests/test_successive_halving.py | 20 +++---- 4 files changed, 59 insertions(+), 47 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 68705db717c7f..416935529391b 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -621,7 +621,7 @@ def classes_(self): self._check_is_fitted("classes_") return self.best_estimator_.classes_ - def _run_search(self, evaluate_candidates, X, y, **fit_params): + def _run_search(self, evaluate_candidates): """Repeatedly calls `evaluate_candidates` to conduct a search. 
This method, implemented in sub-classes, makes it possible to @@ -712,8 +712,8 @@ def fit(self, X, y=None, *, groups=None, **fit_params): Parameters passed to the ``fit`` method of the estimator """ estimator = self.estimator - cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) - + self._checked_cv_orig = check_cv(self.cv, y, + classifier=is_classifier(estimator)) refit_metric = "score" if callable(self.scoring): @@ -728,7 +728,7 @@ def fit(self, X, y=None, *, groups=None, **fit_params): X, y, groups = indexable(X, y, groups) fit_params = _check_fit_params(X, fit_params) - n_splits = cv.get_n_splits(X, y, groups) + n_splits = self._checked_cv_orig.get_n_splits(X, y, groups) base_estimator = clone(self.estimator) @@ -748,9 +748,9 @@ def fit(self, X, y=None, *, groups=None, **fit_params): all_out = [] all_more_results = defaultdict(list) - def evaluate_candidates(candidate_params, X, y, - more_results=None, - **fit_params): + def evaluate_candidates(candidate_params, cv=None, + more_results=None): + cv = cv or self._checked_cv_orig candidate_params = list(candidate_params) n_candidates = len(candidate_params) @@ -806,7 +806,7 @@ def evaluate_candidates(candidate_params, X, y, return results - self._run_search(evaluate_candidates, X, y, **fit_params) + self._run_search(evaluate_candidates) # multimetric is determined here because in the case of a callable # self.scoring the return type is only known after calling @@ -1243,9 +1243,9 @@ def __init__(self, estimator, param_grid, *, scoring=None, self.param_grid = param_grid _check_param_grid(param_grid) - def _run_search(self, evaluate_candidates, X, y, **fit_params): + def _run_search(self, evaluate_candidates): """Search all candidates in param_grid""" - evaluate_candidates(ParameterGrid(self.param_grid), X, y, **fit_params) + evaluate_candidates(ParameterGrid(self.param_grid)) class RandomizedSearchCV(BaseSearchCV): @@ -1575,8 +1575,8 @@ def __init__(self, estimator, param_distributions, *, n_iter=10, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score) - def _run_search(self, evaluate_candidates, X, y, **fit_params): + def _run_search(self, evaluate_candidates): """Search n_iter candidates from param_distributions""" evaluate_candidates(ParameterSampler( self.param_distributions, self.n_iter, - random_state=self.random_state), X, y, **fit_params) + random_state=self.random_state)) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 4e9f50c9f1bad..eb65d1041b783 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -6,8 +6,8 @@ from ._search import _check_param_grid from ._search import BaseSearchCV from . 
import ParameterGrid, ParameterSampler -from ..utils import check_random_state, _safe_indexing -from ..utils.validation import _num_samples, _check_fit_params +from ..utils import check_random_state +from ..utils.validation import _num_samples from ..base import is_classifier from ._split import check_cv from ..utils import resample @@ -16,6 +16,22 @@ __all__ = ['HalvingGridSearchCV', 'HalvingRandomSearchCV'] +class _TrainingFractionMetaSplitter: + """Splitter that subsamples the trainsets according to a given fraction""" + def __init__(self, *, base_cv, fraction, random_state): + self.base_cv = base_cv + self.fraction = fraction + self.random_state = check_random_state(random_state) + + def split(self, X, y, groups=None): + for train_idx, test_idx in self.base_cv.split(X, y, groups): + train_idx = resample( + train_idx, replace=False, random_state=self.random_state, + n_samples=int(self.fraction * train_idx.shape[0]) + ) + yield train_idx, test_idx + + def _refit_callable(results): # Custom refit callable to return the index of the best candidate. We want # the best candidate out of the last iteration. By default BaseSearchCV @@ -57,8 +73,8 @@ def __init__(self, estimator, *, scoring=None, def _check_input_parameters(self, X, y, groups): - if groups is not None: - raise ValueError('groups are not supported.') + # if groups is not None: + # raise ValueError('groups are not supported.') if self.scoring is not None and not (isinstance(self.scoring, str) or callable(self.scoring)): @@ -117,7 +133,6 @@ def _check_input_parameters(self, X, y, groups): cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) n_splits = cv.get_n_splits(X, y, groups) - # please see https://gph.is/1KjihQe for a justification magic_factor = 2 self.min_resources_ = n_splits * magic_factor @@ -168,6 +183,8 @@ def fit(self, X, y=None, groups=None, **fit_params): groups=groups, ) + self._n_samples_orig = _num_samples(X) + super().fit(X, y=y, groups=None, **fit_params) # Set best_score_: BaseSearchCV does not set it, as refit is a callable @@ -176,9 +193,7 @@ def fit(self, X, y=None, groups=None, **fit_params): return self - def _run_search(self, evaluate_candidates, X, y, **fit_params): - rng = check_random_state(self.random_state) - + def _run_search(self, evaluate_candidates): candidate_params = self._generate_candidate_params() if self.resource != 'n_samples' and any( @@ -257,14 +272,11 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): print(f'resource_iter: {resource_iter}') if self.resource == 'n_samples': - # Subsample X and y as well as fit_params - stratify = y if is_classifier(self.estimator) else None - indices = resample(np.arange(X.shape[0]), replace=False, - random_state=rng, stratify=stratify, - n_samples=resource_iter) - X_iter = _safe_indexing(X, indices) - y_iter = _safe_indexing(y, indices) - fit_params_iter = _check_fit_params(X, fit_params, indices) + # subsampling will be done in cv.split() + cv = _TrainingFractionMetaSplitter( + base_cv=self._checked_cv_orig, + fraction=resource_iter / self._n_samples_orig, + random_state=self.random_state) else: # Need copy so that the resource_iter of next iteration does @@ -272,14 +284,15 @@ def _run_search(self, evaluate_candidates, X, y, **fit_params): candidate_params = [c.copy() for c in candidate_params] for candidate in candidate_params: candidate[self.resource] = resource_iter - X_iter, y_iter = X, y - fit_params_iter = fit_params + cv = self._checked_cv_orig more_results = {'iter': [iter_i] * n_candidates, 'resource_iter': 
[resource_iter] * n_candidates} - results = evaluate_candidates(candidate_params, X_iter, y_iter, - more_results=more_results, - **fit_params_iter) + # results = evaluate_candidates(candidate_params, X_iter, y_iter, + # more_results=more_results, + # **fit_params_iter) + results = evaluate_candidates(candidate_params, cv, + more_results=more_results) n_candidates_to_keep = ceil(n_candidates / self.ratio) candidate_params = self._top_k(results, diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 99617e234d006..73207ab956ac3 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1658,12 +1658,11 @@ class CustomSearchCV(BaseSearchCV): def __init__(self, estimator, **kwargs): super().__init__(estimator, **kwargs) - def _run_search(self, evaluate, X, y): - results = evaluate([{'max_depth': 1}, {'max_depth': 2}], - X, y) + def _run_search(self, evaluate): + results = evaluate([{'max_depth': 1}, {'max_depth': 2}]) check_results(results, fit_grid({'max_depth': [1, 2]})) results = evaluate([{'min_samples_split': 5}, - {'min_samples_split': 10}], X, y) + {'min_samples_split': 10}]) check_results(results, fit_grid([{'max_depth': [1, 2]}, {'min_samples_split': [5, 10]}])) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index c285b9bdf1a7c..593d4ba861e12 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -229,16 +229,16 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates_): assert sh.n_resources_[-1] == max_resources -@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) -def test_groups_not_supported(Est): - base_estimator = FastClassifier() - param_grid = {'a': [1]} - sh = Est(base_estimator, param_grid) - X, y = make_classification(n_samples=10) - groups = [0] * 10 - - with pytest.raises(ValueError, match="groups are not supported"): - sh.fit(X, y, groups) +# @pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) +# def test_groups_not_supported(Est): +# base_estimator = FastClassifier() +# param_grid = {'a': [1]} +# sh = Est(base_estimator, param_grid) +# X, y = make_classification(n_samples=10) +# groups = [0] * 10 + +# with pytest.raises(ValueError, match="groups are not supported"): +# sh.fit(X, y, groups) @pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) From cdebb6e470c97a380abf394054793c252f4a2da2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 10 Aug 2020 13:04:22 -0400 Subject: [PATCH 58/89] minor revert for handling fit_params --- sklearn/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 416935529391b..c3f7f179a802d 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -736,6 +736,7 @@ def fit(self, X, y=None, *, groups=None, **fit_params): pre_dispatch=self.pre_dispatch) fit_and_score_kwargs = dict(scorer=scorers, + fit_params=fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, return_times=True, @@ -759,7 +760,6 @@ def evaluate_candidates(candidate_params, cv=None, " totalling {2} fits".format( n_splits, n_candidates, n_candidates * n_splits)) - fit_and_score_kwargs['fit_params'] = fit_params out = 
parallel(delayed(_fit_and_score)(clone(base_estimator), X, y, train=train, test=test, From 05070933dac8ed148ebca46c8c5d543d3a790a56 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 17 Aug 2020 12:45:59 -0400 Subject: [PATCH 59/89] updated docs --- sklearn/model_selection/_search.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index c3f7f179a802d..f16186f9e1312 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -644,12 +644,15 @@ def _run_search(self, evaluate_candidates): This callback accepts: - a list of candidates, where each candidate is a dict of parameter settings. - - the samples `X` - - the targets `y` + - an optional `cv` parameter which can be used to e.g. + evaluate candidates on different dataset splits, or + evaluate candidates on subsampled data (as done in the + SucessiveHaling estimators). By default, the original `cv` + parameter is used, and it is available as a private + `_checked_cv_orig` attribute. - an optional `more_results` dict. Each key will be added to the `cv_results_` attribute. Values should be lists of length `n_candidates` - - a **fit_params keyword It returns a dict of all results so far, formatted like ``cv_results_``. From be877562460c5172f130c1d1bca0d547e8f74df0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Aug 2020 10:01:24 -0400 Subject: [PATCH 60/89] fix len --- doc/whats_new/v0.24.rst | 4 ++++ sklearn/model_selection/_search.py | 21 +++++++++++++------- sklearn/model_selection/tests/test_search.py | 1 + 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index aaf86a2f0576d..6566015a7e922 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -328,6 +328,10 @@ Changelog :pr:`17478` by :user:`Teon Brooks ` and :user:`Mohamed Maskani `. +- |Fix| Fixed the `len` of :class:`model_selection.ParameterSampler` when + all distributions are lists and `n_iter` is more than the number of unique + paramter combinations. + :mod:`sklearn.multiclass` ......................... 
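The `_search.py` hunk below implements the fix described in this changelog entry. As a rough illustration of the intended behaviour (not an excerpt from the patch; the parameter names and values are arbitrary), `len(ParameterSampler(...))` now agrees with the number of settings actually yielded when every distribution is a plain list::

    from scipy.stats import randint
    from sklearn.model_selection import ParameterSampler

    # All values are lists: sampling is done without replacement, so at
    # most 4 unique combinations can ever be produced.
    sampler = ParameterSampler({'a': [1, 2], 'b': [True, False]}, n_iter=10)
    print(len(sampler))        # 4 with this patch (was 10 before)
    print(len(list(sampler)))  # 4, matching what iteration actually yields

    # With a scipy distribution, sampling is with replacement and n_iter
    # is honoured as-is.
    sampler = ParameterSampler({'a': randint(1, 3)}, n_iter=10)
    print(len(sampler))        # 10
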
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index efc230ac080f2..a2735207da48e 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -263,15 +263,18 @@ def __init__(self, param_distributions, n_iter, *, random_state=None): self.random_state = random_state self.param_distributions = param_distributions - def __iter__(self): - # check if all distributions are given as lists - # in this case we want to sample without replacement - all_lists = all( + def _is_all_lists(self): + return all( all(not hasattr(v, "rvs") for v in dist.values()) - for dist in self.param_distributions) + for dist in self.param_distributions + ) + + def __iter__(self): rng = check_random_state(self.random_state) - if all_lists: + # if all distributions are given as lists, we want to sample without + # replacement + if self._is_all_lists(): # look up sampled parameter settings in parameter grid param_grid = ParameterGrid(self.param_distributions) grid_size = len(param_grid) @@ -303,7 +306,11 @@ def __iter__(self): def __len__(self): """Number of points that will be sampled.""" - return self.n_iter + if self._is_all_lists(): + grid_size = len(ParameterGrid(self.param_distributions)) + return min(self.n_iter, grid_size) + else: + return self.n_iter # FIXME Remove fit_grid_point in 0.25 diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 73207ab956ac3..065f256715f8a 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1489,6 +1489,7 @@ def test_parameters_sampler_replacement(): assert len(samples) == 8 for values in ParameterGrid(params): assert values in samples + assert len(ParameterSampler(params, n_iter=1000)) == 8 # test sampling without replacement in a large grid params = {'a': range(10), 'b': range(10), 'c': range(10)} From beda557caad7b89bd9cd135775dd99ef4c080bf6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Aug 2020 10:04:44 -0400 Subject: [PATCH 61/89] whatsnew --- doc/whats_new/v0.24.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 6566015a7e922..2d608db393330 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -330,7 +330,7 @@ Changelog - |Fix| Fixed the `len` of :class:`model_selection.ParameterSampler` when all distributions are lists and `n_iter` is more than the number of unique - paramter combinations. + parameter combinations. :pr:`18222` by `Nicolas Hug`_. :mod:`sklearn.multiclass` ......................... 
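The next patch adds a test for how this sampling logic surfaces through HalvingRandomSearchCV. A minimal sketch of the two regimes it distinguishes (illustrative only: the estimator and parameter values here are arbitrary stand-ins for the FastClassifier helper used in the test suite)::

    from scipy.stats import randint
    from sklearn.datasets import make_classification
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import HalvingRandomSearchCV

    X, y = make_classification(n_samples=1024, random_state=0)

    # All-lists distribution: only 2 unique settings exist, so the first
    # iteration evaluates 2 candidates even though 10 were requested.
    sh = HalvingRandomSearchCV(DecisionTreeClassifier(random_state=0),
                               {'max_depth': [1, 2]},
                               n_candidates=10, random_state=0).fit(X, y)
    print(sh.n_candidates_[0])  # 2

    # scipy distribution: sampling is with replacement, so n_candidates is
    # respected.
    sh = HalvingRandomSearchCV(DecisionTreeClassifier(random_state=0),
                               {'max_depth': randint(1, 10)},
                               n_candidates=10, random_state=0).fit(X, y)
    print(sh.n_candidates_[0])  # 10
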
From d807d267b9eccad0902429cb85b0dedd72a007be Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Aug 2020 10:32:46 -0400 Subject: [PATCH 62/89] Add test for sampling when all_list --- .../tests/test_successive_halving.py | 36 ++++++++++++------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 593d4ba861e12..2d995017ce2b8 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -1,5 +1,5 @@ import pytest -from scipy.stats import norm +from scipy.stats import norm, randint from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier @@ -202,14 +202,14 @@ def test_resource_parameter(Est): @pytest.mark.parametrize( - 'max_resources, n_candidates, expected_n_candidates_', [ + 'max_resources, n_candidates, expected_n_candidates', [ (512, 'exhaust', 128), # generate exactly as much as needed (32, 'exhaust', 8), (32, 8, 8), (32, 7, 7), # ask for less than what we could (32, 9, 9), # ask for more than 'reasonable' ]) -def test_random_search(max_resources, n_candidates, expected_n_candidates_): +def test_random_search(max_resources, n_candidates, expected_n_candidates): # Test random search and make sure the number of generated candidates is # as expected @@ -222,23 +222,33 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates_): max_resources=max_resources, ratio=2, min_resources=4) sh.fit(X, y) - assert sh.n_candidates_[0] == expected_n_candidates_ + assert sh.n_candidates_[0] == expected_n_candidates if n_candidates == 'exhaust': # Make sure 'exhaust' makes the last iteration use as much resources as # we can assert sh.n_resources_[-1] == max_resources -# @pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) -# def test_groups_not_supported(Est): -# base_estimator = FastClassifier() -# param_grid = {'a': [1]} -# sh = Est(base_estimator, param_grid) -# X, y = make_classification(n_samples=10) -# groups = [0] * 10 +@pytest.mark.parametrize('param_distributions, expected_n_candidates', [ + ({'a': [1, 2]}, 2), # all lists, sample less than n_candidates + ({'a': randint(1, 3)}, 10), # not all list, respect n_candidates +]) +def test_random_search_discrete_distributions(param_distributions, + expected_n_candidates): + # Make sure random search samples the appropriate number of candidates when + # we ask for more than what's possible. How many parameters are sampled + # depends whether the distributions are 'all lists' or not (see + # ParameterSampler for details). 
This is somewhat redundant with the checks + # in ParameterSampler but interaction bugs were discovered during + # developement of SH -# with pytest.raises(ValueError, match="groups are not supported"): -# sh.fit(X, y, groups) + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + base_estimator = FastClassifier() + sh = HalvingRandomSearchCV(base_estimator, param_distributions, + n_candidates=10) + sh.fit(X, y) + assert sh.n_candidates_[0] == expected_n_candidates @pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) From 0350176fac295c261f3b2b07f136c39290c9530a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Aug 2020 10:57:21 -0400 Subject: [PATCH 63/89] minor change to top-k --- .../model_selection/_search_successive_halving.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index eb65d1041b783..b764ee0354b6b 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -220,7 +220,7 @@ def _run_search(self, evaluate_candidates): ) # n_possible_iterations is the number of iterations that we can - # actually do starting from min_resources and without exceeding the + # actually do starting from min_resources and without exceeding # max_resources. Depending on max_resources and the number of # candidates, this may be higher or smaller than # n_required_iterations. @@ -288,9 +288,7 @@ def _run_search(self, evaluate_candidates): more_results = {'iter': [iter_i] * n_candidates, 'resource_iter': [resource_iter] * n_candidates} - # results = evaluate_candidates(candidate_params, X_iter, y_iter, - # more_results=more_results, - # **fit_params_iter) + results = evaluate_candidates(candidate_params, cv, more_results=more_results) @@ -308,12 +306,9 @@ def _top_k(self, results, k, iter_i): # Return the best candidates of a given iteration # We need to filter out candidates from the previous iterations # when sorting - - best_candidates_indices = np.argsort(results['mean_test_score'])[::-1] - best_candidates_indices = [idx for idx in best_candidates_indices - if results['iter'][idx] == iter_i] - best_candidates_indices = best_candidates_indices[:k] - return [results['params'][idx] for idx in best_candidates_indices] + iter_indices = np.flatnonzero(np.array(results['iter']) == iter_i) + sorted_indices = np.argsort(results['mean_test_score'][iter_indices]) + return np.array(results['params'])[sorted_indices[-k:]] @abstractmethod def _generate_candidate_params(self): From 0bc44a1dfc314365f223dd473d986542945de7d6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Aug 2020 16:21:24 -0400 Subject: [PATCH 64/89] Force CV splits to be consistent across calls --- .../_search_successive_halving.py | 66 +++++++++++++++---- sklearn/model_selection/_split.py | 7 ++ sklearn/model_selection/tests/test_split.py | 39 +++++++++++ .../tests/test_successive_halving.py | 63 ++++++++++++++++++ sklearn/utils/__init__.py | 3 +- 5 files changed, 164 insertions(+), 14 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index b764ee0354b6b..32939d74f0940 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -6,22 +6,22 @@ from ._search import _check_param_grid from ._search import 
BaseSearchCV from . import ParameterGrid, ParameterSampler -from ..utils import check_random_state from ..utils.validation import _num_samples from ..base import is_classifier -from ._split import check_cv +from ._split import check_cv, _yields_constant_splits from ..utils import resample __all__ = ['HalvingGridSearchCV', 'HalvingRandomSearchCV'] -class _TrainingFractionMetaSplitter: - """Splitter that subsamples the trainsets according to a given fraction""" - def __init__(self, *, base_cv, fraction, random_state): +class _SubsampleMetaSplitter: + """Splitter that subsamples a given fraction of the dataset""" + def __init__(self, *, base_cv, fraction, subsample_test, random_state): self.base_cv = base_cv self.fraction = fraction - self.random_state = check_random_state(random_state) + self.subsample_test = subsample_test + self.random_state = random_state def split(self, X, y, groups=None): for train_idx, test_idx in self.base_cv.split(X, y, groups): @@ -29,6 +29,11 @@ def split(self, X, y, groups=None): train_idx, replace=False, random_state=self.random_state, n_samples=int(self.fraction * train_idx.shape[0]) ) + if self.subsample_test: + test_idx = resample( + test_idx, replace=False, random_state=self.random_state, + n_samples=int(self.fraction * test_idx.shape[0]) + ) yield train_idx, test_idx @@ -73,15 +78,22 @@ def __init__(self, estimator, *, scoring=None, def _check_input_parameters(self, X, y, groups): - # if groups is not None: - # raise ValueError('groups are not supported.') - if self.scoring is not None and not (isinstance(self.scoring, str) or callable(self.scoring)): raise ValueError('scoring parameter must be a string, ' 'a callable or None. Multimetric scoring is not ' 'supported.') + # We need to enforce that successive calls to cv.split() yield the same + # splits: see https://github.com/scikit-learn/scikit-learn/issues/15149 + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) + if not _yields_constant_splits(cv): + raise ValueError( + "The cv parameter must yield consistent folds across " + "calls to split(). Set its random_state to an int, or set " + "shuffle=False." + ) + if (self.resource != 'n_samples' and self.resource not in self.estimator.get_params()): raise ValueError( @@ -130,8 +142,6 @@ def _check_input_parameters(self, X, y, groups): self.min_resources_ = self.min_resources if self.min_resources_ in ('smallest', 'exhaust'): if self.resource == 'n_samples': - cv = check_cv(self.cv, y, - classifier=is_classifier(self.estimator)) n_splits = cv.get_n_splits(X, y, groups) # please see https://gph.is/1KjihQe for a justification magic_factor = 2 @@ -273,10 +283,12 @@ def _run_search(self, evaluate_candidates): if self.resource == 'n_samples': # subsampling will be done in cv.split() - cv = _TrainingFractionMetaSplitter( + cv = _SubsampleMetaSplitter( base_cv=self._checked_cv_orig, fraction=resource_iter / self._n_samples_orig, - random_state=self.random_state) + subsample_test=True, + random_state=self.random_state + ) else: # Need copy so that the resource_iter of next iteration does @@ -349,6 +361,12 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): ``-1`` means using all processors. See :term:`Glossary ` for more details. + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Ignored otherwise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -381,6 +399,13 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + refit : boolean, default=True If True, refit an estimator using the best found parameters on the whole dataset. @@ -654,6 +679,14 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' (default) + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Also used for random uniform + sampling from lists of possible values instead of scipy.stats + distributions. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + cv : int, cross-validation generator or an iterable, default=5 Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -669,6 +702,13 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + refit : boolean, default=True If True, refit an estimator using the best found parameters on the whole dataset. 
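The `_split.py` hunk that follows adds the `_yields_constant_splits` helper backing the check introduced above. A short sketch of the user-facing effect (illustrative; the estimator and grid are arbitrary, the error message is the one added in this patch)::

    from sklearn.datasets import make_classification
    from sklearn.model_selection import HalvingGridSearchCV, KFold
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_samples=100, random_state=0)
    param_grid = {'max_depth': [1, 2]}

    # A shuffling CV without a fixed seed yields different folds on every
    # call to split(), which the successive halving loop cannot tolerate:
    sh = HalvingGridSearchCV(DecisionTreeClassifier(), param_grid,
                             cv=KFold(shuffle=True))
    # sh.fit(X, y) raises ValueError: The cv parameter must yield consistent
    # folds across calls to split(). Set its random_state to an int, or set
    # shuffle=False.

    # Seeding the splitter (or leaving shuffle=False) makes the folds
    # reproducible, so the same search is accepted:
    sh = HalvingGridSearchCV(DecisionTreeClassifier(), param_grid,
                             cv=KFold(shuffle=True, random_state=0)).fit(X, y)
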
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index e6604eb35db73..9fb69603e5d59 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -2235,3 +2235,10 @@ def _build_repr(self): params[key] = value return '%s(%s)' % (class_name, _pprint(params, offset=len(class_name))) + + +def _yields_constant_splits(cv): + # return True if calling cv.split() always returns the same splits + shuffle = getattr(cv, 'shuffle', True) + random_state = getattr(cv, 'random_state', 0) + return isinstance(random_state, numbers.Integral) or not shuffle diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 4250eb8af8748..f3eb75a47f01e 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1619,3 +1619,42 @@ def test_random_state_shuffle_false(Klass): with pytest.raises(ValueError, match='has no effect since shuffle is False'): Klass(3, shuffle=False, random_state=0) + + +@pytest.mark.parametrize('cv, expected', [ + (KFold(), True), + (KFold(shuffle=True, random_state=123), True), + (StratifiedKFold(), True), + (StratifiedKFold(shuffle=True, random_state=123), True), + (RepeatedKFold(random_state=123), True), + (RepeatedStratifiedKFold(random_state=123), True), + (ShuffleSplit(random_state=123), True), + (GroupShuffleSplit(random_state=123), True), + (StratifiedShuffleSplit(random_state=123), True), + (GroupKFold(), True), + (TimeSeriesSplit(), True), + (LeaveOneOut(), True), + (LeaveOneGroupOut(), True), + (LeavePGroupsOut(n_groups=2), True), + (LeavePOut(p=2), True), + + (KFold(shuffle=True, random_state=None), False), + (KFold(shuffle=True, random_state=None), False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), + False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), + False), + (RepeatedKFold(random_state=None), False), + (RepeatedKFold(random_state=np.random.RandomState(0)), False), + (RepeatedStratifiedKFold(random_state=None), False), + (RepeatedStratifiedKFold(random_state=np.random.RandomState(0)), False), + (ShuffleSplit(random_state=None), False), + (ShuffleSplit(random_state=np.random.RandomState(0)), False), + (GroupShuffleSplit(random_state=None), False), + (GroupShuffleSplit(random_state=np.random.RandomState(0)), False), + (StratifiedShuffleSplit(random_state=None), False), + (StratifiedShuffleSplit(random_state=np.random.RandomState(0)), False), +]) +def test_yields_constant_splits(cv, expected): + from sklearn.model_selection._split import _yields_constant_splits + assert _yields_constant_splits(cv) == expected diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 2d995017ce2b8..5d8ce1234b897 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -1,10 +1,14 @@ import pytest from scipy.stats import norm, randint +import numpy as np from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier from sklearn.model_selection import HalvingGridSearchCV from sklearn.model_selection import HalvingRandomSearchCV +from sklearn.model_selection import KFold, ShuffleSplit +from sklearn.model_selection._search_successive_halving import ( + _SubsampleMetaSplitter) class FastClassifier(DummyClassifier): @@ -275,6 +279,8 @@ def test_random_search_discrete_distributions(param_distributions, 
"max_resources can only be 'auto' if resource='n_samples'"), ({'min_resources': 15, 'max_resources': 14}, "min_resources_=15 is greater than max_resources_=14"), + ({'cv': KFold(shuffle=True)}, "must yield consistent folds"), + ({'cv': ShuffleSplit()}, "must yield consistent folds"), ]) def test_input_errors(Est, params, expected_error_message): base_estimator = FastClassifier() @@ -304,3 +310,60 @@ def test_input_errors_randomized(params, expected_error_message): with pytest.raises(ValueError, match=expected_error_message): sh.fit(X, y) + + +@pytest.mark.parametrize( + 'fraction, subsample_test, expected_train_size, expected_test_size', [ + (.5, True, 40, 10), + (.5, False, 40, 20), + (.2, True, 16, 4), + (.2, False, 16, 20)]) +def test_subsample_splitter_shapes(fraction, subsample_test, + expected_train_size, expected_test_size): + # Make sure splits returned by SubsampleMetaSplitter are of appropriate + # size + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter(base_cv=KFold(5), fraction=fraction, + subsample_test=subsample_test, + random_state=None) + + for train, test in cv.split(X, y): + assert train.shape[0] == expected_train_size + assert test.shape[0] == expected_test_size + if subsample_test: + assert train.shape[0] + test.shape[0] == int(n_samples * fraction) + else: + assert test.shape[0] == n_samples // cv.base_cv.get_n_splits() + + +@pytest.mark.parametrize('subsample_test', (True, False)) +def test_subsample_splitter_determinism(subsample_test): + # Make sure _SubsampleMetaSplitter is consistent across calls to split(): + # - we're OK having training sets differ (they're always samples with a + # different fraction anyway) + # - when we don't subsample the test set, we want it to be always the same. + # This check is the most important. This is ensured by the determinism + # of the base_cv. 
+ + # Note: we could force both train and test splits to be always the same if + # we drew an int seed in _SubsampleMetaSplitter.__init__ + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter(base_cv=KFold(5), fraction=.5, + subsample_test=subsample_test, + random_state=None) + + folds_a = list(cv.split(X, y, groups=None)) + folds_b = list(cv.split(X, y, groups=None)) + + for (train_a, test_a), (train_b, test_b) in zip(folds_a, folds_b): + assert not np.all(train_a == train_b) + + if subsample_test: + assert not np.all(test_a == test_b) + else: + assert np.all(test_a == test_b) + assert np.all(X[test_a] == X[test_b]) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 67da7d57b20fb..df0c1774a9964 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -557,7 +557,8 @@ def resample(*arrays, # convert sparse matrices to CSR for row-based indexing arrays = [a.tocsr() if issparse(a) else a for a in arrays] - resampled_arrays = [_safe_indexing(a, indices) for a in arrays] + resampled_arrays = [_safe_indexing(a, indices) if a is not None else a + for a in arrays] if len(resampled_arrays) == 1: # syntactic sugar for the unit argument case return resampled_arrays[0] From 88840a5a64116a4fa9c60ed560eb9309726e0b45 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Aug 2020 16:38:24 -0400 Subject: [PATCH 65/89] reorder parameters --- .../_search_successive_halving.py | 327 ++++++++---------- 1 file changed, 137 insertions(+), 190 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 32939d74f0940..c1fa8f78237a2 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -56,8 +56,7 @@ class BaseSuccessiveHalving(BaseSearchCV): Zohar Karnin, Tomer Koren, Oren Somekh """ def __init__(self, estimator, *, scoring=None, - n_jobs=None, refit=True, cv=5, verbose=0, - pre_dispatch='2*n_jobs', random_state=None, + n_jobs=None, refit=True, cv=5, verbose=0, random_state=None, error_score=np.nan, return_train_score=True, max_resources='auto', min_resources='exhaust', resource='n_samples', ratio=3, aggressive_elimination=False): @@ -65,7 +64,7 @@ def __init__(self, estimator, *, scoring=None, refit = _refit_callable if refit else False super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, - verbose=verbose, pre_dispatch=pre_dispatch, + verbose=verbose, error_score=error_score, return_train_score=return_train_score) @@ -350,39 +349,54 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): in the list are explored. This enables searching over any sequence of parameter settings. - scoring : string, callable, or None, default=None - A single string (see :ref:`scoring_parameter`) or a callable - (see :ref:`scoring`) to evaluate the predictions on the test set. - If None, the estimator's score method is used. + ratio : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for each subsequent iteration. For example, + ``ratio=3`` means that only one third of the candidates are selected. - n_jobs : int or None, default=None - Number of jobs to run in parallel. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. 
+ resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 'n_iterations' or 'n_estimators' for a gradient + boosting estimator. In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. - random_state : int, RandomState instance or None, default=None - Pseudo random number generator state used for subsampling the dataset - when `resources != 'n_samples'`. Ignored otherwise. - Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. + max_resources : int, default='auto' + The maximum amount of resource that any candidate is allowed to use + for a given iteration. By default, this is set to ``n_samples`` when + ``resource='n_samples'`` (default), else an error is raised. - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: + min_resources : {'exhaust', 'smallest'} or int, default='exhaust' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs + - 'smallest' is a heuristic that sets `r0` to a small value: + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression + problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + regression problem + - ``1`` when ``resource != 'n_samples'`` + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``ratio``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. - - An int, giving the exact number of total jobs that are - spawned + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' (default) + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the candidates to at most `ratio` in the last iteration. If + ``True``, then the search process will 'replay' the first iteration + for as long as needed until the number of candidates is small enough. + This is ``False`` by default, which means that the last iteration may + evaluate more than ``ratio`` candidates. See + :ref:`aggressive_elimination` for more details. cv : int, cross-validation generator or iterable, default=5 Determines the cross-validation splitting strategy. @@ -406,7 +420,12 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): deactivating shuffling (`shuffle=False`), or by setting the `cv`'s `random_state` parameter to an integer. 
- refit : boolean, default=True + scoring : string, callable, or None, default=None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring`) to evaluate the predictions on the test set. + If None, the estimator's score method is used. + + refit : bool, default=True If True, refit an estimator using the best found parameters on the whole dataset. @@ -414,16 +433,13 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): attribute and permits using ``predict`` directly on this ``GridSearchCV`` instance. - verbose : integer - Controls the verbosity: the higher, the more messages. - error_score : 'raise' or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. Default is ``np.nan`` - return_train_score : boolean, default=False + return_train_score : bool, default=False If ``False``, the ``cv_results_`` attribute will not include training scores. Computing training scores is used to get insights on how different @@ -432,54 +448,20 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): expensive and is not strictly required to select the parameters that yield the best generalization performance. - max_resources : int, default='auto' - The maximum amount of resource that any candidate is allowed to use - for a given iteration. By default, this is set to ``n_samples`` when - ``resource='n_samples'`` (default), else an error is raised. - - min_resources : {'exhaust', 'smallest'} or int, default='exhaust' - The minimum amount of resource that any candidate is allowed to use - for a given iteration. Equivalently, this defines the amount of - resources `r0` that are allocated for each candidate at the first - iteration. - - - 'smallest' is a heuristic that sets `r0` to a small value: - - ``n_splits * 2`` when ``resource='n_samples'`` for a regression - problem - - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a - regression problem - - ``1`` when ``resource != 'n_samples'`` - - 'exhaust' will set `r0` such that the **last** iteration uses as - much resources as possible. Namely, the last iteration will use the - highest value smaller than ``max_resources`` that is a multiple of - both ``min_resources`` and ``ratio``. In general, using 'exhaust' - leads to a more accurate estimator, but is slightly more time - consuming. - - Note that the amount of resources used at each iteration is always a - multiple of ``min_resources``. - - resource : ``'n_samples'`` or str, default='n_samples' - Defines the resource that increases with each iteration. By default, - the resource is the number of samples. It can also be set to any - parameter of the base estimator that accepts positive integer - values, e.g. 'n_iterations' or 'n_estimators' for a gradient - boosting estimator. In this case ``max_resources`` cannot be 'auto' - and must be set explicitly. + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Ignored otherwise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. - ratio : int or float, default=3 - The 'halving' parameter, which determines the proportion of candidates - that are selected for each subsequent iteration. For example, - ``ratio=3`` means that only one third of the candidates are selected. 
+ n_jobs : int or None, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. - aggressive_elimination : bool, default=False - This is only relevant in cases where there isn't enough resources to - reduce the candidates to at most `ratio` in the last iteration. If - ``True``, then the search process will 'replay' the first iteration - for as long as needed until the number of candidates is small enough. - This is ``False`` by default, which means that the last iteration may - evaluate more than ``ratio`` candidates. See - :ref:`aggressive_elimination` for more details. + verbose : int + Controls the verbosity: the higher, the more messages. Attributes ---------- @@ -570,14 +552,6 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): The parameters selected are those that maximize the score of the held-out data, according to the scoring parameter. - If `n_jobs` was set to a value higher than one, the data is copied for each - parameter setting(and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - Examples -------- @@ -598,15 +572,14 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): """ _required_parameters = ["estimator", "param_grid"] - def __init__(self, estimator, param_grid, *, scoring=None, - n_jobs=None, refit=True, verbose=0, cv=5, - pre_dispatch='2*n_jobs', random_state=None, - error_score=np.nan, return_train_score=True, - max_resources='auto', min_resources='exhaust', - resource='n_samples', ratio=3, aggressive_elimination=False): + def __init__(self, estimator, param_grid, *, + ratio=3, resource='n_samples', max_resources='auto', + min_resources='exhaust', aggressive_elimination=False, + cv=5, scoring=None, refit=True, error_score=np.nan, + return_train_score=True, random_state=None, n_jobs=None, + verbose=0): super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, - pre_dispatch=pre_dispatch, random_state=random_state, error_score=error_score, return_train_score=return_train_score, max_resources=max_resources, resource=resource, @@ -651,41 +624,54 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): `min_resources`, `max_resources` and `ratio`. In this case, `min_resources` cannot be 'exhaust'. - scoring : string, callable, or None, default=None - A single string (see :ref:`scoring_parameter`) or a callable - (see :ref:`scoring`) to evaluate the predictions on the test set. - If None, the estimator's score method is used. + ratio : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for each subsequent iteration. For example, + ``ratio=3`` means that only one third of the candidates are selected. - n_jobs : int or None, default=None - Number of jobs to run in parallel. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. + resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. 
It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 'n_iterations' or 'n_estimators' for a gradient + boosting estimator. In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: + max_resources : int, default='auto' + The maximum number of resources that any candidate is allowed to use + for a given iteration. By default, this is set ``n_samples`` when + ``resource='n_samples'`` (default), else an error is raised. - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs + min_resources : {'exhaust', 'smallest'} or int, default='smallest' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. - - An int, giving the exact number of total jobs that are - spawned + - 'smallest' is a heuristic that sets `r0` to a small value: + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression + problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + regression problem + - ``1`` when ``resource != 'n_samples'`` + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``ratio``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. 'exhaust' isn't available when `n_candidates='exhaust'`. - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' (default) + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. - random_state : int, RandomState instance or None, default=None - Pseudo random number generator state used for subsampling the dataset - when `resources != 'n_samples'`. Also used for random uniform - sampling from lists of possible values instead of scipy.stats - distributions. - Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the candidates to at most `ratio` in the last iteration. If + ``True``, then the search process will 'replay' the first iteration + for as long as needed until the number of candidates is small enough. + This is ``False`` by default, which means that the last iteration may + evaluate more than ``ratio`` candidates. See + :ref:`aggressive_elimination` for more details. cv : int, cross-validation generator or an iterable, default=5 Determines the cross-validation splitting strategy. @@ -709,7 +695,12 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): deactivating shuffling (`shuffle=False`), or by setting the `cv`'s `random_state` parameter to an integer. 
- refit : boolean, default=True + scoring : string, callable, or None, default=None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring`) to evaluate the predictions on the test set. + If None, the estimator's score method is used. + + refit : bool, default=True If True, refit an estimator using the best found parameters on the whole dataset. @@ -717,16 +708,13 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): attribute and permits using ``predict`` directly on this ``GridSearchCV`` instance. - verbose : integer - Controls the verbosity: the higher, the more messages. - error_score : 'raise' or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. Default is ``np.nan`` - return_train_score : boolean, default=False + return_train_score : bool, default=False If ``False``, the ``cv_results_`` attribute will not include training scores. Computing training scores is used to get insights on how different @@ -735,54 +723,22 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): expensive and is not strictly required to select the parameters that yield the best generalization performance. - max_resources : int, default='auto' - The maximum number of resources that any candidate is allowed to use - for a given iteration. By default, this is set ``n_samples`` when - ``resource='n_samples'`` (default), else an error is raised. - - min_resources : {'exhaust', 'smallest'} or int, default='smallest' - The minimum amount of resource that any candidate is allowed to use - for a given iteration. Equivalently, this defines the amount of - resources `r0` that are allocated for each candidate at the first - iteration. - - - 'smallest' is a heuristic that sets `r0` to a small value: - - ``n_splits * 2`` when ``resource='n_samples'`` for a regression - problem - - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a - regression problem - - ``1`` when ``resource != 'n_samples'`` - - 'exhaust' will set `r0` such that the **last** iteration uses as - much resources as possible. Namely, the last iteration will use the - highest value smaller than ``max_resources`` that is a multiple of - both ``min_resources`` and ``ratio``. In general, using 'exhaust' - leads to a more accurate estimator, but is slightly more time - consuming. 'exhaust' isn't available when `n_candidates='exhaust'`. - - Note that the amount of resources used at each iteration is always a - multiple of ``min_resources``. - - resource : ``'n_samples'`` or str, default='n_samples' - Defines the resource that increases with each iteration. By default, - the resource is the number of samples. It can also be set to any - parameter of the base estimator that accepts positive integer - values, e.g. 'n_iterations' or 'n_estimators' for a gradient - boosting estimator. In this case ``max_resources`` cannot be 'auto' - and must be set explicitly. + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Also used for random uniform + sampling from lists of possible values instead of scipy.stats + distributions. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
- ratio : int or float, default=3 - The 'halving' parameter, which determines the proportion of candidates - that are selected for each subsequent iteration. For example, - ``ratio=3`` means that only one third of the candidates are selected. + n_jobs : int or None, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. - aggressive_elimination : bool, default=False - This is only relevant in cases where there isn't enough resources to - reduce the candidates to at most `ratio` in the last iteration. If - ``True``, then the search process will 'replay' the first iteration - for as long as needed until the number of candidates is small enough. - This is ``False`` by default, which means that the last iteration may - evaluate more than ``ratio`` candidates. See - :ref:`aggressive_elimination` for more details. + verbose : int + Controls the verbosity: the higher, the more messages. Attributes ---------- @@ -873,14 +829,6 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): The parameters selected are those that maximize the score of the held-out data, according to the scoring parameter. - If `n_jobs` was set to a value higher than one, the data is copied for each - parameter setting(and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - Examples -------- @@ -905,12 +853,11 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): _required_parameters = ["estimator", "param_distributions"] def __init__(self, estimator, param_distributions, *, - n_candidates='exhaust', scoring=None, n_jobs=None, - refit=True, verbose=0, cv=5, pre_dispatch='2*n_jobs', - random_state=None, error_score=np.nan, - return_train_score=True, max_resources='auto', - min_resources='smallest', resource='n_samples', ratio=3, - aggressive_elimination=False): + n_candidates='exhaust', ratio=3, resource='n_samples', + max_resources='auto', min_resources='smallest', + aggressive_elimination=False, cv=5, scoring=None, + refit=True, error_score=np.nan, return_train_score=True, + random_state=None, n_jobs=None, verbose=0): super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, random_state=random_state, error_score=error_score, From c9ec1c4b6ac1aa0329ec81d3af4a54c09f178f94 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Aug 2020 16:48:40 -0400 Subject: [PATCH 66/89] reduced diff --- sklearn/utils/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index df0c1774a9964..67da7d57b20fb 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -557,8 +557,7 @@ def resample(*arrays, # convert sparse matrices to CSR for row-based indexing arrays = [a.tocsr() if issparse(a) else a for a in arrays] - resampled_arrays = [_safe_indexing(a, indices) if a is not None else a - for a in arrays] + resampled_arrays = [_safe_indexing(a, indices) for a in arrays] if len(resampled_arrays) == 1: # syntactic sugar for the unit argument case return resampled_arrays[0] From b702abc332292e36d3b2209ec66ab137f1ae1a10 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: 
Fri, 21 Aug 2020 18:03:20 -0400 Subject: [PATCH 67/89] added tests for top_k --- doc/modules/grid_search.rst | 4 +- sklearn/model_selection/_search.py | 7 ++-- .../_search_successive_halving.py | 32 +++++++++------- .../tests/test_successive_halving.py | 38 ++++++++++++++++++- 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index a2a923e176805..480b350eadf47 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -348,9 +348,9 @@ terms of the number of estimators of a random forest:: >>> X, y = make_classification(n_samples=1000, random_state=0) >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, ... ratio=2, resource='n_estimators', - ... max_resources=30, random_state=0).fit(X, y) + ... max_resources=30).fit(X, y) >>> sh.best_estimator_ - RandomForestClassifier(max_depth=5, n_estimators=24, random_state=0) + RandomForestClassifier(max_depth=3, n_estimators=24, random_state=0) Note that it is not possible to budget on a parameter that is part of the parameter grid. diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 1b20ebecf1b52..2a855c6242054 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -722,8 +722,6 @@ def fit(self, X, y=None, *, groups=None, **fit_params): Parameters passed to the ``fit`` method of the estimator """ estimator = self.estimator - self._checked_cv_orig = check_cv(self.cv, y, - classifier=is_classifier(estimator)) refit_metric = "score" if callable(self.scoring): @@ -738,7 +736,8 @@ def fit(self, X, y=None, *, groups=None, **fit_params): X, y, groups = indexable(X, y, groups) fit_params = _check_fit_params(X, fit_params) - n_splits = self._checked_cv_orig.get_n_splits(X, y, groups) + cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator)) + n_splits = cv_orig.get_n_splits(X, y, groups) base_estimator = clone(self.estimator) @@ -761,7 +760,7 @@ def fit(self, X, y=None, *, groups=None, **fit_params): def evaluate_candidates(candidate_params, cv=None, more_results=None): - cv = cv or self._checked_cv_orig + cv = cv or cv_orig candidate_params = list(candidate_params) n_candidates = len(candidate_params) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index c1fa8f78237a2..3dc697fd0b6e1 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -48,6 +48,18 @@ def _refit_callable(results): return last_iter_indices[best_idx] +def _top_k(results, k, iter_i): + # Return the best candidates of a given iteration + iteration, mean_test_score, params = ( + np.asarray(a) for a in (results['iter'], + results['mean_test_score'], + results['params']) + ) + iter_indices = np.flatnonzero(iteration == iter_i) + sorted_indices = np.argsort(mean_test_score[iter_indices]) + return np.array(params[iter_indices][sorted_indices[-k:]]) + + class BaseSuccessiveHalving(BaseSearchCV): """Implements successive halving. 
@@ -85,8 +97,7 @@ def _check_input_parameters(self, X, y, groups): # We need to enforce that successive calls to cv.split() yield the same # splits: see https://github.com/scikit-learn/scikit-learn/issues/15149 - cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) - if not _yields_constant_splits(cv): + if not _yields_constant_splits(self._checked_cv_orig): raise ValueError( "The cv parameter must yield consistent folds across " "calls to split(). Set its random_state to an int, or set " @@ -141,7 +152,7 @@ def _check_input_parameters(self, X, y, groups): self.min_resources_ = self.min_resources if self.min_resources_ in ('smallest', 'exhaust'): if self.resource == 'n_samples': - n_splits = cv.get_n_splits(X, y, groups) + n_splits = self._checked_cv_orig.get_n_splits(X, y, groups) # please see https://gph.is/1KjihQe for a justification magic_factor = 2 self.min_resources_ = n_splits * magic_factor @@ -186,6 +197,9 @@ def fit(self, X, y=None, groups=None, **fit_params): **fit_params : dict of string -> object Parameters passed to the ``fit`` method of the estimator """ + self._checked_cv_orig = check_cv( + self.cv, y, classifier=is_classifier(self.estimator)) + self._check_input_parameters( X=X, y=y, @@ -304,23 +318,13 @@ def _run_search(self, evaluate_candidates): more_results=more_results) n_candidates_to_keep = ceil(n_candidates / self.ratio) - candidate_params = self._top_k(results, - n_candidates_to_keep, - iter_i) + candidate_params = _top_k(results, n_candidates_to_keep, iter_i) self.n_remaining_candidates_ = len(candidate_params) self.n_required_iterations_ = n_required_iterations self.n_possible_iterations_ = n_possible_iterations self.n_iterations_ = n_iterations - def _top_k(self, results, k, iter_i): - # Return the best candidates of a given iteration - # We need to filter out candidates from the previous iterations - # when sorting - iter_indices = np.flatnonzero(np.array(results['iter']) == iter_i) - sorted_indices = np.argsort(results['mean_test_score'][iter_indices]) - return np.array(results['params'])[sorted_indices[-k:]] - @abstractmethod def _generate_candidate_params(self): pass diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 5d8ce1234b897..250359d326e79 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -8,7 +8,7 @@ from sklearn.model_selection import HalvingRandomSearchCV from sklearn.model_selection import KFold, ShuffleSplit from sklearn.model_selection._search_successive_halving import ( - _SubsampleMetaSplitter) + _SubsampleMetaSplitter, _top_k, _refit_callable) class FastClassifier(DummyClassifier): @@ -341,7 +341,7 @@ def test_subsample_splitter_shapes(fraction, subsample_test, @pytest.mark.parametrize('subsample_test', (True, False)) def test_subsample_splitter_determinism(subsample_test): # Make sure _SubsampleMetaSplitter is consistent across calls to split(): - # - we're OK having training sets differ (they're always samples with a + # - we're OK having training sets differ (they're always sampled with a # different fraction anyway) # - when we don't subsample the test set, we want it to be always the same. # This check is the most important. 
This is ensured by the determinism @@ -367,3 +367,37 @@ def test_subsample_splitter_determinism(subsample_test): else: assert np.all(test_a == test_b) assert np.all(X[test_a] == X[test_b]) + + +@pytest.mark.parametrize('k, iter_i, expected', [ + (1, 0, ['c']), + (2, 0, ['a', 'c']), + (4, 0, ['d', 'b', 'a', 'c']), + (10, 0, ['d', 'b', 'a', 'c']), + + (1, 1, ['e']), + (2, 1, ['f', 'e']), + (10, 1, ['f', 'e']), + + (1, 2, ['i']), + (10, 2, ['g', 'h', 'i']), +]) +def test_top_k(k, iter_i, expected): + + results = { # this isn't a 'real world' result dict + 'iter': [0, 0, 0, 0, 1, 1, 2, 2, 2], + 'mean_test_score': [4, 3, 5, 1, 11, 10, 5, 6, 9], + 'params': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'], + } + got = _top_k(results, k=k, iter_i=iter_i) + assert np.all(got == expected) + + +def test_refit_callable(): + + results = { # this isn't a 'real world' result dict + 'iter': np.array([0, 0, 0, 0, 1, 1, 2, 2, 2]), + 'mean_test_score': np.array([4, 3, 5, 1, 11, 10, 5, 6, 9]), + 'params': np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']), + } + assert _refit_callable(results) == 8 # index of 'i' From 4c7a1b1bdb9508a1ab8f4dd32af82eba4d0fa3cb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Aug 2020 18:04:48 -0400 Subject: [PATCH 68/89] put back doc for groups --- sklearn/model_selection/_search_successive_halving.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 3dc697fd0b6e1..a13925f19af6c 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -191,8 +191,10 @@ def fit(self, X, y=None, groups=None, **fit_params): Target relative to X for classification or regression; None for unsupervised learning. - groups : None - Groups are not supported + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). **fit_params : dict of string -> object Parameters passed to the ``fit`` method of the estimator From 79cac35d39eefe7af0219bc3474d0d9e28ccc53a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Aug 2020 18:35:11 -0400 Subject: [PATCH 69/89] not sure what went wrong --- doc/modules/grid_search.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 480b350eadf47..78734ff489531 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -350,7 +350,7 @@ terms of the number of estimators of a random forest:: ... ratio=2, resource='n_estimators', ... max_resources=30).fit(X, y) >>> sh.best_estimator_ - RandomForestClassifier(max_depth=3, n_estimators=24, random_state=0) + RandomForestClassifier(max_depth=5, n_estimators=24, random_state=0) Note that it is not possible to budget on a parameter that is part of the parameter grid. 
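The `iter` and `mean_test_score` entries that `_top_k` and `_refit_callable` operate on in the tests above are the same ones exposed to users through `cv_results_`, via the `more_results` argument of `evaluate_candidates`. A rough end-to-end sketch (illustrative only; the estimator and grid are arbitrary, and column names such as `resource_iter` reflect the state of the code at this point in the series)::

    from sklearn.datasets import make_classification
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import HalvingGridSearchCV

    param_grid = {'max_depth': [3, 5, 10],
                  'min_samples_split': [2, 5, 10]}
    X, y = make_classification(n_samples=1000, random_state=0)
    sh = HalvingGridSearchCV(DecisionTreeClassifier(random_state=0),
                             param_grid, cv=5, ratio=2,
                             random_state=0).fit(X, y)

    # Every candidate evaluation is tagged with the iteration it ran in and
    # the amount of resources it was given; both end up as extra columns of
    # cv_results_ through the more_results mechanism.
    print(sh.cv_results_['iter'])
    print(sh.cv_results_['resource_iter'])
    print(sh.n_remaining_candidates_)  # candidates left after the last iteration
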
From 7c55a29228d106277c270b97916f671e397fbd63 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 23 Aug 2020 09:23:49 -0400 Subject: [PATCH 70/89] put import at its place --- sklearn/model_selection/tests/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index f3eb75a47f01e..5d91a505238ef 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -43,6 +43,7 @@ from sklearn.model_selection._split import _validate_shuffle_split from sklearn.model_selection._split import _build_repr +from sklearn.model_selection._split import _yields_constant_splits from sklearn.datasets import load_digits from sklearn.datasets import make_classification @@ -1656,5 +1657,4 @@ def test_random_state_shuffle_false(Klass): (StratifiedShuffleSplit(random_state=np.random.RandomState(0)), False), ]) def test_yields_constant_splits(cv, expected): - from sklearn.model_selection._split import _yields_constant_splits assert _yields_constant_splits(cv) == expected From 72ae48263f1928027f7ba66f46cb6042045dfbb3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 23 Aug 2020 09:27:56 -0400 Subject: [PATCH 71/89] some comment --- sklearn/model_selection/_split.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 9fb69603e5d59..8cbecb1340197 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -2238,7 +2238,11 @@ def _build_repr(self): def _yields_constant_splits(cv): - # return True if calling cv.split() always returns the same splits + # Return True if calling cv.split() always returns the same splits + # We assume that if a cv doesn't have a shuffle parameter, it shuffles by + # default (e.g. ShuffleSplit). If it actually doesn't shuffle (e.g. + # LeaveOneOut), then it won't have a random_state parameter anyway, in + # which case it will default to 0, leading to output=True shuffle = getattr(cv, 'shuffle', True) random_state = getattr(cv, 'random_state', 0) return isinstance(random_state, numbers.Integral) or not shuffle From a68bac41944bdc5f79feb2d780d05938a754efc0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 30 Aug 2020 11:00:45 -0400 Subject: [PATCH 72/89] Addressed comments --- .../_search_successive_halving.py | 36 +++++------ .../tests/test_successive_halving.py | 59 +++++++++++-------- 2 files changed, 52 insertions(+), 43 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index a13925f19af6c..0af845ef393c1 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -397,11 +397,11 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough resources to - reduce the candidates to at most `ratio` in the last iteration. If - ``True``, then the search process will 'replay' the first iteration - for as long as needed until the number of candidates is small enough. - This is ``False`` by default, which means that the last iteration may - evaluate more than ``ratio`` candidates. See + reduce the remaining candidates to at most `ratio` after the last + iteration. 
If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``ratio`` candidates. See :ref:`aggressive_elimination` for more details. cv : int, cross-validation generator or iterable, default=5 @@ -471,16 +471,16 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): Attributes ---------- + n_resources_ : list of int + The amount of resources used at each iteration. + n_candidates_ : list of int The number of candidate parameters that were evaluated at each iteration. - n_resources_ : list of int - The amount of resources used at each iteration. - n_remaining_candidates_ : int The number of candidate parameters that are left after the last - iteration. + iteration. It corresponds to `ceil(n_candidates[-1] / ratio)` max_resources_ : int The maximum number of resources that any candidate is allowed to use @@ -672,11 +672,11 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough resources to - reduce the candidates to at most `ratio` in the last iteration. If - ``True``, then the search process will 'replay' the first iteration - for as long as needed until the number of candidates is small enough. - This is ``False`` by default, which means that the last iteration may - evaluate more than ``ratio`` candidates. See + reduce the remaining candidates to at most `ratio` after the last + iteration. If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``ratio`` candidates. See :ref:`aggressive_elimination` for more details. cv : int, cross-validation generator or an iterable, default=5 @@ -748,16 +748,16 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): Attributes ---------- + n_resources_ : list of int + The amount of resources used at each iteration. + n_candidates_ : list of int The number of candidate parameters that were evaluated at each iteration. - n_resources_ : list of int - The amount of resources used at each iteration. - n_remaining_candidates_ : int The number of candidate parameters that are left after the last - iteration. + iteration. It corresponds to `ceil(n_candidates[-1] / ratio)` max_resources_ : int The maximum number of resources that any candidate is allowed to use diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 250359d326e79..be4dec720d067 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -1,3 +1,5 @@ +from math import ceil + import pytest from scipy.stats import norm, randint import numpy as np @@ -37,28 +39,33 @@ def get_params(self, deep=False): 'expected_n_required_iterations,' 'expected_n_possible_iterations,' 'expected_n_remaining_candidates,' + 'expected_n_candidates,' 'expected_n_resources,'), [ # notice how it loops at the beginning - (True, 'limited', 4, 4, 3, 1, [20, 20, 60, 180]), - # no aggressive elimination: we end up with less iterations and more - # candidates at the end - (False, 'limited', 3, 4, 3, 3, [20, 60, 180]), - # When the amount of resource isn't limited, aggressive_elimination - # has no effect. 
Here the default min_resources='exhaust' will take - # over. - (True, 'unlimited', 4, 4, 4, 1, [37, 111, 333, 999]), - (False, 'unlimited', 4, 4, 4, 1, [37, 111, 333, 999]), + # also, the number of candidates evaluated at the last iteration is + # <= ratio + (True, 'limited', 4, 4, 3, 1, [60, 20, 7, 3], [20, 20, 60, 180]), + # no aggressive elimination: we end up with less iterations, and + # the number of candidates at the last iter is > ratio, which isn't + # ideal + (False, 'limited', 3, 4, 3, 3, [60, 20, 7], [20, 60, 180]), + # # When the amount of resource isn't limited, aggressive_elimination + # # has no effect. Here the default min_resources='exhaust' will take + # # over. + (True, 'unlimited', 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), + (False, 'unlimited', 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), ] ) def test_aggressive_elimination( Est, aggressive_elimination, max_resources, expected_n_iterations, expected_n_required_iterations, expected_n_possible_iterations, - expected_n_remaining_candidates, expected_n_resources): + expected_n_remaining_candidates, expected_n_candidates, + expected_n_resources): # Test the aggressive_elimination parameter. n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) - parameters = {'a': ('l1', 'l2'), 'b': list(range(30))} + param_grid = {'a': ('l1', 'l2'), 'b': list(range(30))} base_estimator = FastClassifier() if max_resources == 'limited': @@ -66,10 +73,10 @@ def test_aggressive_elimination( else: max_resources = n_samples - sh = Est(base_estimator, parameters, + sh = Est(base_estimator, param_grid, aggressive_elimination=aggressive_elimination, - max_resources=max_resources, ratio=3, - verbose=True) # just for test coverage + max_resources=max_resources, ratio=3) + sh.set_params(verbose=True) # just for test coverage if Est is HalvingRandomSearchCV: # same number of candidates as with the grid @@ -81,7 +88,9 @@ def test_aggressive_elimination( assert sh.n_required_iterations_ == expected_n_required_iterations assert sh.n_possible_iterations_ == expected_n_possible_iterations assert sh.n_resources_ == expected_n_resources + assert sh.n_candidates_ == expected_n_candidates assert sh.n_remaining_candidates_ == expected_n_remaining_candidates + assert ceil(sh.n_candidates_[-1] / sh.ratio) == sh.n_remaining_candidates_ @pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) @@ -117,10 +126,10 @@ def test_min_max_resources( # the number of resources used at each iteration n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) - parameters = {'a': [1, 2], 'b': [1, 2, 3]} + param_grid = {'a': [1, 2], 'b': [1, 2, 3]} base_estimator = FastClassifier() - sh = Est(base_estimator, parameters, ratio=3, min_resources=min_resources, + sh = Est(base_estimator, param_grid, ratio=3, min_resources=min_resources, max_resources=max_resources) if Est is HalvingRandomSearchCV: sh.set_params(n_candidates=6) # same number as with the grid @@ -157,11 +166,11 @@ def test_n_iterations(Est, max_resources, n_iterations, n_possible_iterations): n_samples = 1024 X, y = make_classification(n_samples=n_samples, random_state=1) - parameters = {'a': [1, 2], 'b': list(range(10))} + param_grid = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() ratio = 2 - sh = Est(base_estimator, parameters, cv=2, ratio=ratio, + sh = Est(base_estimator, param_grid, cv=2, ratio=ratio, max_resources=max_resources, min_resources=4) if Est is HalvingRandomSearchCV: 
sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV @@ -177,9 +186,9 @@ def test_resource_parameter(Est): n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) - parameters = {'a': [1, 2], 'b': list(range(10))} + param_grid = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() - sh = Est(base_estimator, parameters, cv=2, resource='c', + sh = Est(base_estimator, param_grid, cv=2, resource='c', max_resources=10, ratio=3) sh.fit(X, y) assert set(sh.n_resources_) == set([1, 3, 9]) @@ -191,7 +200,7 @@ def test_resource_parameter(Est): with pytest.raises( ValueError, match='Cannot use resource=1234 which is not supported '): - sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, + sh = HalvingGridSearchCV(base_estimator, param_grid, cv=2, resource='1234', max_resources=10) sh.fit(X, y) @@ -199,8 +208,8 @@ def test_resource_parameter(Est): ValueError, match='Cannot use parameter c as the resource since it is part ' 'of the searched parameters.'): - parameters = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]} - sh = HalvingGridSearchCV(base_estimator, parameters, cv=2, + param_grid = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]} + sh = HalvingGridSearchCV(base_estimator, param_grid, cv=2, resource='c', max_resources=10) sh.fit(X, y) @@ -219,9 +228,9 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates): n_samples = 1024 X, y = make_classification(n_samples=n_samples, random_state=0) - parameters = {'a': norm, 'b': norm} + param_grid = {'a': norm, 'b': norm} base_estimator = FastClassifier() - sh = HalvingRandomSearchCV(base_estimator, parameters, + sh = HalvingRandomSearchCV(base_estimator, param_grid, n_candidates=n_candidates, cv=2, max_resources=max_resources, ratio=2, min_resources=4) From 5bf1586c33e4f59c613821b1d41d307d5352cc02 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 30 Aug 2020 14:14:25 -0400 Subject: [PATCH 73/89] Added tests for cv_results_ and base estimator inputs --- .../tests/test_successive_halving.py | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index be4dec720d067..9bb9e5d58a0d2 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -4,6 +4,7 @@ from scipy.stats import norm, randint import numpy as np +from sklearn.model_selection._validation import _fit_and_score from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier from sklearn.model_selection import HalvingGridSearchCV @@ -410,3 +411,145 @@ def test_refit_callable(): 'params': np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']), } assert _refit_callable(results) == 8 # index of 'i' + + +@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_cv_results(monkeypatch, Est): + # test that the cv_results_ matches correctly the logic of the + # tournament: in particular that the candidates continued in each + # successive iteration are those that were best in the previous iteration + pd = pytest.importorskip('pandas') + + rng = np.random.RandomState(0) + + def fit_and_score_mock(*args, **kwargs): + # generate random scores: we want to avoid ties, which would otherwise + # mess with the ordering and with our checks + out = _fit_and_score(*args, **kwargs) + out['test_scores'] = rng.rand() + return out + 
monkeypatch.setattr("sklearn.model_selection._search._fit_and_score", + fit_and_score_mock) + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {'a': ('l1', 'l2'), 'b': list(range(30))} + base_estimator = FastClassifier() + + sh = Est(base_estimator, param_grid, ratio=2) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources='exhaust') + + sh.fit(X, y) + df = pd.DataFrame(sh.cv_results_) + + # just make sure we don't have ties + assert len(df['mean_test_score'].unique()) == len(df) + + df['params_str'] = df['params'].apply(str) + table = df.pivot(index='params_str', columns='iter', + values='mean_test_score') + + # table looks like something like this: + # iter 0 1 2 3 4 5 + # params_str + # {'a': 'l2', 'b': 23} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l1', 'b': 30} 0.90 0.875 NaN NaN NaN NaN + # {'a': 'l1', 'b': 0} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l2', 'b': 3} 0.85 0.925 0.9125 0.90625 NaN NaN + # {'a': 'l1', 'b': 5} 0.80 NaN NaN NaN NaN NaN + # ... + + # where a NaN indicates that the candidate wasn't evaluated at a given + # iteration, because it wasn't part of the top-K at some previous + # iteration. We here make sure that candidates that aren't in the top-k at + # any given iteration are indeed not evaluated at the subsequent + # iterations. + + n_iter = sh.n_iterations_ + for it in range(n_iter - 1): + n_selected = sh.n_candidates_[it + 1] + table = table.sort_values(by=it) + not_selected = table[:-n_selected] + table = table[-n_selected:] + + assert not_selected[range(it + 1, n_iter)].isna().all(axis=None) + + # We now make sure that the best candidate is chosen only from the last + # iteration. + # We also make sure this is true even if there were higher scores in + # earlier rounds (this isn't generally the case, but worth ensuring it's + # possible). + + last_iter = df['iter'].max() + idx_best_last_iter = ( + df[df['iter'] == last_iter]['mean_test_score'].idxmax() + ) + idx_best_all_iters = df['mean_test_score'].idxmax() + + assert sh.best_params_ == df.iloc[idx_best_last_iter]['params'] + assert (df.iloc[idx_best_last_iter]['mean_test_score'] < + df.iloc[idx_best_all_iters]['mean_test_score']) + assert (df.iloc[idx_best_last_iter]['params'] != + df.iloc[idx_best_all_iters]['params']) + + +@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_base_estimator_inputs(Est): + # make sure that the base estimators are passed the correct parameters and + # number of samples at each iteration. 
+ pd = pytest.importorskip('pandas') + + passed_n_samples_fit = [] + passed_n_samples_predict = [] + passed_params = [] + class FastClassifierBookKeeping(FastClassifier): + + def fit(self, X, y): + passed_n_samples_fit.append(X.shape[0]) + return super().fit(X, y) + + def predict(self, X): + passed_n_samples_predict.append(X.shape[0]) + return super().predict(X) + + def set_params(self, **params): + passed_params.append(params) + return super().set_params(**params) + + n_samples = 1024 + n_splits = 2 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {'a': ('l1', 'l2'), 'b': list(range(30))} + base_estimator = FastClassifierBookKeeping() + + sh = Est(base_estimator, param_grid, ratio=2, cv=n_splits, + return_train_score=False, refit=False) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources='exhaust') + + sh.fit(X, y) + + assert len(passed_n_samples_fit) == len(passed_n_samples_predict) + passed_n_samples = [x + y for (x, y) in zip(passed_n_samples_fit, + passed_n_samples_predict)] + + # Lists are of length n_splits * n_iter * n_candidates_at_i. + # Each chunk of size n_splits corresponds to the n_splits folds for the + # same candidate at the same iteration, so they contain equal values. We + # subsample such that the lists are of length n_iter * n_candidates_at_it + passed_n_samples = passed_n_samples[::n_splits] + passed_params = passed_params[::n_splits] + + df = pd.DataFrame(sh.cv_results_) + + assert len(passed_params) == len(passed_n_samples) == len(df) + + uniques, counts = np.unique(passed_n_samples, return_counts=True) + assert (sh.n_resources_ == uniques).all() + assert (sh.n_candidates_ == counts).all() + + assert (df['params'] == passed_params).all() + assert (df['resource_iter'] == passed_n_samples).all() From ee4724b1b05d6e2d6003fee17f908f58aadbce65 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 30 Aug 2020 14:17:55 -0400 Subject: [PATCH 74/89] pep8 --- sklearn/model_selection/tests/test_successive_halving.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 9bb9e5d58a0d2..b6f86b6a98d2f 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -504,6 +504,7 @@ def test_base_estimator_inputs(Est): passed_n_samples_fit = [] passed_n_samples_predict = [] passed_params = [] + class FastClassifierBookKeeping(FastClassifier): def fit(self, X, y): From d8849f5e3ad0b93fea3c9dee67b1c8b65dff5180 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 31 Aug 2020 10:56:01 -0400 Subject: [PATCH 75/89] avoid monkeypatching --- .../tests/test_successive_halving.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index b6f86b6a98d2f..30bd531ae5d62 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -4,7 +4,6 @@ from scipy.stats import norm, randint import numpy as np -from sklearn.model_selection._validation import _fit_and_score from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier from sklearn.model_selection import HalvingGridSearchCV @@ -414,7 +413,7 @@ def test_refit_callable(): 
@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV)) -def test_cv_results(monkeypatch, Est): +def test_cv_results(Est): # test that the cv_results_ matches correctly the logic of the # tournament: in particular that the candidates continued in each # successive iteration are those that were best in the previous iteration @@ -422,21 +421,17 @@ def test_cv_results(monkeypatch, Est): rng = np.random.RandomState(0) - def fit_and_score_mock(*args, **kwargs): - # generate random scores: we want to avoid ties, which would otherwise - # mess with the ordering and with our checks - out = _fit_and_score(*args, **kwargs) - out['test_scores'] = rng.rand() - return out - monkeypatch.setattr("sklearn.model_selection._search._fit_and_score", - fit_and_score_mock) - n_samples = 1000 X, y = make_classification(n_samples=n_samples, random_state=0) param_grid = {'a': ('l1', 'l2'), 'b': list(range(30))} base_estimator = FastClassifier() - sh = Est(base_estimator, param_grid, ratio=2) + # generate random scores: we want to avoid ties, which would otherwise + # mess with the ordering and make testing harder + def scorer(est, X, y): + return rng.rand() + + sh = Est(base_estimator, param_grid, ratio=2, scoring=scorer) if Est is HalvingRandomSearchCV: # same number of candidates as with the grid sh.set_params(n_candidates=2 * 30, min_resources='exhaust') From be849cbb0b6b2245b1aa84385799f1f449400bc5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 31 Aug 2020 10:58:48 -0400 Subject: [PATCH 76/89] rename df --- .../tests/test_successive_halving.py | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 30bd531ae5d62..4aa1845a66b64 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -437,14 +437,14 @@ def scorer(est, X, y): sh.set_params(n_candidates=2 * 30, min_resources='exhaust') sh.fit(X, y) - df = pd.DataFrame(sh.cv_results_) + cv_results_df = pd.DataFrame(sh.cv_results_) # just make sure we don't have ties - assert len(df['mean_test_score'].unique()) == len(df) + assert len(cv_results_df['mean_test_score'].unique()) == len(cv_results_df) - df['params_str'] = df['params'].apply(str) - table = df.pivot(index='params_str', columns='iter', - values='mean_test_score') + cv_results_df['params_str'] = cv_results_df['params'].apply(str) + table = cv_results_df.pivot(index='params_str', columns='iter', + values='mean_test_score') # table looks like something like this: # iter 0 1 2 3 4 5 @@ -477,17 +477,18 @@ def scorer(est, X, y): # earlier rounds (this isn't generally the case, but worth ensuring it's # possible). 
- last_iter = df['iter'].max() + last_iter = cv_results_df['iter'].max() idx_best_last_iter = ( - df[df['iter'] == last_iter]['mean_test_score'].idxmax() + cv_results_df[cv_results_df['iter'] == last_iter] + ['mean_test_score'].idxmax() ) - idx_best_all_iters = df['mean_test_score'].idxmax() + idx_best_all_iters = cv_results_df['mean_test_score'].idxmax() - assert sh.best_params_ == df.iloc[idx_best_last_iter]['params'] - assert (df.iloc[idx_best_last_iter]['mean_test_score'] < - df.iloc[idx_best_all_iters]['mean_test_score']) - assert (df.iloc[idx_best_last_iter]['params'] != - df.iloc[idx_best_all_iters]['params']) + assert sh.best_params_ == cv_results_df.iloc[idx_best_last_iter]['params'] + assert (cv_results_df.iloc[idx_best_last_iter]['mean_test_score'] < + cv_results_df.iloc[idx_best_all_iters]['mean_test_score']) + assert (cv_results_df.iloc[idx_best_last_iter]['params'] != + cv_results_df.iloc[idx_best_all_iters]['params']) @pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) @@ -539,13 +540,13 @@ def set_params(self, **params): passed_n_samples = passed_n_samples[::n_splits] passed_params = passed_params[::n_splits] - df = pd.DataFrame(sh.cv_results_) + cv_results_df = pd.DataFrame(sh.cv_results_) - assert len(passed_params) == len(passed_n_samples) == len(df) + assert len(passed_params) == len(passed_n_samples) == len(cv_results_df) uniques, counts = np.unique(passed_n_samples, return_counts=True) assert (sh.n_resources_ == uniques).all() assert (sh.n_candidates_ == counts).all() - assert (df['params'] == passed_params).all() - assert (df['resource_iter'] == passed_n_samples).all() + assert (cv_results_df['params'] == passed_params).all() + assert (cv_results_df['resource_iter'] == passed_n_samples).all() From 0064d49fc5154924a04867797f43cebdd9769e0a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 31 Aug 2020 11:56:17 -0400 Subject: [PATCH 77/89] use Joel's suggestions for testing masks --- .../tests/test_successive_halving.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 4aa1845a66b64..04b2d49fe2837 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -461,15 +461,26 @@ def scorer(est, X, y): # iteration. We here make sure that candidates that aren't in the top-k at # any given iteration are indeed not evaluated at the subsequent # iterations. 
- + nan_mask = pd.isna(table) n_iter = sh.n_iterations_ for it in range(n_iter - 1): - n_selected = sh.n_candidates_[it + 1] - table = table.sort_values(by=it) - not_selected = table[:-n_selected] - table = table[-n_selected:] - - assert not_selected[range(it + 1, n_iter)].isna().all(axis=None) + already_discarded_mask = nan_mask[it] + + # make sure that if a candidate is already discarded, we don't evaluate + # it later + assert (already_discarded_mask & nan_mask[it + 1] == + already_discarded_mask).all() + + # make sure that the number of discarded candidate is correct + discarded_now_mask = ~already_discarded_mask & nan_mask[it + 1] + kept_mask = ~already_discarded_mask & ~discarded_now_mask + assert kept_mask.sum() == sh.n_candidates_[it + 1] + + # make sure that all discarded candidates have a lower score than the + # kept candidates + discarded_max_score = table[it].where(discarded_now_mask).max() + kept_min_score = table[it].where(kept_mask).min() + assert discarded_max_score < kept_min_score # We now make sure that the best candidate is chosen only from the last # iteration. From af5a809bec0d4c7f8939b61c5033845f0851ce81 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 31 Aug 2020 12:37:12 -0400 Subject: [PATCH 78/89] Made it experimental --- doc/modules/grid_search.rst | 12 ++++++ .../experimental/enable_successive_halving.py | 35 +++++++++++++++ .../tests/test_enable_successive_halving.py | 43 +++++++++++++++++++ sklearn/model_selection/__init__.py | 17 +++++--- .../_search_successive_halving.py | 24 +++++++++++ .../tests/test_successive_halving.py | 1 + 6 files changed, 126 insertions(+), 6 deletions(-) create mode 100644 sklearn/experimental/enable_successive_halving.py create mode 100644 sklearn/experimental/tests/test_enable_successive_halving.py diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 78734ff489531..b574c7c7f4f83 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -210,6 +210,15 @@ resources is small. More control is available through tuning the ``min_resources`` parameter. Each parameter and their interactions are described in more details below. +These estimators are still **experimental**: their predictions +and their API might change without any deprecation cycle. To use them, you +need to explicitly import ``enable_successive_halving``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_successive_halving # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingGridSearchCV + .. topic:: Examples: * :ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_heatmap.py` @@ -339,6 +348,7 @@ terms of the number of estimators of a random forest:: >>> from sklearn.datasets import make_classification >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_successive_halving # noqa >>> from sklearn.model_selection import HalvingGridSearchCV >>> import pandas as pd >>> @@ -367,6 +377,7 @@ resources, some of them might be wasted (i.e. 
not used):: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC + >>> from sklearn.experimental import enable_successive_halving # noqa >>> from sklearn.model_selection import HalvingGridSearchCV >>> import pandas as pd >>> param_grid= {'kernel': ('linear', 'rbf'), @@ -423,6 +434,7 @@ more than ``ratio`` candidates:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC + >>> from sklearn.experimental import enable_successive_halving # noqa >>> from sklearn.model_selection import HalvingGridSearchCV >>> import pandas as pd >>> diff --git a/sklearn/experimental/enable_successive_halving.py b/sklearn/experimental/enable_successive_halving.py new file mode 100644 index 0000000000000..147a622d4fdae --- /dev/null +++ b/sklearn/experimental/enable_successive_halving.py @@ -0,0 +1,35 @@ +"""Enables Successive Halving search-estimators + +The API and results of these estimators might change without any deprecation +cycle. + +Importing this file dynamically sets the +:class:`~sklearn.model_selection.HalvingRandomSearchCV` and +:class:`~sklearn.model_selection.HalvingGridSearchCV` as attributes of the +`model_selection` module:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_successive_halving # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingRandomSearchCV + >>> from sklearn.model_selection import HalvingGridSearchCV + + +The ``# noqa`` comment comment can be removed: it just tells linters like +flake8 to ignore the import, which appears as unused. +""" + +from ..model_selection._search_successive_halving import ( + HalvingRandomSearchCV, + HalvingGridSearchCV +) + +from .. import model_selection + +# use settattr to avoid mypy errors when monkeypatching +setattr(model_selection, "HalvingRandomSearchCV", + HalvingRandomSearchCV) +setattr(model_selection, "HalvingGridSearchCV", + HalvingGridSearchCV) + +model_selection.__all__ += ['HalvingRandomSearchCV', 'HalvingGridSearchCV'] diff --git a/sklearn/experimental/tests/test_enable_successive_halving.py b/sklearn/experimental/tests/test_enable_successive_halving.py new file mode 100644 index 0000000000000..bfd05bc302c79 --- /dev/null +++ b/sklearn/experimental/tests/test_enable_successive_halving.py @@ -0,0 +1,43 @@ +"""Tests for making sure experimental imports work as expected.""" + +import textwrap + +from sklearn.utils._testing import assert_run_python_script + + +def test_imports_strategies(): + # Make sure different import strategies work or fail as expected. + + # Since Python caches the imported modules, we need to run a child process + # for every test case. Else, the tests would not be independent + # (manually removing the imports from the cache (sys.modules) is not + # recommended and can lead to many complications). 
+ + good_import = """ + from sklearn.experimental import enable_successive_halving + from sklearn.model_selection import HalvingGridSearchCV + from sklearn.model_selection import HalvingRandomSearchCV + """ + assert_run_python_script(textwrap.dedent(good_import)) + + good_import_with_model_selection_first = """ + import sklearn.model_selection + from sklearn.experimental import enable_successive_halving + from sklearn.model_selection import HalvingGridSearchCV + from sklearn.model_selection import HalvingRandomSearchCV + """ + assert_run_python_script( + textwrap.dedent(good_import_with_model_selection_first) + ) + + bad_imports = """ + import pytest + + with pytest.raises(ImportError): + from sklearn.model_selection import HalvingGridSearchCV + + import sklearn.experimental + with pytest.raises(ImportError): + from sklearn.model_selection import HalvingGridSearchCV + """ + assert_run_python_script(textwrap.dedent(bad_imports)) diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 70b671c446d72..897183414b5a6 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -1,3 +1,5 @@ +import typing + from ._split import BaseCrossValidator from ._split import KFold from ._split import GroupKFold @@ -29,12 +31,16 @@ from ._search import ParameterSampler from ._search import fit_grid_point -from ._search_successive_halving import HalvingGridSearchCV -from ._search_successive_halving import HalvingRandomSearchCV +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental. + from ._search_successive_halving import ( # noqa + HalvingGridSearchCV, HalvingRandomSearchCV + ) + -__all__ = ('BaseCrossValidator', +__all__ = ['BaseCrossValidator', 'GridSearchCV', - 'HalvingGridSearchCV', 'TimeSeriesSplit', 'KFold', 'GroupKFold', @@ -49,7 +55,6 @@ 'ParameterSampler', 'PredefinedSplit', 'RandomizedSearchCV', - 'HalvingRandomSearchCV', 'ShuffleSplit', 'StratifiedKFold', 'StratifiedShuffleSplit', @@ -61,4 +66,4 @@ 'learning_curve', 'permutation_test_score', 'train_test_split', - 'validation_curve') + 'validation_curve'] diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 0af845ef393c1..031dae4851db1 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -341,6 +341,17 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): Read more in the :ref:`User guide `. + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_successive_halving``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_successive_halving # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingGridSearchCV + Parameters ---------- estimator : estimator object. @@ -563,6 +574,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): >>> from sklearn.datasets import load_iris >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_successive_halving # noqa >>> from sklearn.model_selection import HalvingGridSearchCV ... 
>>> X, y = load_iris(return_X_y=True) @@ -610,6 +622,17 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): Read more in the :ref:`User guide`. + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_successive_halving``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_successive_halving # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingRandomSearchCV + Parameters ---------- estimator : estimator object. @@ -840,6 +863,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): >>> from sklearn.datasets import load_iris >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_successive_halving # noqa >>> from sklearn.model_selection import HalvingRandomSearchCV >>> from scipy.stats import randint ... diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 04b2d49fe2837..9d377b6b9ba87 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -6,6 +6,7 @@ from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier +from sklearn.experimental import enable_successive_halving # noqa from sklearn.model_selection import HalvingGridSearchCV from sklearn.model_selection import HalvingRandomSearchCV from sklearn.model_selection import KFold, ShuffleSplit From 2b3967789e4b77e2aa976a7b90ea0746bd5f89ce Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 31 Aug 2020 13:01:32 -0400 Subject: [PATCH 79/89] Should fix docs --- doc/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/conf.py b/doc/conf.py index ccf5dcd068131..b09c5a15b133d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -356,6 +356,7 @@ def __call__(self, directory): # discovered properly by sphinx from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.experimental import enable_successive_halving # noqa def make_carousel_thumbs(app, exception): From 46afbca897d94c0c25b48d9315b9ee62e98a2b34 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 2 Sep 2020 12:44:02 -0400 Subject: [PATCH 80/89] whats new entry --- doc/whats_new/v0.24.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 15e901218eeb7..baecde78dc5c6 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -356,6 +356,14 @@ Changelog :pr:`17478` by :user:`Teon Brooks ` and :user:`Mohamed Maskani `. +- |Feature| Added (experimental) parameter search estimators + :class:`model_selection.HalvingRandomSearchCV` and + :class:`model_selection.HalvingGridSearchCV` which implement Successive + Halving, and can be used as a drop-in replacements for + :class:`model_selection.RandomizedSearchCV` and + :class:`model_selection.GridSearchCV`. :pr:`13900` by `Nicolas Hug`_, `Joel + Nothman`_ and `Andreas Müller`_. + - |Fix| Fixed the `len` of :class:`model_selection.ParameterSampler` when all distributions are lists and `n_iter` is more than the number of unique parameter combinations. :pr:`18222` by `Nicolas Hug`_. 
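The changelog entry above describes the new estimators as drop-in replacements for the existing
search estimators. A minimal sketch of that usage follows; the dataset, parameter grid and the
printed attributes are chosen purely for illustration, and the experimental import is the one
added earlier in this series::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.experimental import enable_successive_halving  # noqa
    from sklearn.model_selection import HalvingGridSearchCV

    X, y = make_classification(n_samples=1000, random_state=0)
    param_grid = {'max_depth': [3, 5, None],
                  'min_samples_split': [2, 5, 10]}

    # Same constructor/fit pattern as GridSearchCV, with the extra
    # successive-halving knobs (here only ``ratio``).
    sh = HalvingGridSearchCV(RandomForestClassifier(random_state=0),
                             param_grid, ratio=3).fit(X, y)

    print(sh.best_params_)             # best candidate from the last iteration
    print(sh.n_resources_)             # resources (samples) used per iteration
    print(sh.n_candidates_)            # candidates evaluated per iteration
    print(sh.n_remaining_candidates_)  # candidates left after the last iteration
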
From d8c25196c23eb93a08a78667b76c1a8fd60b4d49 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 5 Sep 2020 12:48:32 -0400 Subject: [PATCH 81/89] Apply suggestions from code review Co-authored-by: Andreas Mueller --- doc/modules/grid_search.rst | 6 +++--- sklearn/model_selection/_search.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index b574c7c7f4f83..1ebe75096552d 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -38,7 +38,7 @@ distribution. Both these tools have successive halving counterparts :class:`HalvingGridSearchCV` and :class:`HalvingRandomSearchCV`, which can be much faster at finding a good parameter combination. -After describing these tools we detail, :ref:`best practices +After describing these tools we detail :ref:`best practices ` applicable to these approaches. Some models allow for specialized, efficient parameter search strategies, outlined in :ref:`alternative_cv`. @@ -202,7 +202,7 @@ the rate at which the number of candidates decreases. In each iteration, the number of resources per candidate is multiplied by ``ratio`` and the number of candidates is divided by the same ratio. Along with ``resource`` and ``min_resources``, ``ratio`` is the most important parameter to control the -search in our implementation. ``ratio`` effectively controls the number of +search in our implementation, though a value of 3 usually works well. ``ratio`` effectively controls the number of iterations in :class:`HalvingGridSearchCV` and the number of candidates (if 'auto') and iterations in :class:`HalvingRandomSearchCV`. ``aggressive_elimination=True`` can also be used if the number of available @@ -518,7 +518,7 @@ since it has reached the last iteration (3) with the highest score: `Non-stochastic Best Arm Identification and Hyperparameter Optimization `_, in proc. of Machine Learning Research, 2016. - .. [2] L. Li, K. Jamieson, G. DeSalvo, A. Rostamizadeh, .A Talwalkar, + .. [2] L. Li, K. Jamieson, G. DeSalvo, A. Rostamizadeh, A. Talwalkar, `Hyperband: A Novel Bandit-Based Approach to Hyperparameter Optimization `_, in Machine Learning Research 18, 2018. diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 7f67a11028b3e..d0b124f5599bb 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -640,7 +640,7 @@ def _run_search(self, evaluate_candidates): collected evaluation results. This makes it possible to implement Bayesian optimization or more generally sequential model-based optimization by deriving from the BaseSearchCV abstract base class. - For example, Succesive Halving is implemented by calling + For example, Successive Halving is implemented by calling `evaluate_candidates` multiples times (once per iteration of the SH process), each time passing a different set of candidates with `X` and `y` of increasing sizes. From b537ce7990daa89c40505edff1a08be0c420cdc1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 5 Sep 2020 13:38:09 -0400 Subject: [PATCH 82/89] Addressed comments to docs --- doc/modules/grid_search.rst | 79 +++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 1ebe75096552d..320b1f26775eb 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -183,15 +183,15 @@ halving (SH) is like a tournament among candidate parameter combinations. 
SH is an iterative selection process where all candidates (the parameter combinations) are evaluated with a small amount of resources at the first iteration. Only some of these candidates are selected for the next -iteration, which will be allocated more resources. What defines a resource is -typically the number of samples to train on, but it can also be an arbitrary -numeric parameter such as `n_estimators` in a random forest. +iteration, which will be allocated more resources. For parameter tuning, the +resource is typically the number of training samples, but it can also be an +arbitrary numeric parameter such as `n_estimators` in a random forest. -As illustrated in the figure below, only a small subset of candidates +As illustrated in the figure below, only a subset of candidates 'survive' until the last iteration. These are the candidates that have -consistently ranked among the best candidates across all iterations. Each -iteration is allocated an increasing amount of resources per candidate, here -the number of samples. +consistently ranked among the top-scoring candidates across all iterations. +Each iteration is allocated an increasing amount of resources per candidate, +here the number of samples. .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_successive_halving_iterations_001.png :target: ../auto_examples/model_selection/plot_successive_halving_iterations.html @@ -202,13 +202,13 @@ the rate at which the number of candidates decreases. In each iteration, the number of resources per candidate is multiplied by ``ratio`` and the number of candidates is divided by the same ratio. Along with ``resource`` and ``min_resources``, ``ratio`` is the most important parameter to control the -search in our implementation, though a value of 3 usually works well. ``ratio`` effectively controls the number of -iterations in :class:`HalvingGridSearchCV` and the number of candidates (if -'auto') and iterations in :class:`HalvingRandomSearchCV`. -``aggressive_elimination=True`` can also be used if the number of available -resources is small. More control is available through tuning the -``min_resources`` parameter. Each parameter and their interactions are -described in more details below. +search in our implementation, though a value of 3 usually works well. +``ratio`` effectively controls the number of iterations in +:class:`HalvingGridSearchCV` and the number of candidates (if 'auto') and +iterations in :class:`HalvingRandomSearchCV`. ``aggressive_elimination=True`` +can also be used if the number of available resources is small. More control +is available through tuning the ``min_resources`` parameter. Each parameter +and their interactions are described in more details below. These estimators are still **experimental**: their predictions and their API might change without any deprecation cycle. To use them, you @@ -242,20 +242,21 @@ samples: ``[10, 20, 40, 80, 160, 320, 640]``. But depending on the number of candidates, we might run less than 7 iterations: if we start with a **small** number of candidates, the last -iteration might use less than 640 samples, which is a waste of resources. For -example if we start with 5 candidates, we only need 2 iterations: 5 -candidates for the first iteration, then `5 // 2 = 2` candidates at the -second iteration, after which we know which candidate performs the best (so -we don't need a third one). We would only be using at most 20 samples which -is a waste since we have 1000 samples at our disposal. 
-On the other hand, if we start with a **high** number of candidates, we might -end up with a lot of candidates at the last iteration, which is not always -ideal. +iteration might use less than 640 samples, which means not using all the +available resources (samples). For example if we start with 5 candidates, we +only need 2 iterations: 5 candidates for the first iteration, then +`5 // 2 = 2` candidates at the second iteration, after which we know which +candidate performs the best (so we don't need a third one). We would only be +using at most 20 samples which is a waste since we have 1000 samples at our +disposal. On the other hand, if we start with a **high** number of +candidates, we might end up with a lot of candidates at the last iteration, +which may not always be ideal: it means that many candidates will run with +the full resources, basically reducing the procedure to standard search. In the case of :class:`HalvingRandomSearchCV`, the number of candidates is set -by default such that the maximum amount of resources is used at the last -iteration. For :class:`HalvingGridSearchCV`, the number of candidates is -determined by the `param_grid` parameter. Changing the value of +by default such that the last iteration uses as much of the available +resources as possible. For :class:`HalvingGridSearchCV`, the number of +candidates is determined by the `param_grid` parameter. Changing the value of ``min_resources`` will impact the number of possible iterations, and as a result will also have an effect on the ideal number of candidates. @@ -269,9 +270,13 @@ speed up the computation. Notice in the example above that the last iteration does not use the maximum amount of resources available: 1000 samples are available, yet only 640 are -used, at most. By default, ``min_resources`` is set to a specific value such -that the last iteration uses as many samples as possible. Please see -:ref:`exhausting_the_resources` for details. +used, at most. By default, both :class:`HalvingRandomSearchCV` and +:class:`HalvingGridSearchCV` try to use as many resources as possible in the +last iteration (with the constraint that this amount of resources must be a +multiple of both `min_resources` and `ratio`). :class:`HalvingRandomSearchCV` +achieves this by sampling the right amount of candidates, while +:class:`HalvingGridSearchCV` achieves this by properly setting +`min_resources`. Please see :ref:`exhausting_the_resources` for details. .. _amount_of_resource_and_number_of_candidates: @@ -329,8 +334,10 @@ We can note that: candidates: the best candidate is the best out of these 2 candidates. It is not necessary to run an additional iteration, since it would only evaluate one candidate (namely the best one, which we have already - identified). For this reason, **in general, we want the last iteration to - run at most ``ratio`` candidates**. + identified). For this reason, in general, we want the last iteration to + run at most ``ratio`` candidates. If the last iteration evaluates more + than `ratio` candidates, then this last iteration reduces to a regular + search (as in :class:`RandomizedSearchCV` or :class:`GridSearchCV`). - each ``resource_iter`` is a multiple of both ``ratio`` and ``min_resources`` (which is confirmed by its definition above). @@ -563,6 +570,9 @@ result in an error when using multiple metrics. See :ref:`sphx_glr_auto_examples_model_selection_plot_multi_metric_evaluation.py` for an example usage. 
+:class:`HalvingRandomSearchCV` and :class:`HalvingGridSearchCV` do not support +multimetric scoring. + .. _composite_grid_search: Composite estimators and parameter spaces @@ -628,9 +638,10 @@ utility function. Parallelism ----------- -The parameter search tools evaluate each parameter setting independently. -Computations can be run in parallel if your OS supports it, by using the -keyword ``n_jobs=-1``. See function signature for more details. +The parameter search tools evaluate each parameter combination on each data +fold independently. Computations can be run in parallel by using the keyword +``n_jobs=-1``. See function signature for more details, and also the Glossary +entry for :term:`n_jobs`. Robustness to failure --------------------- From 54a6276aa10bf573bc96acf12c9842bc6d29e656 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 5 Sep 2020 14:00:54 -0400 Subject: [PATCH 83/89] Addressed comments in examples --- doc/modules/grid_search.rst | 2 +- .../plot_successive_halving_heatmap.py | 35 +++++++------------ .../plot_successive_halving_iterations.py | 5 +-- 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 320b1f26775eb..267b4ccb104fd 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -215,7 +215,7 @@ and their API might change without any deprecation cycle. To use them, you need to explicitly import ``enable_successive_halving``:: >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_successive_halving # noqa + >>> from sklearn.experimental import enable_successive_halving # noqa >>> # now you can import normally from model_selection >>> from sklearn.model_selection import HalvingGridSearchCV diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index 6f43ff367a6a2..b15ca9caff692 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -16,6 +16,7 @@ from sklearn.svm import SVC from sklearn import datasets from sklearn.model_selection import GridSearchCV +from sklearn.experimental import enable_successive_halving # noqa from sklearn.model_selection import HalvingGridSearchCV @@ -37,22 +38,13 @@ clf = SVC(random_state=rng) tic = time() -gsh = HalvingGridSearchCV( - estimator=clf, - param_grid=param_grid, - resource='n_samples', - max_resources='auto', # max_resources=n_samples - cv=5, - ratio=2, - random_state=rng) +gsh = HalvingGridSearchCV(estimator=clf, param_grid=param_grid, ratio=2, + random_state=rng) gsh.fit(X, y) gsh_time = time() - tic tic = time() -gs = GridSearchCV( - estimator=clf, - param_grid=param_grid, - cv=5) +gs = GridSearchCV(estimator=clf, param_grid=param_grid) gs.fit(X, y) gs_time = time() - tic @@ -66,15 +58,13 @@ def make_heatmap(ax, gs, is_sh=False, make_cbar=False): results['params_str'] = results.params.apply(str) if is_sh: # SH dataframe: get mean_test_score values for the highest iter - scores_matrix = ( - results.sort_values('iter').groupby(['param_gamma', 'param_C']) - .last()['mean_test_score'].unstack() + scores_matrix = results.sort_values('iter').pivot_table( + index='param_gamma', columns='param_C', + values='mean_test_score', aggfunc='last' ) else: - scores_matrix = ( - results.set_index(['param_gamma', 'param_C'])['mean_test_score'] - .unstack() - ) + scores_matrix = results.pivot(index='param_gamma', columns='param_C', + 
values='mean_test_score') im = ax.imshow(scores_matrix) @@ -91,11 +81,12 @@ def make_heatmap(ax, gs, is_sh=False, make_cbar=False): rotation_mode="anchor") if is_sh: - iterations = results.groupby(['param_gamma', 'param_C'])['iter'].max() - iterations_matrix = iterations.unstack().values + iterations = results.pivot_table(index='param_gamma', + columns='param_C', values='iter', + aggfunc='max').values for i in range(len(gammas)): for j in range(len(Cs)): - ax.text(j, i, iterations_matrix[i, j], + ax.text(j, i, iterations[i, j], ha="center", va="center", color="w", fontsize=20) if make_cbar: diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 35d78206953e8..9f6023dbe31d1 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -14,6 +14,7 @@ from scipy.stats import randint import numpy as np +from sklearn.experimental import enable_successive_halving # noqa from sklearn.model_selection import HalvingRandomSearchCV from sklearn.ensemble import RandomForestClassifier @@ -39,10 +40,6 @@ rsh = HalvingRandomSearchCV( estimator=clf, param_distributions=param_dist, - resource='n_samples', - max_resources='auto', # max_resources=n_samples - n_candidates='exhaust', - cv=5, ratio=2, random_state=rng) rsh.fit(X, y) From 8adf44e12ddc73a26ae3f776267347a52a3460a5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 6 Sep 2020 09:02:09 -0400 Subject: [PATCH 84/89] minor doc update --- doc/modules/grid_search.rst | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 267b4ccb104fd..1f307e9b3e8f3 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -197,18 +197,19 @@ here the number of samples. :target: ../auto_examples/model_selection/plot_successive_halving_iterations.html :align: center -The ``ratio`` parameter controls the rate at which the resources grow, and +We here briefly describe the main parameters, but each parameter and their +interactions are described in more details in the sections below. The +``ratio`` (> 1) parameter controls the rate at which the resources grow, and the rate at which the number of candidates decreases. In each iteration, the number of resources per candidate is multiplied by ``ratio`` and the number of candidates is divided by the same ratio. Along with ``resource`` and ``min_resources``, ``ratio`` is the most important parameter to control the search in our implementation, though a value of 3 usually works well. ``ratio`` effectively controls the number of iterations in -:class:`HalvingGridSearchCV` and the number of candidates (if 'auto') and +:class:`HalvingGridSearchCV` and the number of candidates (by default) and iterations in :class:`HalvingRandomSearchCV`. ``aggressive_elimination=True`` can also be used if the number of available resources is small. More control -is available through tuning the ``min_resources`` parameter. Each parameter -and their interactions are described in more details below. +is available through tuning the ``min_resources`` parameter. These estimators are still **experimental**: their predictions and their API might change without any deprecation cycle. 
To use them, you @@ -218,6 +219,7 @@ need to explicitly import ``enable_successive_halving``:: >>> from sklearn.experimental import enable_successive_halving # noqa >>> # now you can import normally from model_selection >>> from sklearn.model_selection import HalvingGridSearchCV + >>> from sklearn.model_selection import HalvingRandomSearchCV .. topic:: Examples: @@ -272,11 +274,12 @@ Notice in the example above that the last iteration does not use the maximum amount of resources available: 1000 samples are available, yet only 640 are used, at most. By default, both :class:`HalvingRandomSearchCV` and :class:`HalvingGridSearchCV` try to use as many resources as possible in the -last iteration (with the constraint that this amount of resources must be a -multiple of both `min_resources` and `ratio`). :class:`HalvingRandomSearchCV` -achieves this by sampling the right amount of candidates, while -:class:`HalvingGridSearchCV` achieves this by properly setting -`min_resources`. Please see :ref:`exhausting_the_resources` for details. +last iteration, with the constraint that this amount of resources must be a +multiple of both `min_resources` and `ratio` (this constraint will be clear +in the next section). :class:`HalvingRandomSearchCV` achieves this by +sampling the right amount of candidates, while :class:`HalvingGridSearchCV` +achieves this by properly setting `min_resources`. Please see +:ref:`exhausting_the_resources` for details. .. _amount_of_resource_and_number_of_candidates: From 3d9617821d4ac08f0eba34522091d6822374825c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 7 Sep 2020 19:36:54 -0400 Subject: [PATCH 85/89] minor renaming in UG --- doc/modules/grid_search.rst | 28 ++++++++------- .../_search_successive_halving.py | 36 +++++++++---------- .../tests/test_successive_halving.py | 10 +++--- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 1f307e9b3e8f3..42c416745626c 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -287,21 +287,25 @@ Amount of resource and number of candidates at each iteration ------------------------------------------------------------- At any iteration `i`, each candidate is allocated a given amount of resources -which we denote `resource_iter`. This quantity is controlled by the +which we denote `n_resources_i`. This quantity is controlled by the parameters ``ratio`` and ``min_resources`` as follows (`ratio` is strictly greater than 1):: - resource_iter = ratio**i * min_resources, + n_resources_i = ratio**i * min_resources, -where ``min_resources`` is the amount of resources used at the first -iteration. ``ratio`` also defines the proportions of candidates that -will be selected for the next iteration:: +or equivalently:: + + n_resources_{i+1} = n_resources_i * ratio + +where ``min_resources == n_resources_0`` is the amount of resources used at +the first iteration. ``ratio`` also defines the proportions of candidates +that will be selected for the next iteration:: - n_candidates_iter = n_candidates // (ratio ** i) + n_candidates_i = n_candidates_0 // (ratio ** i) or equivalently:: - n_candidates_at_i+1 = n_candidates_at_i // ratio + n_candidates_{i+1} = n_candidates_i // ratio So in the first iteration, we use ``min_resources`` resources ``n_candidates`` times. 
In the second iteration, we use ``min_resources * @@ -316,7 +320,7 @@ Here is an example with ``min_resources=3`` and ``ratio=2``, starting with 70 candidates: +-----------------------+-----------------------+ -| ``resource_iter`` | ``n_candidates_at_i`` | +| ``n_resources`` | ``n_candidates_i`` | +=======================+=======================+ | 3 (=min_resources) | 70 (=n_candidates) | +-----------------------+-----------------------+ @@ -341,7 +345,7 @@ We can note that: run at most ``ratio`` candidates. If the last iteration evaluates more than `ratio` candidates, then this last iteration reduces to a regular search (as in :class:`RandomizedSearchCV` or :class:`GridSearchCV`). -- each ``resource_iter`` is a multiple of both ``ratio`` and +- each ``n_resources`` is a multiple of both ``ratio`` and ``min_resources`` (which is confirmed by its definition above). The amount of resources that is used at each iteration can be found in the @@ -481,7 +485,7 @@ necessary using ``min_resources`` resources:: [6, 3, 2] Notice that we end with 2 candidates at the last iteration since we have -eliminated enough candidates during the first iterations, using ``resource_iter = +eliminated enough candidates during the first iterations, using ``n_resources = min_resources = 20``. .. _successive_halving_cv_results: @@ -499,7 +503,7 @@ additional information related to the successive halving process. Here is an example with some of the columns of a (truncated) dataframe: ==== ====== =============== ================= ======================================================================================= - .. iter resource_iter mean_test_score params + .. iter n_resources mean_test_score params ==== ====== =============== ================= ======================================================================================= 0 0 125 0.983667 {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 5} 1 0 125 0.983667 {'criterion': 'gini', 'max_depth': None, 'max_features': 8, 'min_samples_split': 7} @@ -514,7 +518,7 @@ Here is an example with some of the columns of a (truncated) dataframe: ==== ====== =============== ================= ======================================================================================= Each row corresponds to a given parameter combination (a candidate) and a given -iteration. The iteration is given by the ``iter`` column. The ``resource_iter`` +iteration. The iteration is given by the ``iter`` column. The ``n_resources`` column tells you how many resources were used. 
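A minimal, hypothetical sketch of how these columns can be inspected, assuming the API as it stands at this point in the series (the halving parameter is still called ``ratio`` here and is renamed to ``factor`` in a later commit; the experimental import also differs from released scikit-learn; the estimator, grid and dataset are only illustrative)::

    import pandas as pd

    from sklearn.datasets import make_classification
    from sklearn.experimental import enable_successive_halving  # noqa
    from sklearn.model_selection import HalvingGridSearchCV
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=1000, random_state=0)
    param_grid = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1]}
    sh = HalvingGridSearchCV(SVC(), param_grid, ratio=2,
                             random_state=0).fit(X, y)

    # One row per (candidate, iteration); 'iter' and 'n_resources' are the
    # extra successive-halving columns described above.
    results = pd.DataFrame(sh.cv_results_)
    print(results[['iter', 'n_resources', 'mean_test_score', 'params']])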
In the example above, the best parameter combination is ``{'criterion': diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 031dae4851db1..2afcae4756ee3 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -48,14 +48,14 @@ def _refit_callable(results): return last_iter_indices[best_idx] -def _top_k(results, k, iter_i): +def _top_k(results, k, itr): # Return the best candidates of a given iteration iteration, mean_test_score, params = ( np.asarray(a) for a in (results['iter'], results['mean_test_score'], results['params']) ) - iter_indices = np.flatnonzero(iteration == iter_i) + iter_indices = np.flatnonzero(iteration == itr) sorted_indices = np.argsort(mean_test_score[iter_indices]) return np.array(params[iter_indices][sorted_indices[-k:]]) @@ -269,58 +269,58 @@ def _run_search(self, evaluate_candidates): self.n_resources_ = [] self.n_candidates_ = [] - for iter_i in range(n_iterations): + for itr in range(n_iterations): - power = iter_i # default + power = itr # default if self.aggressive_elimination: - # this will set resource_iter to the initial value (i.e. the - # value of resource_iter at the first iteration) for as many + # this will set n_resources to the initial value (i.e. the + # value of n_resources at the first iteration) for as many # iterations as needed (while candidates are being # eliminated), and then go on as usual. power = max( 0, - iter_i - n_required_iterations + n_possible_iterations + itr - n_required_iterations + n_possible_iterations ) - resource_iter = int(self.ratio**power * self.min_resources_) + n_resources = int(self.ratio**power * self.min_resources_) # guard, probably not needed - resource_iter = min(resource_iter, self.max_resources_) - self.n_resources_.append(resource_iter) + n_resources = min(n_resources, self.max_resources_) + self.n_resources_.append(n_resources) n_candidates = len(candidate_params) self.n_candidates_.append(n_candidates) if self.verbose: print('-' * 10) - print(f'iter_i: {iter_i}') + print(f'iter: {itr}') print(f'n_candidates: {n_candidates}') - print(f'resource_iter: {resource_iter}') + print(f'n_resources: {n_resources}') if self.resource == 'n_samples': # subsampling will be done in cv.split() cv = _SubsampleMetaSplitter( base_cv=self._checked_cv_orig, - fraction=resource_iter / self._n_samples_orig, + fraction=n_resources / self._n_samples_orig, subsample_test=True, random_state=self.random_state ) else: - # Need copy so that the resource_iter of next iteration does + # Need copy so that the n_resources of next iteration does # not overwrite candidate_params = [c.copy() for c in candidate_params] for candidate in candidate_params: - candidate[self.resource] = resource_iter + candidate[self.resource] = n_resources cv = self._checked_cv_orig - more_results = {'iter': [iter_i] * n_candidates, - 'resource_iter': [resource_iter] * n_candidates} + more_results = {'iter': [itr] * n_candidates, + 'n_resources': [n_resources] * n_candidates} results = evaluate_candidates(candidate_params, cv, more_results=more_results) n_candidates_to_keep = ceil(n_candidates / self.ratio) - candidate_params = _top_k(results, n_candidates_to_keep, iter_i) + candidate_params = _top_k(results, n_candidates_to_keep, itr) self.n_remaining_candidates_ = len(candidate_params) self.n_required_iterations_ = n_required_iterations diff --git a/sklearn/model_selection/tests/test_successive_halving.py 
b/sklearn/model_selection/tests/test_successive_halving.py index 9d377b6b9ba87..3dafe5dbb1b58 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -193,7 +193,7 @@ def test_resource_parameter(Est): max_resources=10, ratio=3) sh.fit(X, y) assert set(sh.n_resources_) == set([1, 3, 9]) - for r_i, params, param_c in zip(sh.cv_results_['resource_iter'], + for r_i, params, param_c in zip(sh.cv_results_['n_resources'], sh.cv_results_['params'], sh.cv_results_['param_c']): assert r_i == params['c'] == param_c @@ -379,7 +379,7 @@ def test_subsample_splitter_determinism(subsample_test): assert np.all(X[test_a] == X[test_b]) -@pytest.mark.parametrize('k, iter_i, expected', [ +@pytest.mark.parametrize('k, itr, expected', [ (1, 0, ['c']), (2, 0, ['a', 'c']), (4, 0, ['d', 'b', 'a', 'c']), @@ -392,14 +392,14 @@ def test_subsample_splitter_determinism(subsample_test): (1, 2, ['i']), (10, 2, ['g', 'h', 'i']), ]) -def test_top_k(k, iter_i, expected): +def test_top_k(k, itr, expected): results = { # this isn't a 'real world' result dict 'iter': [0, 0, 0, 0, 1, 1, 2, 2, 2], 'mean_test_score': [4, 3, 5, 1, 11, 10, 5, 6, 9], 'params': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'], } - got = _top_k(results, k=k, iter_i=iter_i) + got = _top_k(results, k=k, itr=itr) assert np.all(got == expected) @@ -561,4 +561,4 @@ def set_params(self, **params): assert (sh.n_candidates_ == counts).all() assert (cv_results_df['params'] == passed_params).all() - assert (cv_results_df['resource_iter'] == passed_n_samples).all() + assert (cv_results_df['n_resources'] == passed_n_samples).all() From e5bb4bb01012d0d613250767e93039c74a856132 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 7 Sep 2020 19:38:44 -0400 Subject: [PATCH 86/89] forgot some --- doc/modules/grid_search.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 42c416745626c..2be82e8a6e09a 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -301,10 +301,11 @@ where ``min_resources == n_resources_0`` is the amount of resources used at the first iteration. ``ratio`` also defines the proportions of candidates that will be selected for the next iteration:: - n_candidates_i = n_candidates_0 // (ratio ** i) + n_candidates_i = n_candidates // (ratio ** i) or equivalently:: + n_candidates_0 = n_candidates n_candidates_{i+1} = n_candidates_i // ratio So in the first iteration, we use ``min_resources`` resources @@ -320,7 +321,7 @@ Here is an example with ``min_resources=3`` and ``ratio=2``, starting with 70 candidates: +-----------------------+-----------------------+ -| ``n_resources`` | ``n_candidates_i`` | +| ``n_resources_i`` | ``n_candidates_i`` | +=======================+=======================+ | 3 (=min_resources) | 70 (=n_candidates) | +-----------------------+-----------------------+ @@ -345,7 +346,7 @@ We can note that: run at most ``ratio`` candidates. If the last iteration evaluates more than `ratio` candidates, then this last iteration reduces to a regular search (as in :class:`RandomizedSearchCV` or :class:`GridSearchCV`). -- each ``n_resources`` is a multiple of both ``ratio`` and +- each ``n_resources_i`` is a multiple of both ``ratio`` and ``min_resources`` (which is confirmed by its definition above). 
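The schedule implied by the formulas above can be sketched in a few lines of plain Python. This is only an illustration of the arithmetic: the actual implementation also honours ``max_resources`` and ``aggressive_elimination``, and keeps ``ceil(n_candidates_i / ratio)`` candidates when selecting the top ones::

    def halving_schedule(n_candidates, min_resources, ratio=2):
        # Yield (n_candidates_i, n_resources_i) until at most `ratio`
        # candidates remain, following the formulas above.
        n_cand, n_res = n_candidates, min_resources
        while True:
            yield n_cand, n_res
            if n_cand <= ratio:
                break
            n_cand //= ratio   # n_candidates_{i+1} = n_candidates_i // ratio
            n_res *= ratio     # n_resources_{i+1} = n_resources_i * ratio

    # Reproduces the 70-candidate example: (70, 3), (35, 6), ..., (2, 96)
    for n_cand, n_res in halving_schedule(n_candidates=70, min_resources=3):
        print(f'n_candidates={n_cand}, n_resources={n_res}')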
The amount of resources that is used at each iteration can be found in the @@ -503,7 +504,7 @@ additional information related to the successive halving process. Here is an example with some of the columns of a (truncated) dataframe: ==== ====== =============== ================= ======================================================================================= - .. iter n_resources mean_test_score params + .. iter n_resources mean_test_score params ==== ====== =============== ================= ======================================================================================= 0 0 125 0.983667 {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 5} 1 0 125 0.983667 {'criterion': 'gini', 'max_depth': None, 'max_features': 8, 'min_samples_split': 7} From 9d2a6281aa04ec94e7aadc73ae0740861268e95e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 8 Sep 2020 09:47:40 -0400 Subject: [PATCH 87/89] some sad note about splitter statefulness :'( --- sklearn/model_selection/_search.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 3ba3c479b715f..43e5c78e07245 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -665,6 +665,16 @@ def _run_search(self, evaluate_candidates): It returns a dict of all results so far, formatted like ``cv_results_``. + Important note (relevant whether the default cv is used or not): + in randomized splitters, and unless the random_state parameter of + cv was set to an int, calling cv.split() multiple times will + yield different splits. Since cv.split() is called in + evaluate_candidates, this means that candidates will be evaluated + on different splits each time evaluate_candidates is called. This + might be a methodological issue depending on the search strategy + that you're implementing. 
To prevent randomized splitters from + being used, you may use _split._yields_constant_splits() + Examples -------- From 820ceb59d20612f6bfb994ccbd0ad5f83716cc91 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 8 Sep 2020 18:58:33 -0400 Subject: [PATCH 88/89] Addressed comments --- .../model_selection/plot_successive_halving_iterations.py | 2 +- sklearn/model_selection/_search.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 9f6023dbe31d1..b5e746536ae0e 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -1,5 +1,5 @@ """ -Successive halving Iterations +Successive Halving Iterations ============================= This example illustrates how a successive halving search ( diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 43e5c78e07245..cade49345d539 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -680,14 +680,12 @@ def _run_search(self, evaluate_candidates): :: - def _run_search(self, evaluate_candidates, X, y, **fit_params): + def _run_search(self, evaluate_candidates): 'Try C=0.1 only if C=1 is better than C=10' - all_results = evaluate_candidates([{'C': 1}, {'C': 10}], X, y, - **fit_params) + all_results = evaluate_candidates([{'C': 1}, {'C': 10}]) score = all_results['mean_test_score'] if score[0] < score[1]: - evaluate_candidates([{'C': 0.1}], X, y, - **fit_params) + evaluate_candidates([{'C': 0.1}]) """ raise NotImplementedError("_run_search not implemented.") From 645b50d32664218d9763bc5cb77d009291863a1f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 8 Sep 2020 21:34:59 -0400 Subject: [PATCH 89/89] ratio -> factor --- doc/modules/grid_search.rst | 62 +++++++++---------- .../plot_successive_halving_heatmap.py | 2 +- .../plot_successive_halving_iterations.py | 2 +- .../_search_successive_halving.py | 56 ++++++++--------- .../tests/test_successive_halving.py | 24 +++---- 5 files changed, 73 insertions(+), 73 deletions(-) diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 2be82e8a6e09a..c88a6eb986b5a 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -199,13 +199,13 @@ here the number of samples. We here briefly describe the main parameters, but each parameter and their interactions are described in more details in the sections below. The -``ratio`` (> 1) parameter controls the rate at which the resources grow, and +``factor`` (> 1) parameter controls the rate at which the resources grow, and the rate at which the number of candidates decreases. In each iteration, the -number of resources per candidate is multiplied by ``ratio`` and the number -of candidates is divided by the same ratio. Along with ``resource`` and -``min_resources``, ``ratio`` is the most important parameter to control the +number of resources per candidate is multiplied by ``factor`` and the number +of candidates is divided by the same factor. Along with ``resource`` and +``min_resources``, ``factor`` is the most important parameter to control the search in our implementation, though a value of 3 usually works well. 
-``ratio`` effectively controls the number of iterations in +``factor`` effectively controls the number of iterations in :class:`HalvingGridSearchCV` and the number of candidates (by default) and iterations in :class:`HalvingRandomSearchCV`. ``aggressive_elimination=True`` can also be used if the number of available resources is small. More control @@ -229,7 +229,7 @@ need to explicitly import ``enable_successive_halving``:: Choosing ``min_resources`` and the number of candidates ------------------------------------------------------- -Beside ``ratio``, the two main parameters that influence the behaviour of a +Beside ``factor``, the two main parameters that influence the behaviour of a successive halving search are the ``min_resources`` parameter, and the number of candidates (or parameter combinations) that are evaluated. ``min_resources`` is the amount of resources allocated at the first @@ -238,7 +238,7 @@ in :class:`HalvingRandomSearchCV`, and is determined from the ``param_grid`` parameter of :class:`HalvingGridSearchCV`. Consider a case where the resource is the number of samples, and where we -have 1000 samples. In theory, with ``min_resources=10`` and ``ratio=2``, we +have 1000 samples. In theory, with ``min_resources=10`` and ``factor=2``, we are able to run **at most** 7 iterations with the following number of samples: ``[10, 20, 40, 80, 160, 320, 640]``. @@ -275,7 +275,7 @@ amount of resources available: 1000 samples are available, yet only 640 are used, at most. By default, both :class:`HalvingRandomSearchCV` and :class:`HalvingGridSearchCV` try to use as many resources as possible in the last iteration, with the constraint that this amount of resources must be a -multiple of both `min_resources` and `ratio` (this constraint will be clear +multiple of both `min_resources` and `factor` (this constraint will be clear in the next section). :class:`HalvingRandomSearchCV` achieves this by sampling the right amount of candidates, while :class:`HalvingGridSearchCV` achieves this by properly setting `min_resources`. Please see @@ -288,36 +288,36 @@ Amount of resource and number of candidates at each iteration At any iteration `i`, each candidate is allocated a given amount of resources which we denote `n_resources_i`. This quantity is controlled by the -parameters ``ratio`` and ``min_resources`` as follows (`ratio` is strictly +parameters ``factor`` and ``min_resources`` as follows (`factor` is strictly greater than 1):: - n_resources_i = ratio**i * min_resources, + n_resources_i = factor**i * min_resources, or equivalently:: - n_resources_{i+1} = n_resources_i * ratio + n_resources_{i+1} = n_resources_i * factor where ``min_resources == n_resources_0`` is the amount of resources used at -the first iteration. ``ratio`` also defines the proportions of candidates +the first iteration. ``factor`` also defines the proportions of candidates that will be selected for the next iteration:: - n_candidates_i = n_candidates // (ratio ** i) + n_candidates_i = n_candidates // (factor ** i) or equivalently:: n_candidates_0 = n_candidates - n_candidates_{i+1} = n_candidates_i // ratio + n_candidates_{i+1} = n_candidates_i // factor So in the first iteration, we use ``min_resources`` resources ``n_candidates`` times. In the second iteration, we use ``min_resources * -ratio`` resources ``n_candidates // ratio`` times. The third again +factor`` resources ``n_candidates // factor`` times. The third again multiplies the resources per candidate and divides the number of candidates. 
This process stops when the maximum amount of resource per candidate is reached, or when we have identified the best candidate. The best candidate -is identified at the iteration that is evaluating `ratio` or less candidates +is identified at the iteration that is evaluating `factor` or less candidates (see just below for an explanation). -Here is an example with ``min_resources=3`` and ``ratio=2``, starting with +Here is an example with ``min_resources=3`` and ``factor=2``, starting with 70 candidates: +-----------------------+-----------------------+ @@ -338,15 +338,15 @@ Here is an example with ``min_resources=3`` and ``ratio=2``, starting with We can note that: -- the process stops at the first iteration which evaluates `ratio=2` +- the process stops at the first iteration which evaluates `factor=2` candidates: the best candidate is the best out of these 2 candidates. It is not necessary to run an additional iteration, since it would only evaluate one candidate (namely the best one, which we have already identified). For this reason, in general, we want the last iteration to - run at most ``ratio`` candidates. If the last iteration evaluates more - than `ratio` candidates, then this last iteration reduces to a regular + run at most ``factor`` candidates. If the last iteration evaluates more + than `factor` candidates, then this last iteration reduces to a regular search (as in :class:`RandomizedSearchCV` or :class:`GridSearchCV`). -- each ``n_resources_i`` is a multiple of both ``ratio`` and +- each ``n_resources_i`` is a multiple of both ``factor`` and ``min_resources`` (which is confirmed by its definition above). The amount of resources that is used at each iteration can be found in the @@ -372,7 +372,7 @@ terms of the number of estimators of a random forest:: >>> base_estimator = RandomForestClassifier(random_state=0) >>> X, y = make_classification(n_samples=1000, random_state=0) >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, resource='n_estimators', + ... factor=2, resource='n_estimators', ... max_resources=30).fit(X, y) >>> sh.best_estimator_ RandomForestClassifier(max_depth=5, n_estimators=24, random_state=0) @@ -400,7 +400,7 @@ resources, some of them might be wasted (i.e. not used):: >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, min_resources=20).fit(X, y) + ... factor=2, min_resources=20).fit(X, y) >>> sh.n_resources_ [20, 40, 80] @@ -414,13 +414,13 @@ such that the last iteration can use as many resources as possible, within the `max_resources` limit:: >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, min_resources='exhaust').fit(X, y) + ... factor=2, min_resources='exhaust').fit(X, y) >>> sh.n_resources_ [250, 500, 1000] `min_resources` was here automatically set to 250, which results in the last iteration using all the resources. The exact value that is used depends on -the number of candidate parameter, on `max_resources` and on `ratio`. +the number of candidate parameter, on `max_resources` and on `factor`. For :class:`HalvingRandomSearchCV`, exhausting the resources can be done in 2 ways: @@ -441,11 +441,11 @@ candidate parameter, and is slightly more time-intensive. 
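As a hedged sketch of the two options just described for :class:`HalvingRandomSearchCV` (the dataset, the distributions and the value ``n_candidates=16`` are only illustrative, and the resulting ``n_resources_`` values depend on them)::

    from scipy.stats import randint

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.experimental import enable_successive_halving  # noqa
    from sklearn.model_selection import HalvingRandomSearchCV

    X, y = make_classification(n_samples=1000, random_state=0)
    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    param_dist = {'max_depth': [3, None],
                  'min_samples_split': randint(2, 11)}

    # Default: sample just enough candidates so that the last iteration
    # uses as many samples as possible.
    rsh = HalvingRandomSearchCV(clf, param_dist, factor=2,
                                n_candidates='exhaust',
                                random_state=0).fit(X, y)
    print(rsh.n_resources_)

    # Alternative: fix the number of candidates and let `min_resources` be
    # chosen so that the last iteration exhausts the samples.
    rsh = HalvingRandomSearchCV(clf, param_dist, factor=2,
                                n_candidates=16, min_resources='exhaust',
                                random_state=0).fit(X, y)
    print(rsh.n_resources_)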
Aggressive elimination of candidates ------------------------------------ -Ideally, we want the last iteration to evaluate ``ratio`` candidates (see +Ideally, we want the last iteration to evaluate ``factor`` candidates (see :ref:`amount_of_resource_and_number_of_candidates`). We then just have to pick the best one. When the number of available resources is small with respect to the number of candidates, the last iteration may have to evaluate -more than ``ratio`` candidates:: +more than ``factor`` candidates:: >>> from sklearn.datasets import make_classification >>> from sklearn.svm import SVC @@ -459,7 +459,7 @@ more than ``ratio`` candidates:: >>> base_estimator = SVC(gamma='scale') >>> X, y = make_classification(n_samples=1000) >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, max_resources=40, + ... factor=2, max_resources=40, ... aggressive_elimination=False).fit(X, y) >>> sh.n_resources_ [20, 40] @@ -467,16 +467,16 @@ more than ``ratio`` candidates:: [6, 3] Since we cannot use more than ``max_resources=40`` resources, the process -has to stop at the second iteration which evaluates more than ``ratio=2`` +has to stop at the second iteration which evaluates more than ``factor=2`` candidates. Using the ``aggressive_elimination`` parameter, you can force the search -process to end up with less than ``ratio`` candidates at the last +process to end up with less than ``factor`` candidates at the last iteration. To do this, the process will eliminate as many candidates as necessary using ``min_resources`` resources:: >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... ratio=2, + ... factor=2, ... max_resources=40, ... aggressive_elimination=True, ... ).fit(X, y) diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index b15ca9caff692..6964fafd77811 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -38,7 +38,7 @@ clf = SVC(random_state=rng) tic = time() -gsh = HalvingGridSearchCV(estimator=clf, param_grid=param_grid, ratio=2, +gsh = HalvingGridSearchCV(estimator=clf, param_grid=param_grid, factor=2, random_state=rng) gsh.fit(X, y) gsh_time = time() - tic diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index b5e746536ae0e..17723710be7d6 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -40,7 +40,7 @@ rsh = HalvingRandomSearchCV( estimator=clf, param_distributions=param_dist, - ratio=2, + factor=2, random_state=rng) rsh.fit(X, y) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 2afcae4756ee3..064948a30f006 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -71,7 +71,7 @@ def __init__(self, estimator, *, scoring=None, n_jobs=None, refit=True, cv=5, verbose=0, random_state=None, error_score=np.nan, return_train_score=True, max_resources='auto', min_resources='exhaust', - resource='n_samples', ratio=3, aggressive_elimination=False): + resource='n_samples', factor=3, aggressive_elimination=False): refit = _refit_callable if refit else False super().__init__(estimator, scoring=scoring, @@ -83,7 +83,7 @@ def __init__(self, estimator, *, 
scoring=None, self.random_state = random_state self.max_resources = max_resources self.resource = resource - self.ratio = ratio + self.factor = factor self.min_resources = min_resources self.aggressive_elimination = aggressive_elimination @@ -230,9 +230,9 @@ def _run_search(self, evaluate_candidates): ) # n_required_iterations is the number of iterations needed so that the - # last iterations evaluates less than `ratio` candidates. + # last iterations evaluates less than `factor` candidates. n_required_iterations = 1 + floor(log(len(candidate_params), - self.ratio)) + self.factor)) if self.min_resources == 'exhaust': # To exhaust the resources, we want to start with the biggest @@ -241,7 +241,7 @@ def _run_search(self, evaluate_candidates): last_iteration = n_required_iterations - 1 self.min_resources_ = max( self.min_resources_, - self.max_resources_ // self.ratio**last_iteration + self.max_resources_ // self.factor**last_iteration ) # n_possible_iterations is the number of iterations that we can @@ -250,7 +250,7 @@ def _run_search(self, evaluate_candidates): # candidates, this may be higher or smaller than # n_required_iterations. n_possible_iterations = 1 + floor(log( - self.max_resources_ // self.min_resources_, self.ratio)) + self.max_resources_ // self.min_resources_, self.factor)) if self.aggressive_elimination: n_iterations = n_required_iterations @@ -264,7 +264,7 @@ def _run_search(self, evaluate_candidates): print(f'min_resources_: {self.min_resources_}') print(f'max_resources_: {self.max_resources_}') print(f'aggressive_elimination: {self.aggressive_elimination}') - print(f'ratio: {self.ratio}') + print(f'factor: {self.factor}') self.n_resources_ = [] self.n_candidates_ = [] @@ -282,7 +282,7 @@ def _run_search(self, evaluate_candidates): itr - n_required_iterations + n_possible_iterations ) - n_resources = int(self.ratio**power * self.min_resources_) + n_resources = int(self.factor**power * self.min_resources_) # guard, probably not needed n_resources = min(n_resources, self.max_resources_) self.n_resources_.append(n_resources) @@ -319,7 +319,7 @@ def _run_search(self, evaluate_candidates): results = evaluate_candidates(candidate_params, cv, more_results=more_results) - n_candidates_to_keep = ceil(n_candidates / self.ratio) + n_candidates_to_keep = ceil(n_candidates / self.factor) candidate_params = _top_k(results, n_candidates_to_keep, itr) self.n_remaining_candidates_ = len(candidate_params) @@ -366,10 +366,10 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): in the list are explored. This enables searching over any sequence of parameter settings. - ratio : int or float, default=3 + factor : int or float, default=3 The 'halving' parameter, which determines the proportion of candidates that are selected for each subsequent iteration. For example, - ``ratio=3`` means that only one third of the candidates are selected. + ``factor=3`` means that only one third of the candidates are selected. resource : ``'n_samples'`` or str, default='n_samples' Defines the resource that increases with each iteration. By default, @@ -399,7 +399,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): - 'exhaust' will set `r0` such that the **last** iteration uses as much resources as possible. Namely, the last iteration will use the highest value smaller than ``max_resources`` that is a multiple of - both ``min_resources`` and ``ratio``. In general, using 'exhaust' + both ``min_resources`` and ``factor``. 
In general, using 'exhaust' leads to a more accurate estimator, but is slightly more time consuming. @@ -408,11 +408,11 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough resources to - reduce the remaining candidates to at most `ratio` after the last + reduce the remaining candidates to at most `factor` after the last iteration. If ``True``, then the search process will 'replay' the first iteration for as long as needed until the number of candidates is small enough. This is ``False`` by default, which means that the - last iteration may evaluate more than ``ratio`` candidates. See + last iteration may evaluate more than ``factor`` candidates. See :ref:`aggressive_elimination` for more details. cv : int, cross-validation generator or iterable, default=5 @@ -491,7 +491,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): n_remaining_candidates_ : int The number of candidate parameters that are left after the last - iteration. It corresponds to `ceil(n_candidates[-1] / ratio)` + iteration. It corresponds to `ceil(n_candidates[-1] / factor)` max_resources_ : int The maximum number of resources that any candidate is allowed to use @@ -517,7 +517,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): n_required_iterations_ : int The number of iterations that are required to end up with less than - ``ratio`` candidates at the last iteration, starting with + ``factor`` candidates at the last iteration, starting with ``min_resources_`` resources. This will be smaller than ``n_possible_iterations_`` when there isn't enough resources. @@ -591,7 +591,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): _required_parameters = ["estimator", "param_grid"] def __init__(self, estimator, param_grid, *, - ratio=3, resource='n_samples', max_resources='auto', + factor=3, resource='n_samples', max_resources='auto', min_resources='exhaust', aggressive_elimination=False, cv=5, scoring=None, refit=True, error_score=np.nan, return_train_score=True, random_state=None, n_jobs=None, @@ -601,7 +601,7 @@ def __init__(self, estimator, param_grid, *, random_state=random_state, error_score=error_score, return_train_score=return_train_score, max_resources=max_resources, resource=resource, - ratio=ratio, min_resources=min_resources, + factor=factor, min_resources=min_resources, aggressive_elimination=aggressive_elimination) self.param_grid = param_grid _check_param_grid(self.param_grid) @@ -650,13 +650,13 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): The number of candidate parameters to sample, at the first iteration. Using 'exhaust' will sample enough candidates so that the last iteration uses as many resources as possible, based on - `min_resources`, `max_resources` and `ratio`. In this case, + `min_resources`, `max_resources` and `factor`. In this case, `min_resources` cannot be 'exhaust'. - ratio : int or float, default=3 + factor : int or float, default=3 The 'halving' parameter, which determines the proportion of candidates that are selected for each subsequent iteration. For example, - ``ratio=3`` means that only one third of the candidates are selected. + ``factor=3`` means that only one third of the candidates are selected. resource : ``'n_samples'`` or str, default='n_samples' Defines the resource that increases with each iteration. 
By default, @@ -686,7 +686,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): - 'exhaust' will set `r0` such that the **last** iteration uses as much resources as possible. Namely, the last iteration will use the highest value smaller than ``max_resources`` that is a multiple of - both ``min_resources`` and ``ratio``. In general, using 'exhaust' + both ``min_resources`` and ``factor``. In general, using 'exhaust' leads to a more accurate estimator, but is slightly more time consuming. 'exhaust' isn't available when `n_candidates='exhaust'`. @@ -695,11 +695,11 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): aggressive_elimination : bool, default=False This is only relevant in cases where there isn't enough resources to - reduce the remaining candidates to at most `ratio` after the last + reduce the remaining candidates to at most `factor` after the last iteration. If ``True``, then the search process will 'replay' the first iteration for as long as needed until the number of candidates is small enough. This is ``False`` by default, which means that the - last iteration may evaluate more than ``ratio`` candidates. See + last iteration may evaluate more than ``factor`` candidates. See :ref:`aggressive_elimination` for more details. cv : int, cross-validation generator or an iterable, default=5 @@ -780,7 +780,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): n_remaining_candidates_ : int The number of candidate parameters that are left after the last - iteration. It corresponds to `ceil(n_candidates[-1] / ratio)` + iteration. It corresponds to `ceil(n_candidates[-1] / factor)` max_resources_ : int The maximum number of resources that any candidate is allowed to use @@ -806,7 +806,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): n_required_iterations_ : int The number of iterations that are required to end up with less than - ``ratio`` candidates at the last iteration, starting with + ``factor`` candidates at the last iteration, starting with ``min_resources_`` resources. This will be smaller than ``n_possible_iterations_`` when there isn't enough resources. 
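A short hedged sketch of reading these bookkeeping attributes back after fitting; the data and distributions are illustrative and the printed values depend on them::

    from scipy.stats import randint

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.experimental import enable_successive_halving  # noqa
    from sklearn.model_selection import HalvingRandomSearchCV

    X, y = make_classification(n_samples=1000, random_state=0)
    param_dist = {'max_depth': [3, None],
                  'min_samples_split': randint(2, 11)}
    rsh = HalvingRandomSearchCV(RandomForestClassifier(n_estimators=10,
                                                       random_state=0),
                                param_dist, factor=3,
                                random_state=0).fit(X, y)

    print(rsh.n_iterations_)            # iterations actually run
    print(rsh.n_possible_iterations_)   # allowed by min/max resources
    print(rsh.n_required_iterations_)   # needed to end with <= factor candidates
    print(rsh.n_candidates_)            # candidates evaluated at each iteration
    print(rsh.n_resources_)             # resources allocated at each iteration
    print(rsh.n_remaining_candidates_)  # ceil(n_candidates_[-1] / factor)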
@@ -883,7 +883,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): _required_parameters = ["estimator", "param_distributions"] def __init__(self, estimator, param_distributions, *, - n_candidates='exhaust', ratio=3, resource='n_samples', + n_candidates='exhaust', factor=3, resource='n_samples', max_resources='auto', min_resources='smallest', aggressive_elimination=False, cv=5, scoring=None, refit=True, error_score=np.nan, return_train_score=True, @@ -893,7 +893,7 @@ def __init__(self, estimator, param_distributions, *, random_state=random_state, error_score=error_score, return_train_score=return_train_score, max_resources=max_resources, resource=resource, - ratio=ratio, min_resources=min_resources, + factor=factor, min_resources=min_resources, aggressive_elimination=aggressive_elimination) self.param_distributions = param_distributions self.n_candidates = n_candidates diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 3dafe5dbb1b58..eeb941bd25a06 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -44,10 +44,10 @@ def get_params(self, deep=False): 'expected_n_resources,'), [ # notice how it loops at the beginning # also, the number of candidates evaluated at the last iteration is - # <= ratio + # <= factor (True, 'limited', 4, 4, 3, 1, [60, 20, 7, 3], [20, 20, 60, 180]), # no aggressive elimination: we end up with less iterations, and - # the number of candidates at the last iter is > ratio, which isn't + # the number of candidates at the last iter is > factor, which isn't # ideal (False, 'limited', 3, 4, 3, 3, [60, 20, 7], [20, 60, 180]), # # When the amount of resource isn't limited, aggressive_elimination @@ -76,7 +76,7 @@ def test_aggressive_elimination( sh = Est(base_estimator, param_grid, aggressive_elimination=aggressive_elimination, - max_resources=max_resources, ratio=3) + max_resources=max_resources, factor=3) sh.set_params(verbose=True) # just for test coverage if Est is HalvingRandomSearchCV: @@ -91,7 +91,7 @@ def test_aggressive_elimination( assert sh.n_resources_ == expected_n_resources assert sh.n_candidates_ == expected_n_candidates assert sh.n_remaining_candidates_ == expected_n_remaining_candidates - assert ceil(sh.n_candidates_[-1] / sh.ratio) == sh.n_remaining_candidates_ + assert ceil(sh.n_candidates_[-1] / sh.factor) == sh.n_remaining_candidates_ @pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) @@ -130,14 +130,14 @@ def test_min_max_resources( param_grid = {'a': [1, 2], 'b': [1, 2, 3]} base_estimator = FastClassifier() - sh = Est(base_estimator, param_grid, ratio=3, min_resources=min_resources, + sh = Est(base_estimator, param_grid, factor=3, min_resources=min_resources, max_resources=max_resources) if Est is HalvingRandomSearchCV: sh.set_params(n_candidates=6) # same number as with the grid sh.fit(X, y) - expected_n_required_iterations = 2 # given 6 combinations and ratio = 3 + expected_n_required_iterations = 2 # given 6 combinations and factor = 3 assert sh.n_iterations_ == expected_n_iterations assert sh.n_required_iterations_ == expected_n_required_iterations assert sh.n_possible_iterations_ == expected_n_possible_iterations @@ -169,9 +169,9 @@ def test_n_iterations(Est, max_resources, n_iterations, n_possible_iterations): X, y = make_classification(n_samples=n_samples, random_state=1) param_grid = {'a': [1, 2], 'b': list(range(10))} base_estimator = 
FastClassifier() - ratio = 2 + factor = 2 - sh = Est(base_estimator, param_grid, cv=2, ratio=ratio, + sh = Est(base_estimator, param_grid, cv=2, factor=factor, max_resources=max_resources, min_resources=4) if Est is HalvingRandomSearchCV: sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV @@ -190,7 +190,7 @@ def test_resource_parameter(Est): param_grid = {'a': [1, 2], 'b': list(range(10))} base_estimator = FastClassifier() sh = Est(base_estimator, param_grid, cv=2, resource='c', - max_resources=10, ratio=3) + max_resources=10, factor=3) sh.fit(X, y) assert set(sh.n_resources_) == set([1, 3, 9]) for r_i, params, param_c in zip(sh.cv_results_['n_resources'], @@ -233,7 +233,7 @@ def test_random_search(max_resources, n_candidates, expected_n_candidates): base_estimator = FastClassifier() sh = HalvingRandomSearchCV(base_estimator, param_grid, n_candidates=n_candidates, cv=2, - max_resources=max_resources, ratio=2, + max_resources=max_resources, factor=2, min_resources=4) sh.fit(X, y) assert sh.n_candidates_[0] == expected_n_candidates @@ -432,7 +432,7 @@ def test_cv_results(Est): def scorer(est, X, y): return rng.rand() - sh = Est(base_estimator, param_grid, ratio=2, scoring=scorer) + sh = Est(base_estimator, param_grid, factor=2, scoring=scorer) if Est is HalvingRandomSearchCV: # same number of candidates as with the grid sh.set_params(n_candidates=2 * 30, min_resources='exhaust') @@ -533,7 +533,7 @@ def set_params(self, **params): param_grid = {'a': ('l1', 'l2'), 'b': list(range(30))} base_estimator = FastClassifierBookKeeping() - sh = Est(base_estimator, param_grid, ratio=2, cv=n_splits, + sh = Est(base_estimator, param_grid, factor=2, cv=n_splits, return_train_score=False, refit=False) if Est is HalvingRandomSearchCV: # same number of candidates as with the grid