From 62db7fa7cd13237456f8a68985a60d415a2469ca Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 24 Jul 2023 23:36:23 +0200 Subject: [PATCH 1/6] HalvingRandomSearchCV with list format input and test attempted --- .../_search_successive_halving.py | 8 +- sklearn/model_selection/tests/test_search.py | 18 ++++- .../tests/test_successive_halving.py | 78 ++++++++++++++++++- 3 files changed, 97 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 38fe1e0e7a15c..708092d09a2a5 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -750,11 +750,13 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed. - param_distributions : dict - Dictionary with parameters names (string) as keys and distributions + param_distributions : dict or list of dicts + Dictionary with parameters names (`str`) as keys and distributions or lists of parameters to try. Distributions must provide a ``rvs`` method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. + If a list of dicts is given, first a dict is sampled uniformly, and + then a parameter is sampled using that dict as above. 
n_candidates : "exhaust" or int, default="exhaust" The number of candidate parameters to sample, at the first @@ -1024,7 +1026,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): _parameter_constraints: dict = { **BaseSuccessiveHalving._parameter_constraints, - "param_distributions": [dict], + "param_distributions": [dict, list], "n_candidates": [ Interval(Integral, 0, None, closed="neither"), StrOptions({"exhaust"}), diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 6ea52049f3ced..23fc9d2bb4988 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -885,9 +885,21 @@ def test_param_sampler(): def check_cv_results_array_types(search, param_keys, score_keys): # Check if the search `cv_results`'s array are of correct types cv_results = search.cv_results_ - assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys) - assert all(cv_results[key].dtype == object for key in param_keys) - assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys) + assert all( + isinstance(cv_results[param], np.ma.MaskedArray) + for param in param_keys + if param not in {"iter", "n_resources"} + ) + assert all( + cv_results[key].dtype == object + for key in param_keys + if key not in {"iter", "n_resources"} + ) + assert not any( + isinstance(cv_results[key], np.ma.MaskedArray) + for key in score_keys + if key not in {"iter", "n_resources"} + ) assert all( cv_results[key].dtype == np.float64 for key in score_keys diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index d4cc09ee01044..e098fda81c4c8 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -23,7 +23,11 @@ _SubsampleMetaSplitter, _top_k, ) -from sklearn.svm import LinearSVC +from 
sklearn.model_selection.tests.test_search import ( + check_cv_results_array_types, + check_cv_results_keys, +) +from sklearn.svm import SVC, LinearSVC class FastClassifier(DummyClassifier): @@ -777,3 +781,75 @@ def test_select_best_index(SearchCV): # we expect the index of 'i' best_index = SearchCV._select_best_index(None, None, results) assert best_index == 8 + + +def test_halving_grid_search_cv_results(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_splits = 3 + n_proportion = 8 + + params = [ + {"kernel": ["rbf"], "C": [0.01, 1.0], "gamma": ["scale", "auto"]}, + {"kernel": ["poly"], "degree": [2, 3]}, + ] + param_keys = ( + "param_C", + "param_degree", + "param_gamma", + "param_kernel", + "iter", + "n_resources", + ) + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + # n_cand = n_search_iter + n_cand = n_proportion + + search = HalvingGridSearchCV( + SVC(), + cv=n_splits, + param_grid=params, + return_train_score=True, + ) + search.fit(X, y) + cv_results = search.cv_results_ + # Check results structure + check_cv_results_array_types(search, param_keys, score_keys) + check_cv_results_keys(cv_results, param_keys, score_keys, n_cand) + len(search.cv_results_["params"]) + + +""" assert all( + ( + not cv_results["param_C"].mask[i] + and not cv_results["param_gamma"].mask[i] + and not cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "poly" + ) + assert all( + ( + not cv_results["param_C"].mask[i] + and not cv_results["param_gamma"].mask[i] + and cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "rbf" + )""" From 
6c7e4cb13d1f58b9b0ad676a04c7c2ad01623d22 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Tue, 25 Jul 2023 00:59:03 +0200 Subject: [PATCH 2/6] swapped to HalvingRandomSearchCV, test outcommented, now fails --- sklearn/model_selection/tests/test_search.py | 6 +---- .../tests/test_successive_halving.py | 24 +++++++------------ 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 23fc9d2bb4988..54556a4761ae5 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -895,11 +895,7 @@ def check_cv_results_array_types(search, param_keys, score_keys): for key in param_keys if key not in {"iter", "n_resources"} ) - assert not any( - isinstance(cv_results[key], np.ma.MaskedArray) - for key in score_keys - if key not in {"iter", "n_resources"} - ) + assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys) assert all( cv_results[key].dtype == np.float64 for key in score_keys diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index e098fda81c4c8..b9b01e4dfe444 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -23,11 +23,7 @@ _SubsampleMetaSplitter, _top_k, ) -from sklearn.model_selection.tests.test_search import ( - check_cv_results_array_types, - check_cv_results_keys, -) -from sklearn.svm import SVC, LinearSVC +from sklearn.svm import LinearSVC class FastClassifier(DummyClassifier): @@ -783,14 +779,14 @@ def test_select_best_index(SearchCV): assert best_index == 8 -def test_halving_grid_search_cv_results(): +"""def test_halving_random_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) n_splits = 3 - n_proportion = 8 + n_proportion = 6 params = [ - {"kernel": ["rbf"], "C": [0.01, 
1.0], "gamma": ["scale", "auto"]}, + {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)}, {"kernel": ["poly"], "degree": [2, 3]}, ] param_keys = ( @@ -818,13 +814,12 @@ def test_halving_grid_search_cv_results(): "mean_score_time", "std_score_time", ) - # n_cand = n_search_iter n_cand = n_proportion - search = HalvingGridSearchCV( + search = HalvingRandomSearchCV( SVC(), cv=n_splits, - param_grid=params, + param_distributions=params, return_train_score=True, ) search.fit(X, y) @@ -834,14 +829,13 @@ def test_halving_grid_search_cv_results(): check_cv_results_keys(cv_results, param_keys, score_keys, n_cand) len(search.cv_results_["params"]) - -""" assert all( + assert all( ( not cv_results["param_C"].mask[i] and not cv_results["param_gamma"].mask[i] and not cv_results["param_degree"].mask[i] ) - for i in range(n_candidates) + for i in range(n_cand) if cv_results["param_kernel"][i] == "poly" ) assert all( @@ -850,6 +844,6 @@ def test_halving_grid_search_cv_results(): and not cv_results["param_gamma"].mask[i] and cv_results["param_degree"].mask[i] ) - for i in range(n_candidates) + for i in range(n_cand) if cv_results["param_kernel"][i] == "rbf" )""" From 6227903b9f440c14988f98e144599fb2746876d6 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Wed, 26 Jul 2023 20:50:05 +0200 Subject: [PATCH 3/6] changes after review --- doc/whats_new/v1.3.rst | 4 ++ sklearn/model_selection/tests/test_search.py | 19 +++------- .../tests/test_successive_halving.py | 37 +++++++++---------- 3 files changed, 26 insertions(+), 34 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 8d39ca2fed143..6ba517f4e24a9 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -23,6 +23,10 @@ Changelog :attr:`sklearn.neighbors.KDTree.valid_metrics` as public class attributes. :pr:`26754` by :user:`Julien Jerphanion `. 
+- |Fix| :class:`sklearn.model_selection.HalvingRandomSearchCV` no longer raises + when the input to the `param_distributions` parameter is a list of dicts. + :pr:`26893` by :user:`Stefanie Senger <StefanieSenger>`. + .. _changes_1_3: Version 1.3.0 diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 54556a4761ae5..5bdfa0e118571 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -885,16 +885,8 @@ def test_param_sampler(): def check_cv_results_array_types(search, param_keys, score_keys): # Check if the search `cv_results`'s array are of correct types cv_results = search.cv_results_ - assert all( - isinstance(cv_results[param], np.ma.MaskedArray) - for param in param_keys - if param not in {"iter", "n_resources"} - ) - assert all( - cv_results[key].dtype == object - for key in param_keys - if key not in {"iter", "n_resources"} - ) + assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys) + assert all(cv_results[key].dtype == object for key in param_keys) assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys) assert all( cv_results[key].dtype == np.float64 @@ -908,11 +900,10 @@ def check_cv_results_array_types(search, param_keys, score_keys): assert cv_results["rank_test_%s" % key].dtype == np.int32 -def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand): +def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()): # Test the search.cv_results_ contains all the required results - assert_array_equal( - sorted(cv_results.keys()), sorted(param_keys + score_keys + ("params",)) - ) + all_keys = param_keys + score_keys + extra_keys + assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",))) assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys) diff --git a/sklearn/model_selection/tests/test_successive_halving.py 
b/sklearn/model_selection/tests/test_successive_halving.py index b9b01e4dfe444..cf9e467f1d716 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from scipy.stats import norm, randint +from scipy.stats import expon, norm, randint from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier @@ -23,7 +23,11 @@ _SubsampleMetaSplitter, _top_k, ) -from sklearn.svm import LinearSVC +from sklearn.model_selection.tests.test_search import ( + check_cv_results_array_types, + check_cv_results_keys, +) +from sklearn.svm import SVC, LinearSVC class FastClassifier(DummyClassifier): @@ -779,11 +783,8 @@ def test_select_best_index(SearchCV): assert best_index == 8 -"""def test_halving_random_search_cv_results(): - X, y = make_classification(n_samples=50, n_features=4, random_state=42) - - n_splits = 3 - n_proportion = 6 +def test_halving_random_search_cv_results(): + X, y = make_classification(n_samples=150, n_features=4, random_state=42) params = [ {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)}, @@ -794,8 +795,6 @@ def test_select_best_index(SearchCV): "param_degree", "param_gamma", "param_kernel", - "iter", - "n_resources", ) score_keys = ( "mean_test_score", @@ -814,28 +813,26 @@ def test_select_best_index(SearchCV): "mean_score_time", "std_score_time", ) - n_cand = n_proportion + extra_keys = ("n_resources", "iter") search = HalvingRandomSearchCV( - SVC(), - cv=n_splits, - param_distributions=params, - return_train_score=True, + SVC(), cv=3, param_distributions=params, return_train_score=True, random_state=0 ) search.fit(X, y) + n_candidates = sum(search.n_candidates_) cv_results = search.cv_results_ # Check results structure + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) check_cv_results_array_types(search, param_keys, score_keys) - 
check_cv_results_keys(cv_results, param_keys, score_keys, n_cand) len(search.cv_results_["params"]) assert all( ( - not cv_results["param_C"].mask[i] - and not cv_results["param_gamma"].mask[i] + cv_results["param_C"].mask[i] + and cv_results["param_gamma"].mask[i] and not cv_results["param_degree"].mask[i] ) - for i in range(n_cand) + for i in range(n_candidates) if cv_results["param_kernel"][i] == "poly" ) assert all( @@ -844,6 +841,6 @@ def test_select_best_index(SearchCV): and not cv_results["param_gamma"].mask[i] and cv_results["param_degree"].mask[i] ) - for i in range(n_cand) + for i in range(n_candidates) if cv_results["param_kernel"][i] == "rbf" - )""" + ) From 98f941287e07d3be5a472ed48b1e15aff76325ef Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 27 Jul 2023 14:19:54 +0200 Subject: [PATCH 4/6] bug fix and cleanup in test_grid_search_cv_results and test_random_search_cv_results --- sklearn/model_selection/tests/test_search.py | 32 +++++++++---------- .../tests/test_successive_halving.py | 1 - 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 5bdfa0e118571..9c828a3ea0075 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -910,7 +910,6 @@ def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys def test_grid_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) - n_splits = 3 n_grid_points = 6 params = [ dict( @@ -948,9 +947,7 @@ def test_grid_search_cv_results(): ) n_candidates = n_grid_points - search = GridSearchCV( - SVC(), cv=n_splits, param_grid=params, return_train_score=True - ) + search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True) search.fit(X, y) cv_results = search.cv_results_ # Check if score and timing are reasonable @@ -966,17 +963,20 @@ def test_grid_search_cv_results(): 
check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) # Check masking cv_results = search.cv_results_ - n_candidates = len(search.cv_results_["params"]) - assert all( + + poly_results = [ ( cv_results["param_C"].mask[i] and cv_results["param_gamma"].mask[i] and not cv_results["param_degree"].mask[i] ) for i in range(n_candidates) - if cv_results["param_kernel"][i] == "linear" - ) - assert all( + if cv_results["param_kernel"][i] == "poly" + ] + assert all(poly_results) + assert len(poly_results) + + rbf_results = [ ( not cv_results["param_C"].mask[i] and not cv_results["param_gamma"].mask[i] @@ -984,13 +984,14 @@ def test_grid_search_cv_results(): ) for i in range(n_candidates) if cv_results["param_kernel"][i] == "rbf" - ) + ] + assert all(rbf_results) + assert len(rbf_results) def test_random_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) - n_splits = 3 n_search_iter = 30 params = [ @@ -1015,12 +1016,12 @@ def test_random_search_cv_results(): "mean_score_time", "std_score_time", ) - n_cand = n_search_iter + n_candidates = n_search_iter search = RandomizedSearchCV( SVC(), n_iter=n_search_iter, - cv=n_splits, + cv=3, param_distributions=params, return_train_score=True, ) @@ -1028,8 +1029,7 @@ def test_random_search_cv_results(): cv_results = search.cv_results_ # Check results structure check_cv_results_array_types(search, param_keys, score_keys) - check_cv_results_keys(cv_results, param_keys, score_keys, n_cand) - n_candidates = len(search.cv_results_["params"]) + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) assert all( ( cv_results["param_C"].mask[i] @@ -1037,7 +1037,7 @@ def test_random_search_cv_results(): and not cv_results["param_degree"].mask[i] ) for i in range(n_candidates) - if cv_results["param_kernel"][i] == "linear" + if cv_results["param_kernel"][i] == "poly" ) assert all( ( diff --git a/sklearn/model_selection/tests/test_successive_halving.py 
b/sklearn/model_selection/tests/test_successive_halving.py index cf9e467f1d716..da5de99070246 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -824,7 +824,6 @@ def test_halving_random_search_cv_results(): # Check results structure check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) check_cv_results_array_types(search, param_keys, score_keys) - len(search.cv_results_["params"]) assert all( ( From 4d90784dad25dae56a924c0d0b1fdd8692d78894 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 28 Jul 2023 15:22:04 +0200 Subject: [PATCH 5/6] specific length test --- sklearn/model_selection/tests/test_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 9c828a3ea0075..04c3f1f156fab 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -974,7 +974,7 @@ def test_grid_search_cv_results(): if cv_results["param_kernel"][i] == "poly" ] assert all(poly_results) - assert len(poly_results) + assert len(poly_results) == 2 rbf_results = [ ( @@ -986,7 +986,7 @@ def test_grid_search_cv_results(): if cv_results["param_kernel"][i] == "rbf" ] assert all(rbf_results) - assert len(rbf_results) + assert len(rbf_results) == 4 def test_random_search_cv_results(): From 564ab3558eb45f1198f17f62653b05a682a9fcd2 Mon Sep 17 00:00:00 2001 From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> Date: Fri, 4 Aug 2023 13:30:26 +0200 Subject: [PATCH 6/6] Update sklearn/model_selection/tests/test_successive_halving.py Co-authored-by: Guillaume Lemaitre --- sklearn/model_selection/tests/test_successive_halving.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_successive_halving.py 
b/sklearn/model_selection/tests/test_successive_halving.py index da5de99070246..6c89f89afa684 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -783,7 +783,10 @@ def test_select_best_index(SearchCV): assert best_index == 8 -def test_halving_random_search_cv_results(): +def test_halving_random_search_list_of_dicts(): + """Check the behaviour of the `HalvingRandomSearchCV` with `param_distributions` + being a list of dictionaries. + """ X, y = make_classification(n_samples=150, n_features=4, random_state=42) params = [