FIX `param_distribution` param of `HalvingRandomSearchCV` accepts list of dicts by StefanieSenger · Pull Request #26893 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

FIX param_distribution param of HalvingRandomSearchCV accepts list of dicts #26893

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Aug 7, 2023
4 changes: 4 additions & 0 deletions doc/whats_new/v1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ Changelog
:attr:`sklearn.neighbors.KDTree.valid_metrics` as public class attributes.
:pr:`26754` by :user:`Julien Jerphanion <jjerphan>`.

- |Fix| :class:`sklearn.model_selection.HalvingRandomSearchCV` no longer raises
when the input to the `param_distributions` parameter is a list of dicts.
:pr:`26893` by :user:`Stefanie Senger <StefanieSenger>`.

:mod:`sklearn.preprocessing`
............................

Expand Down
8 changes: 5 additions & 3 deletions sklearn/model_selection/_search_successive_halving.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,11 +750,13 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving):
Either estimator needs to provide a ``score`` function,
or ``scoring`` must be passed.

param_distributions : dict
Dictionary with parameters names (string) as keys and distributions
param_distributions : dict or list of dicts
Dictionary with parameters names (`str`) as keys and distributions
or lists of parameters to try. Distributions must provide a ``rvs``
method for sampling (such as those from scipy.stats.distributions).
If a list is given, it is sampled uniformly.
If a list of dicts is given, first a dict is sampled uniformly, and
then a parameter is sampled using that dict as above.

n_candidates : "exhaust" or int, default="exhaust"
The number of candidate parameters to sample, at the first
Expand Down Expand Up @@ -1024,7 +1026,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving):

_parameter_constraints: dict = {
**BaseSuccessiveHalving._parameter_constraints,
"param_distributions": [dict],
"param_distributions": [dict, list],
"n_candidates": [
Interval(Integral, 0, None, closed="neither"),
StrOptions({"exhaust"}),
Expand Down
39 changes: 19 additions & 20 deletions sklearn/model_selection/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,18 +900,16 @@ def check_cv_results_array_types(search, param_keys, score_keys):
assert cv_results["rank_test_%s" % key].dtype == np.int32


def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand):
def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()):
# Test the search.cv_results_ contains all the required results
assert_array_equal(
sorted(cv_results.keys()), sorted(param_keys + score_keys + ("params",))
)
all_keys = param_keys + score_keys + extra_keys
assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",)))
assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys)


def test_grid_search_cv_results():
X, y = make_classification(n_samples=50, n_features=4, random_state=42)

n_splits = 3
n_grid_points = 6
params = [
dict(
Expand Down Expand Up @@ -949,9 +947,7 @@ def test_grid_search_cv_results():
)
n_candidates = n_grid_points

search = GridSearchCV(
SVC(), cv=n_splits, param_grid=params, return_train_score=True
)
search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True)
search.fit(X, y)
cv_results = search.cv_results_
# Check if score and timing are reasonable
Expand All @@ -967,31 +963,35 @@ def test_grid_search_cv_results():
check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
# Check masking
cv_results = search.cv_results_
n_candidates = len(search.cv_results_["params"])
assert all(

poly_results = [
(
cv_results["param_C"].mask[i]
and cv_results["param_gamma"].mask[i]
and not cv_results["param_degree"].mask[i]
)
for i in range(n_candidates)
if cv_results["param_kernel"][i] == "linear"
)
assert all(
if cv_results["param_kernel"][i] == "poly"
]
assert all(poly_results)
assert len(poly_results) == 2

rbf_results = [
(
not cv_results["param_C"].mask[i]
and not cv_results["param_gamma"].mask[i]
and cv_results["param_degree"].mask[i]
)
for i in range(n_candidates)
if cv_results["param_kernel"][i] == "rbf"
)
]
assert all(rbf_results)
assert len(rbf_results) == 4


def test_random_search_cv_results():
X, y = make_classification(n_samples=50, n_features=4, random_state=42)

n_splits = 3
n_search_iter = 30

params = [
Expand All @@ -1016,29 +1016,28 @@ def test_random_search_cv_results():
"mean_score_time",
"std_score_time",
)
n_cand = n_search_iter
n_candidates = n_search_iter

search = RandomizedSearchCV(
SVC(),
n_iter=n_search_iter,
cv=n_splits,
cv=3,
param_distributions=params,
return_train_score=True,
)
search.fit(X, y)
cv_results = search.cv_results_
# Check results structure
check_cv_results_array_types(search, param_keys, score_keys)
check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
n_candidates = len(search.cv_results_["params"])
check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
assert all(
(
cv_results["param_C"].mask[i]
and cv_results["param_gamma"].mask[i]
and not cv_results["param_degree"].mask[i]
)
for i in range(n_candidates)
if cv_results["param_kernel"][i] == "linear"
if cv_results["param_kernel"][i] == "poly"
)
assert all(
(
Expand Down
73 changes: 71 additions & 2 deletions sklearn/model_selection/tests/test_successive_halving.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import numpy as np
import pytest
from scipy.stats import norm, randint
from scipy.stats import expon, norm, randint

from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
Expand All @@ -23,7 +23,11 @@
_SubsampleMetaSplitter,
_top_k,
)
from sklearn.svm import LinearSVC
from sklearn.model_selection.tests.test_search import (
check_cv_results_array_types,
check_cv_results_keys,
)
from sklearn.svm import SVC, LinearSVC


class FastClassifier(DummyClassifier):
Expand Down Expand Up @@ -777,3 +781,68 @@ def test_select_best_index(SearchCV):
# we expect the index of 'i'
best_index = SearchCV._select_best_index(None, None, results)
assert best_index == 8


def test_halving_random_search_list_of_dicts():
    """Check the behaviour of `HalvingRandomSearchCV` when `param_distributions`
    is a list of dictionaries.
    """
    X, y = make_classification(n_samples=150, n_features=4, random_state=42)

    # Two sub-distributions: a continuous one (rbf kernel) and a discrete
    # one (poly kernel). A candidate is sampled from exactly one of them.
    distributions = [
        {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)},
        {"kernel": ["poly"], "degree": [2, 3]},
    ]
    param_keys = (
        "param_C",
        "param_degree",
        "param_gamma",
        "param_kernel",
    )
    score_keys = (
        "mean_test_score",
        "mean_train_score",
        "rank_test_score",
        "split0_test_score",
        "split1_test_score",
        "split2_test_score",
        "split0_train_score",
        "split1_train_score",
        "split2_train_score",
        "std_test_score",
        "std_train_score",
        "mean_fit_time",
        "std_fit_time",
        "mean_score_time",
        "std_score_time",
    )
    # Keys added by successive-halving on top of the regular search results.
    extra_keys = ("n_resources", "iter")

    halving_search = HalvingRandomSearchCV(
        SVC(),
        cv=3,
        param_distributions=distributions,
        return_train_score=True,
        random_state=0,
    )
    halving_search.fit(X, y)

    total_candidates = sum(halving_search.n_candidates_)
    results = halving_search.cv_results_

    # Check that cv_results_ has the expected structure and dtypes.
    check_cv_results_keys(results, param_keys, score_keys, total_candidates, extra_keys)
    check_cv_results_array_types(halving_search, param_keys, score_keys)

    # Parameters belonging to the dict a candidate was NOT sampled from
    # must be masked; parameters from its own dict must not be.
    for i in range(total_candidates):
        kernel = results["param_kernel"][i]
        if kernel == "poly":
            assert results["param_C"].mask[i]
            assert results["param_gamma"].mask[i]
            assert not results["param_degree"].mask[i]
        elif kernel == "rbf":
            assert not results["param_C"].mask[i]
            assert not results["param_gamma"].mask[i]
            assert results["param_degree"].mask[i]