10000 Fix a regression in GridSearchCV for parameter grids that have arrays… · scikit-learn/scikit-learn@ada571f · GitHub
[go: up one dir, main page]

Skip to content

Commit ada571f

Browse files
MarcoGorellilestevejeremiedbb
committed
Fix a regression in GridSearchCV for parameter grids that have arrays of different sizes as parameter values (#29314)
Co-authored-by: Loïc Estève <loic.esteve@ymail.com> Co-authored-by: Jérémie du Boisberranger <jeremie@probabl.ai>
1 parent fb26476 commit ada571f

File tree

3 files changed

+172
-41
lines changed

3 files changed

+172
-41
lines changed

doc/whats_new/v1.5.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ Changelog
6363
grids that have estimators as parameter values.
6464
:pr:`29179` by :user:`Marco Gorelli<MarcoGorelli>`.
6565

66+
- |Fix| Fix a regression in :class:`model_selection.GridSearchCV` for parameter
67+
grids that have arrays of different sizes as parameter values.
68+
:pr:`29314` by :user:`Marco Gorelli<MarcoGorelli>`.
69+
6670
:mod:`sklearn.tree`
6771
...................
6872

sklearn/model_selection/_search.py

Lines changed: 52 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,56 @@ def check(self):
383383
return check
384384

385385

386+
def _yield_masked_array_for_each_param(candidate_params):
387+
"""
388+
Yield a masked array for each candidate param.
389+
390+
`candidate_params` is a sequence of params which were used in
391+
a `GridSearchCV`. We use masked arrays for the results, as not
392+
all params are necessarily present in each element of
393+
`candidate_params`. For example, if using `GridSearchCV` with
394+
a `SVC` model, then one might search over params like:
395+
396+
- kernel=["rbf"], gamma=[0.1, 1]
397+
- kernel=["poly"], degree=[1, 2]
398+
399+
and then param `'gamma'` would not be present in entries of
400+
`candidate_params` corresponding to `kernel='poly'`.
401+
"""
402+
n_candidates = len(candidate_params)
403+
param_results = defaultdict(dict)
404+
405+
for cand_idx, params in enumerate(candidate_params):
406+
for name, value in params.items():
407+
param_results["param_%s" % name][cand_idx] = value
408+
409+
for key, param_result in param_results.items():
410+
param_list = list(param_result.values())
411+
try:
412+
arr = np.array(param_list)
413+
except ValueError:
414+
# This can happen when param_list contains lists of different
415+
# lengths, for example:
416+
# param_list=[[1], [2, 3]]
417+
arr_dtype = np.dtype(object)
418+
else:
419+
# There are two cases when we don't use the automatically inferred
420+
# dtype when creating the array and we use object instead:
421+
# - string dtype
422+
# - when array.ndim > 1, that means that param_list was something
423+
# like a list of same-size sequences, which gets turned into a
424+
# multi-dimensional array but we want a 1d array
425+
arr_dtype = arr.dtype if arr.dtype.kind != "U" and arr.ndim == 1 else object
426+
427+
# Use one MaskedArray and mask all the places where the param is not
428+
# applicable for that candidate (which may not contain all the params).
429+
ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype)
430+
for index, value in param_result.items():
431+
# Setting the value at an index unmasks that index
432+
ma[index] = value
433+
yield (key, ma)
434+
435+
386436
class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
387437
"""Abstract base class for hyper parameter search with cross-validation."""
388438

@@ -1082,45 +1132,9 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
10821132

10831133
_store("fit_time", out["fit_time"])
10841134
_store("score_time", out["score_time"])
1085-
param_results = defaultdict(dict)
1086-
for cand_idx, params in enumerate(candidate_params):
1087-
for name, value in params.items():
1088-
param_results["param_%s" % name][cand_idx] = value
1089-
for key, param_result in param_results.items():
1090-
param_list = list(param_result.values())
1091-
try:
1092-
with warnings.catch_warnings():
1093-
warnings.filterwarnings(
1094-
"ignore",
1095-
message="in the future the `.dtype` attribute",
1096-
category=DeprecationWarning,
1097-
)
1098-
# Warning raised by NumPy 1.20+
1099-
arr_dtype = np.result_type(*param_list)
1100-
except (TypeError, ValueError):
1101-
arr_dtype = np.dtype(object)
1102-
else:
1103-
if any(np.min_scalar_type(x) == object for x in param_list):
1104-
# `np.result_type` might get thrown off by `.dtype` properties
1105-
# (which some estimators have).
1106-
# If finding the result dtype this way would give object,
1107-
# then we use object.
1108-
# https://github.com/scikit-learn/scikit-learn/issues/29157
1109-
arr_dtype = np.dtype(object)
1110-
if len(param_list) == n_candidates and arr_dtype != object:
1111-
# Exclude `object` else the numpy constructor might infer a list of
1112-
# tuples to be a 2d array.
1113-
results[key] = MaskedArray(param_list, mask=False, dtype=arr_dtype)
1114-
else:
1115-
# Use one MaskedArray and mask all the places where the param is not
1116-
# applicable for that candidate (which may not contain all the params).
1117-
ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype)
1118-
for index, value in param_result.items():
1119-
# Setting the value at an index unmasks that index
1120-
ma[index] = value
1121-
results[key] = ma
1122-
11231135
# Store a list of param dicts at the key 'params'
1136+
for param, ma in _yield_masked_array_for_each_param(candidate_params):
1137+
results[param] = ma
11241138
results["params"] = candidate_params
11251139

11261140
test_scores_dict = _normalize_score_results(out["test_scores"])

sklearn/model_selection/tests/test_search.py

Lines changed: 116 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,20 @@
6060
StratifiedShuffleSplit,
6161
train_test_split,
6262
)
63-
from sklearn.model_selection._search import BaseSearchCV
63+
from sklearn.model_selection._search import (
64+
BaseSearchCV,
65+
_yield_masked_array_for_each_param,
66+
)
6467
from sklearn.model_selection.tests.common import OneTimeSplitter
6568
from sklearn.naive_bayes import ComplementNB
6669
from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor
67-
from sklearn.pipeline import Pipeline
68-
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
70+
from sklearn.pipeline import Pipeline, make_pipeline
71+
from sklearn.preprocessing import (
72+
OneHotEncoder,
73+
OrdinalEncoder,
74+
SplineTransformer,
75+
StandardScaler,
76+
)
6977
from sklearn.svm import SVC, LinearSVC
7078
from sklearn.tests.metadata_routing_common import (
7179
ConsumingScorer,
@@ -2718,3 +2726,108 @@ def test_search_with_estimators_issue_29157():
27182726
grid_search = GridSearchCV(pipe, grid_params, cv=2)
27192727
grid_search.fit(X, y)
27202728
assert grid_search.cv_results_["param_enc__enc"].dtype == object
2729+
2730+
2731+
def test_cv_results_multi_size_array():
2732+
"""Check that GridSearchCV works with params that are arrays of different sizes.
2733+
2734+
Non-regression test for #29277.
2735+
"""
2736+
n_features = 10
2737+
X, y = make_classification(n_features=10)
2738+
2739+
spline_reg_pipe = make_pipeline(
2740+
SplineTransformer(extrapolation="periodic"),
2741+
LogisticRegression(),
2742+
)
2743+
2744+
n_knots_list = [n_features * i for i in [10, 11, 12]]
2745+
knots_list = [
2746+
np.linspace(0, np.pi * 2, n_knots).reshape((-1, n_features))
2747+
for n_knots in n_knots_list
2748+
]
2749+
spline_reg_pipe_cv = GridSearchCV(
2750+
estimator=spline_reg_pipe,
2751+
param_grid={
2752+
"splinetransformer__knots": knots_list,
2753+
},
2754+
)
2755+
2756+
spline_reg_pipe_cv.fit(X, y)
2757+
assert (
2758+
spline_reg_pipe_cv.cv_results_["param_splinetransformer__knots"].dtype == object
2759+
)
2760+
2761+
2762+
# Construct these outside the tests so that the same object is used
2763+
# for both input and `expected`
2764+
one_hot_encoder = OneHotEncoder()
2765+
ordinal_encoder = OrdinalEncoder()
2766+
2767+
# If we construct this directly via `MaskedArray`, the list of tuples
2768+
# gets auto-converted to a 2D array.
2769+
ma_with_tuples = np.ma.MaskedArray(np.empty(2), mask=True, dtype=object)
2770+
ma_with_tuples[0] = (1, 2)
2771+
ma_with_tuples[1] = (3, 4)
2772+
2773+
2774+
@pytest.mark.parametrize(
2775+
("candidate_params", "expected"),
2776+
[
2777+
pytest.param(
2778+
[{"foo": 1}, {"foo": 2}],
2779+
[
2780+
("param_foo", np.ma.MaskedArray(np.array([1, 2]))),
2781+
],
2782+
id="simple numeric, single param",
2783+
),
2784+
pytest.param(
2785+
[{"foo": 1, "bar": 3}, {"foo": 2, "bar": 4}, {"foo": 3}],
2786+
[
2787+
("param_foo", np.ma.MaskedArray(np.array([1, 2, 3]))),
2788+
(
2789+
"param_bar",
2790+
np.ma.MaskedArray(np.array([3, 4, 0]), mask=[False, False, True]),
2791+
),
2792+
],
2793+
id="simple numeric, one param is missing in one round",
2794+
),
2795+
pytest.param(
2796+
[{"foo": [[1], [2], [3]]}, {"foo": [[1], [2]]}],
2797+
[
2798+
(
2799+
"param_foo",
2800+
np.ma.MaskedArray([[[1], [2], [3]], [[1], [2]]], dtype=object),
2801+
),
2802+
],
2803+
id="lists of different lengths",
2804+
),
2805+
pytest.param(
2806+
[{"foo": (1, 2)}, {"foo": (3, 4)}],
2807+
[
2808+
(
2809+
"param_foo",
2810+
ma_with_tuples,
2811+
),
2812+
],
2813+
id="lists tuples",
2814+
),
2815+
pytest.param(
2816+
[{"foo": ordinal_encoder}, {"foo": one_hot_encoder}],
2817+
[
2818+
(
2819+
"param_foo",
2820+
np.ma.MaskedArray([ordinal_encoder, one_hot_encoder], dtype=object),
2821+
),
2822+
],
2823+
id="estimators",
2824+
),
2825+
],
2826+
)
2827+
def test_yield_masked_array_for_each_param(candidate_params, expected):
2828+
result = list(_yield_masked_array_for_each_param(candidate_params))
2829+
for (key, value), (expected_key, expected_value) in zip(result, expected):
2830+
assert key == expected_key
2831+
assert value.dtype == expected_value.dtype
2832+
np.testing.assert_array_equal(value, expected_value)
2833+
np.testing.assert_array_equal(value.mask, expected_value.mask)

0 commit comments

Comments
 (0)
0