From 62db7fa7cd13237456f8a68985a60d415a2469ca Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Mon, 24 Jul 2023 23:36:23 +0200
Subject: [PATCH 1/6] HalvingRandomSearchCV with list format input and test
 attempted

---
 .../_search_successive_halving.py             |  8 +-
 sklearn/model_selection/tests/test_search.py  | 18 ++++-
 .../tests/test_successive_halving.py          | 78 ++++++++++++++++++-
 3 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py
index 38fe1e0e7a15c..708092d09a2a5 100644
--- a/sklearn/model_selection/_search_successive_halving.py
+++ b/sklearn/model_selection/_search_successive_halving.py
@@ -750,11 +750,13 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving):
         Either estimator needs to provide a ``score`` function,
         or ``scoring`` must be passed.
 
-    param_distributions : dict
-        Dictionary with parameters names (string) as keys and distributions
+    param_distributions : dict or list of dicts
+        Dictionary with parameter names (`str`) as keys and distributions
         or lists of parameters to try. Distributions must provide a ``rvs``
         method for sampling (such as those from scipy.stats.distributions).
         If a list is given, it is sampled uniformly.
+        If a list of dicts is given, first a dict is sampled uniformly, and
+        then a parameter is sampled using that dict as above.
 
     n_candidates : "exhaust" or int, default="exhaust"
         The number of candidate parameters to sample, at the first
@@ -1024,7 +1026,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving):
 
     _parameter_constraints: dict = {
         **BaseSuccessiveHalving._parameter_constraints,
-        "param_distributions": [dict],
+        "param_distributions": [dict, list],
         "n_candidates": [
             Interval(Integral, 0, None, closed="neither"),
             StrOptions({"exhaust"}),
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 6ea52049f3ced..23fc9d2bb4988 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -885,9 +885,21 @@ def test_param_sampler():
 def check_cv_results_array_types(search, param_keys, score_keys):
     # Check if the search `cv_results`'s array are of correct types
     cv_results = search.cv_results_
-    assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys)
-    assert all(cv_results[key].dtype == object for key in param_keys)
-    assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys)
+    assert all(
+        isinstance(cv_results[param], np.ma.MaskedArray)
+        for param in param_keys
+        if param not in {"iter", "n_resources"}
+    )
+    assert all(
+        cv_results[key].dtype == object
+        for key in param_keys
+        if key not in {"iter", "n_resources"}
+    )
+    assert not any(
+        isinstance(cv_results[key], np.ma.MaskedArray)
+        for key in score_keys
+        if key not in {"iter", "n_resources"}
+    )
     assert all(
         cv_results[key].dtype == np.float64
         for key in score_keys
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py
index d4cc09ee01044..e098fda81c4c8 100644
--- a/sklearn/model_selection/tests/test_successive_halving.py
+++ b/sklearn/model_selection/tests/test_successive_halving.py
@@ -23,7 +23,11 @@
     _SubsampleMetaSplitter,
     _top_k,
 )
-from sklearn.svm import LinearSVC
+from sklearn.model_selection.tests.test_search import (
+    check_cv_results_array_types,
+    check_cv_results_keys,
+)
+from sklearn.svm import SVC, LinearSVC
 
 
 class FastClassifier(DummyClassifier):
@@ -777,3 +781,75 @@ def test_select_best_index(SearchCV):
     # we expect the index of 'i'
     best_index = SearchCV._select_best_index(None, None, results)
     assert best_index == 8
+
+
+def test_halving_grid_search_cv_results():
+    X, y = make_classification(n_samples=50, n_features=4, random_state=42)
+
+    n_splits = 3
+    n_proportion = 8
+
+    params = [
+        {"kernel": ["rbf"], "C": [0.01, 1.0], "gamma": ["scale", "auto"]},
+        {"kernel": ["poly"], "degree": [2, 3]},
+    ]
+    param_keys = (
+        "param_C",
+        "param_degree",
+        "param_gamma",
+        "param_kernel",
+        "iter",
+        "n_resources",
+    )
+    score_keys = (
+        "mean_test_score",
+        "mean_train_score",
+        "rank_test_score",
+        "split0_test_score",
+        "split1_test_score",
+        "split2_test_score",
+        "split0_train_score",
+        "split1_train_score",
+        "split2_train_score",
+        "std_test_score",
+        "std_train_score",
+        "mean_fit_time",
+        "std_fit_time",
+        "mean_score_time",
+        "std_score_time",
+    )
+    # n_cand = n_search_iter
+    n_cand = n_proportion
+
+    search = HalvingGridSearchCV(
+        SVC(),
+        cv=n_splits,
+        param_grid=params,
+        return_train_score=True,
+    )
+    search.fit(X, y)
+    cv_results = search.cv_results_
+    # Check results structure
+    check_cv_results_array_types(search, param_keys, score_keys)
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
+    len(search.cv_results_["params"])
+
+
+"""    assert all(
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and not cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "poly"
+    )
+    assert all(
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "rbf"
+    )"""

From 6c7e4cb13d1f58b9b0ad676a04c7c2ad01623d22 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Tue, 25 Jul 2023 00:59:03 +0200
Subject: [PATCH 2/6] swapped to HalvingRandomSearchCV, test commented out, now
 fails

---
 sklearn/model_selection/tests/test_search.py  |  6 +----
 .../tests/test_successive_halving.py          | 24 +++++++------------
 2 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 23fc9d2bb4988..54556a4761ae5 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -895,11 +895,7 @@ def check_cv_results_array_types(search, param_keys, score_keys):
         for key in param_keys
         if key not in {"iter", "n_resources"}
     )
-    assert not any(
-        isinstance(cv_results[key], np.ma.MaskedArray)
-        for key in score_keys
-        if key not in {"iter", "n_resources"}
-    )
+    assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys)
     assert all(
         cv_results[key].dtype == np.float64
         for key in score_keys
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py
index e098fda81c4c8..b9b01e4dfe444 100644
--- a/sklearn/model_selection/tests/test_successive_halving.py
+++ b/sklearn/model_selection/tests/test_successive_halving.py
@@ -23,11 +23,7 @@
     _SubsampleMetaSplitter,
     _top_k,
 )
-from sklearn.model_selection.tests.test_search import (
-    check_cv_results_array_types,
-    check_cv_results_keys,
-)
-from sklearn.svm import SVC, LinearSVC
+from sklearn.svm import LinearSVC
 
 
 class FastClassifier(DummyClassifier):
@@ -783,14 +779,14 @@ def test_select_best_index(SearchCV):
     assert best_index == 8
 
 
-def test_halving_grid_search_cv_results():
+"""def test_halving_random_search_cv_results():
     X, y = make_classification(n_samples=50, n_features=4, random_state=42)
 
     n_splits = 3
-    n_proportion = 8
+    n_proportion = 6
 
     params = [
-        {"kernel": ["rbf"], "C": [0.01, 1.0], "gamma": ["scale", "auto"]},
+        {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)},
         {"kernel": ["poly"], "degree": [2, 3]},
     ]
     param_keys = (
@@ -818,13 +814,12 @@ def test_halving_grid_search_cv_results():
         "mean_score_time",
         "std_score_time",
     )
-    # n_cand = n_search_iter
     n_cand = n_proportion
 
-    search = HalvingGridSearchCV(
+    search = HalvingRandomSearchCV(
         SVC(),
         cv=n_splits,
-        param_grid=params,
+        param_distributions=params,
         return_train_score=True,
     )
     search.fit(X, y)
@@ -834,14 +829,13 @@ def test_halving_grid_search_cv_results():
     check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
     len(search.cv_results_["params"])
 
-
-"""    assert all(
+    assert all(
         (
             not cv_results["param_C"].mask[i]
             and not cv_results["param_gamma"].mask[i]
             and not cv_results["param_degree"].mask[i]
         )
-        for i in range(n_candidates)
+        for i in range(n_cand)
         if cv_results["param_kernel"][i] == "poly"
     )
     assert all(
@@ -850,6 +844,6 @@ def test_halving_grid_search_cv_results():
             and not cv_results["param_gamma"].mask[i]
             and cv_results["param_degree"].mask[i]
         )
-        for i in range(n_candidates)
+        for i in range(n_cand)
         if cv_results["param_kernel"][i] == "rbf"
     )"""

From 6227903b9f440c14988f98e144599fb2746876d6 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Wed, 26 Jul 2023 20:50:05 +0200
Subject: [PATCH 3/6] changes after review

---
 doc/whats_new/v1.3.rst                        |  4 ++
 sklearn/model_selection/tests/test_search.py  | 19 +++-------
 .../tests/test_successive_halving.py          | 37 +++++++++----------
 3 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 8d39ca2fed143..6ba517f4e24a9 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -23,6 +23,10 @@ Changelog
   :attr:`sklearn.neighbors.KDTree.valid_metrics` as public class attributes.
   :pr:`26754` by :user:`Julien Jerphanion <jjerphan>`.
 
+- |Fix| :class:`sklearn.model_selection.HalvingRandomSearchCV` no longer raises
+  when the input to the `param_distributions` parameter is a list of dicts.
+  :pr:`26893` by :user:`Stefanie Senger <StefanieSenger>`.
+
 .. _changes_1_3:
 
 Version 1.3.0
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 54556a4761ae5..5bdfa0e118571 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -885,16 +885,8 @@ def test_param_sampler():
 def check_cv_results_array_types(search, param_keys, score_keys):
     # Check if the search `cv_results`'s array are of correct types
     cv_results = search.cv_results_
-    assert all(
-        isinstance(cv_results[param], np.ma.MaskedArray)
-        for param in param_keys
-        if param not in {"iter", "n_resources"}
-    )
-    assert all(
-        cv_results[key].dtype == object
-        for key in param_keys
-        if key not in {"iter", "n_resources"}
-    )
+    assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys)
+    assert all(cv_results[key].dtype == object for key in param_keys)
     assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys)
     assert all(
         cv_results[key].dtype == np.float64
@@ -908,11 +900,10 @@ def check_cv_results_array_types(search, param_keys, score_keys):
         assert cv_results["rank_test_%s" % key].dtype == np.int32
 
 
-def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand):
+def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()):
     # Test the search.cv_results_ contains all the required results
-    assert_array_equal(
-        sorted(cv_results.keys()), sorted(param_keys + score_keys + ("params",))
-    )
+    all_keys = param_keys + score_keys + extra_keys
+    assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",)))
     assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys)
 
 
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py
index b9b01e4dfe444..cf9e467f1d716 100644
--- a/sklearn/model_selection/tests/test_successive_halving.py
+++ b/sklearn/model_selection/tests/test_successive_halving.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import pytest
-from scipy.stats import norm, randint
+from scipy.stats import expon, norm, randint
 
 from sklearn.datasets import make_classification
 from sklearn.dummy import DummyClassifier
@@ -23,7 +23,11 @@
     _SubsampleMetaSplitter,
     _top_k,
 )
-from sklearn.svm import LinearSVC
+from sklearn.model_selection.tests.test_search import (
+    check_cv_results_array_types,
+    check_cv_results_keys,
+)
+from sklearn.svm import SVC, LinearSVC
 
 
 class FastClassifier(DummyClassifier):
@@ -779,11 +783,8 @@ def test_select_best_index(SearchCV):
     assert best_index == 8
 
 
-"""def test_halving_random_search_cv_results():
-    X, y = make_classification(n_samples=50, n_features=4, random_state=42)
-
-    n_splits = 3
-    n_proportion = 6
+def test_halving_random_search_cv_results():
+    X, y = make_classification(n_samples=150, n_features=4, random_state=42)
 
     params = [
         {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)},
@@ -794,8 +795,6 @@ def test_select_best_index(SearchCV):
         "param_degree",
         "param_gamma",
         "param_kernel",
-        "iter",
-        "n_resources",
     )
     score_keys = (
         "mean_test_score",
@@ -814,28 +813,26 @@ def test_select_best_index(SearchCV):
         "mean_score_time",
         "std_score_time",
     )
-    n_cand = n_proportion
+    extra_keys = ("n_resources", "iter")
 
     search = HalvingRandomSearchCV(
-        SVC(),
-        cv=n_splits,
-        param_distributions=params,
-        return_train_score=True,
+        SVC(), cv=3, param_distributions=params, return_train_score=True, random_state=0
     )
     search.fit(X, y)
+    n_candidates = sum(search.n_candidates_)
     cv_results = search.cv_results_
     # Check results structure
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys)
     check_cv_results_array_types(search, param_keys, score_keys)
-    check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
     len(search.cv_results_["params"])
 
     assert all(
         (
-            not cv_results["param_C"].mask[i]
-            and not cv_results["param_gamma"].mask[i]
+            cv_results["param_C"].mask[i]
+            and cv_results["param_gamma"].mask[i]
             and not cv_results["param_degree"].mask[i]
         )
-        for i in range(n_cand)
+        for i in range(n_candidates)
         if cv_results["param_kernel"][i] == "poly"
     )
     assert all(
@@ -844,6 +841,6 @@ def test_select_best_index(SearchCV):
             and not cv_results["param_gamma"].mask[i]
             and cv_results["param_degree"].mask[i]
         )
-        for i in range(n_cand)
+        for i in range(n_candidates)
         if cv_results["param_kernel"][i] == "rbf"
-    )"""
+    )

From 98f941287e07d3be5a472ed48b1e15aff76325ef Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Thu, 27 Jul 2023 14:19:54 +0200
Subject: [PATCH 4/6] bug fix and cleanup in test_grid_search_cv_results and
 test_random_search_cv_results

---
 sklearn/model_selection/tests/test_search.py  | 32 +++++++++----------
 .../tests/test_successive_halving.py          |  1 -
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 5bdfa0e118571..9c828a3ea0075 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -910,7 +910,6 @@ def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys
 def test_grid_search_cv_results():
     X, y = make_classification(n_samples=50, n_features=4, random_state=42)
 
-    n_splits = 3
     n_grid_points = 6
     params = [
         dict(
@@ -948,9 +947,7 @@ def test_grid_search_cv_results():
     )
     n_candidates = n_grid_points
 
-    search = GridSearchCV(
-        SVC(), cv=n_splits, param_grid=params, return_train_score=True
-    )
+    search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True)
     search.fit(X, y)
     cv_results = search.cv_results_
     # Check if score and timing are reasonable
@@ -966,17 +963,20 @@ def test_grid_search_cv_results():
     check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
     # Check masking
     cv_results = search.cv_results_
-    n_candidates = len(search.cv_results_["params"])
-    assert all(
+
+    poly_results = [
         (
             cv_results["param_C"].mask[i]
             and cv_results["param_gamma"].mask[i]
             and not cv_results["param_degree"].mask[i]
         )
         for i in range(n_candidates)
-        if cv_results["param_kernel"][i] == "linear"
-    )
-    assert all(
+        if cv_results["param_kernel"][i] == "poly"
+    ]
+    assert all(poly_results)
+    assert len(poly_results)
+
+    rbf_results = [
         (
             not cv_results["param_C"].mask[i]
             and not cv_results["param_gamma"].mask[i]
@@ -984,13 +984,14 @@ def test_grid_search_cv_results():
         )
         for i in range(n_candidates)
         if cv_results["param_kernel"][i] == "rbf"
-    )
+    ]
+    assert all(rbf_results)
+    assert len(rbf_results)
 
 
 def test_random_search_cv_results():
     X, y = make_classification(n_samples=50, n_features=4, random_state=42)
 
-    n_splits = 3
     n_search_iter = 30
 
     params = [
@@ -1015,12 +1016,12 @@ def test_random_search_cv_results():
         "mean_score_time",
         "std_score_time",
     )
-    n_cand = n_search_iter
+    n_candidates = n_search_iter
 
     search = RandomizedSearchCV(
         SVC(),
         n_iter=n_search_iter,
-        cv=n_splits,
+        cv=3,
         param_distributions=params,
         return_train_score=True,
     )
@@ -1028,8 +1029,7 @@ def test_random_search_cv_results():
     cv_results = search.cv_results_
     # Check results structure
     check_cv_results_array_types(search, param_keys, score_keys)
-    check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
-    n_candidates = len(search.cv_results_["params"])
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
     assert all(
         (
             cv_results["param_C"].mask[i]
@@ -1037,7 +1037,7 @@ def test_random_search_cv_results():
             and not cv_results["param_degree"].mask[i]
         )
         for i in range(n_candidates)
-        if cv_results["param_kernel"][i] == "linear"
+        if cv_results["param_kernel"][i] == "poly"
     )
     assert all(
         (
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py
index cf9e467f1d716..da5de99070246 100644
--- a/sklearn/model_selection/tests/test_successive_halving.py
+++ b/sklearn/model_selection/tests/test_successive_halving.py
@@ -824,7 +824,6 @@ def test_halving_random_search_cv_results():
     # Check results structure
     check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys)
     check_cv_results_array_types(search, param_keys, score_keys)
-    len(search.cv_results_["params"])
 
     assert all(
         (

From 4d90784dad25dae56a924c0d0b1fdd8692d78894 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Fri, 28 Jul 2023 15:22:04 +0200
Subject: [PATCH 5/6] specific length test

---
 sklearn/model_selection/tests/test_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 9c828a3ea0075..04c3f1f156fab 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -974,7 +974,7 @@ def test_grid_search_cv_results():
         if cv_results["param_kernel"][i] == "poly"
     ]
     assert all(poly_results)
-    assert len(poly_results)
+    assert len(poly_results) == 2
 
     rbf_results = [
         (
@@ -986,7 +986,7 @@ def test_grid_search_cv_results():
         if cv_results["param_kernel"][i] == "rbf"
     ]
     assert all(rbf_results)
-    assert len(rbf_results)
+    assert len(rbf_results) == 4
 
 
 def test_random_search_cv_results():

From 564ab3558eb45f1198f17f62653b05a682a9fcd2 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com>
Date: Fri, 4 Aug 2023 13:30:26 +0200
Subject: [PATCH 6/6] Update
 sklearn/model_selection/tests/test_successive_halving.py

Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
---
 sklearn/model_selection/tests/test_successive_halving.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py
index da5de99070246..6c89f89afa684 100644
--- a/sklearn/model_selection/tests/test_successive_halving.py
+++ b/sklearn/model_selection/tests/test_successive_halving.py
@@ -783,7 +783,10 @@ def test_select_best_index(SearchCV):
     assert best_index == 8
 
 
-def test_halving_random_search_cv_results():
+def test_halving_random_search_list_of_dicts():
+    """Check the behaviour of the `HalvingRandomSearchCV` with `param_distributions`
+    being a list of dictionaries.
+    """
     X, y = make_classification(n_samples=150, n_features=4, random_state=42)
 
     params = [