diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index 877c6c26dc8c1..4c650e2ea84ce 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -42,7 +42,12 @@ random sampling procedures.
   :func:`linear_model._sgd_fast._plain_sgd` which is used by
   :class:`linear_model.SGDRegressor` and :class:`linear_model.SGDClassifier`.
   The old condition did not disambiguate between training and validation set
   and had an effect of overscaling the error tolerance.
-  This has been fixed in :pr:`23798` by :user:`Harsh Agrawal <Harsh14901>`
+  This has been fixed in :pr:`23798` by :user:`Harsh Agrawal <Harsh14901>`.
+
+- |Fix| For :class:`model_selection.GridSearchCV` and
+  :class:`model_selection.RandomizedSearchCV` ranks corresponding to nan
+  scores will all be set to the maximum possible rank.
+  :pr:`24543` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 Changes impacting all modules
 -----------------------------
@@ -398,6 +403,11 @@ Changelog
   nan score is correctly set to the maximum possible rank, rather than
   `np.iinfo(np.int32).min`.
   :pr:`24141` by :user:`Loïc Estève <lesteve>`.
 
+- |Fix| For :class:`model_selection.GridSearchCV` and
+  :class:`model_selection.RandomizedSearchCV` ranks corresponding to nan
+  scores will all be set to the maximum possible rank.
+  :pr:`24543` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.multioutput`
 ..........................
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index 37b26eb1c72d3..6ccbae2abc611 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -965,13 +965,18 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
             results["std_%s" % key_name] = array_stds
 
             if rank:
-                # when input is nan, scipy >= 1.10 rankdata returns nan. To
-                # keep previous behaviour nans are set to be smaller than the
-                # minimum value in the array before ranking
-                min_array_means = min(array_means) - 1
-                array_means = np.nan_to_num(array_means, copy=True, nan=min_array_means)
-                rank_result = rankdata(-array_means, method="min")
-                rank_result = np.asarray(rank_result, dtype=np.int32)
+                # When the fit/scoring fails, `array_means` contains NaNs; we
+                # exclude them from the ranking process and consider them as
+                # tied with the worst performers.
+                if np.isnan(array_means).all():
+                    # All fit/scoring routines failed.
+                    rank_result = np.ones_like(array_means, dtype=np.int32)
+                else:
+                    min_array_means = np.nanmin(array_means) - 1
+                    array_means = np.nan_to_num(array_means, nan=min_array_means)
+                    rank_result = rankdata(-array_means, method="min").astype(
+                        np.int32, copy=False
+                    )
                 results["rank_%s" % key_name] = rank_result
 
         _store("fit_time", out["fit_time"])
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index b86dfbd77846f..194a5d7ea3ca1 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -1981,10 +1981,10 @@ def get_n_splits(self, *args, **kw):
 @pytest.mark.parametrize(
     "SearchCV, specialized_params",
     [
-        (GridSearchCV, {"param_grid": {"max_depth": [2, 3]}}),
+        (GridSearchCV, {"param_grid": {"max_depth": [2, 3, 5, 8]}}),
         (
             RandomizedSearchCV,
-            {"param_distributions": {"max_depth": [2, 3]}, "n_iter": 2},
+            {"param_distributions": {"max_depth": [2, 3, 5, 8]}, "n_iter": 4},
         ),
     ],
 )
@@ -2025,6 +2025,13 @@ def __call__(self, estimator, X, y):
     for msg, dataset in zip(warn_msg, set_with_warning):
         assert f"One or more of the {dataset} scores are non-finite" in str(msg.message)
 
+    # all non-finite scores should be equally ranked last
+    last_rank = grid.cv_results_["rank_test_score"].max()
+    non_finite_mask = np.isnan(grid.cv_results_["mean_test_score"])
+    assert_array_equal(grid.cv_results_["rank_test_score"][non_finite_mask], last_rank)
+    # all finite scores should be better ranked than the non-finite scores
+    assert np.all(grid.cv_results_["rank_test_score"][~non_finite_mask] < last_rank)
+
 
 def test_callable_multimetric_confusion_matrix():
     # Test callable with many metrics inserts the correct names and metrics
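As a quick illustration of the ranking rule the new `if rank:` branch implements, here is a standalone sketch. The helper name `rank_means` is invented for illustration and is not part of scikit-learn's API; it only assumes `numpy` and `scipy.stats.rankdata`, which `_search.py` already uses.

# Hypothetical helper mirroring the patched `if rank:` branch above;
# `rank_means` is illustrative only, not a scikit-learn function.
import numpy as np
from scipy.stats import rankdata


def rank_means(array_means):
    """Rank mean scores in descending order; NaN scores tie for the worst rank."""
    array_means = np.asarray(array_means, dtype=np.float64)
    if np.isnan(array_means).all():
        # Every fit/scoring routine failed: all candidates tie at rank 1.
        return np.ones_like(array_means, dtype=np.int32)
    # Replace NaNs with a value below the finite minimum so that, once
    # negated for descending ranking, they land behind every finite score.
    min_array_means = np.nanmin(array_means) - 1
    filled = np.nan_to_num(array_means, nan=min_array_means)
    return rankdata(-filled, method="min").astype(np.int32, copy=False)


print(rank_means([0.9, np.nan, 0.7, np.nan]))  # -> [1 3 2 3]
print(rank_means([np.nan, np.nan, np.nan]))    # -> [1 1 1]

Both NaN candidates share rank 3 (the maximum possible rank) in the first call, which is exactly the property the new assertions in `test_search.py` check via `rank_test_score`.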