From 8c64bba6b0c98999e8a3772a87521d6d96bb8cc1 Mon Sep 17 00:00:00 2001
From: Dmitry Nesterov
Date: Sat, 8 Apr 2023 13:45:01 +0300
Subject: [PATCH 1/4] MAINT Parameters validation for sklearn.model_selection.cross_validate

---
 sklearn/model_selection/_validation.py       | 29 +++++++++++++++----
 .../model_selection/tests/test_validation.py | 10 +++++--
 sklearn/tests/test_public_functions.py       |  1 +
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 7951417845bff..9dd5628f394b4 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -15,6 +15,7 @@
 import numbers
 import time
 from functools import partial
+from numbers import Real
 from traceback import format_exc
 from contextlib import suppress
 from collections import Counter
@@ -29,6 +30,11 @@
 from ..utils.validation import _num_samples
 from ..utils.parallel import delayed, Parallel
 from ..utils.metaestimators import _safe_split
+from ..utils._param_validation import (
+    HasMethods,
+    StrOptions,
+    validate_params,
+)
 from ..metrics import check_scoring
 from ..metrics._scorer import _check_multimetric_scoring, _MultimetricScorer
 from ..exceptions import FitFailedWarning
@@ -46,6 +52,24 @@
 ]
 
 
+@validate_params(
+    {
+        "estimator": [HasMethods("fit")],
+        "X": ["array-like"],
+        "y": ["array-like", None],
+        "groups": ["array-like", None],
+        "scoring": [str, callable, list, tuple, dict, None],
+        "cv": ["cv_object"],
+        "n_jobs": [int, None],
+        "verbose": ["verbose"],
+        "fit_params": [dict, None],
+        "pre_dispatch": [int, str],
+        "return_train_score": ["boolean"],
+        "return_estimator": ["boolean"],
+        "return_indices": ["boolean"],
+        "error_score": [StrOptions({"raise"}), Real],
+    }
+)
 def cross_validate(
     estimator,
     X,
@@ -141,11 +165,6 @@
         explosion of memory consumption when more jobs get dispatched
         than CPUs can process. This parameter can be:
 
-            - None, in which case all the jobs are immediately
-              created and spawned. Use this for lightweight and
-              fast-running jobs, to avoid delays due to on-demand
-              spawning of the jobs
-
             - An int, giving the exact number of total jobs that are
               spawned
 
diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py
index 1ed42d555309a..a2ea74dd909dd 100644
--- a/sklearn/model_selection/tests/test_validation.py
+++ b/sklearn/model_selection/tests/test_validation.py
@@ -355,7 +355,7 @@ def test_cross_validate_invalid_scoring_param():
         cross_validate(estimator, X, y, scoring=[[make_scorer(precision_score)]])
 
     error_message_regexp = (
-        ".*scoring is invalid.*Refer to the scoring glossary for details:.*"
+        "The 'scoring' parameter of cross_validate must be .*. Got .* instead.*"
     )
 
     # Empty dict should raise invalid scoring error
@@ -2104,10 +2104,14 @@ def test_fit_and_score_failing():
         "error_score must be the string 'raise' or a numeric value. (Hint: if "
         "using 'raise', please make sure that it has been spelled correctly.)"
     )
-    with pytest.raises(ValueError, match=error_message):
+
+    error_message_cross_validate = (
+        "The 'error_score' parameter of cross_validate must be .*. Got .* instead."
+    )
+    with pytest.raises(ValueError, match=error_message_cross_validate):
         cross_validate(failing_clf, X, cv=3, error_score="unvalid-string")
 
-    with pytest.raises(ValueError, match=error_message):
+    with pytest.raises(ValueError, match=error_message_cross_validate):
         cross_val_score(failing_clf, X, cv=3, error_score="unvalid-string")
 
     with pytest.raises(ValueError, match=error_message):
diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py
index a4a9dbd9db739..34951942aace9 100644
--- a/sklearn/tests/test_public_functions.py
+++ b/sklearn/tests/test_public_functions.py
@@ -231,6 +231,7 @@ def _check_function_param_validation(
     "sklearn.metrics.top_k_accuracy_score",
     "sklearn.metrics.zero_one_loss",
     "sklearn.model_selection.train_test_split",
+    "sklearn.model_selection.cross_validate",
     "sklearn.preprocessing.add_dummy_feature",
     "sklearn.preprocessing.binarize",
     "sklearn.preprocessing.label_binarize",

From 5ea24c5f4723f51430e1ca886fe0db96e504bfde Mon Sep 17 00:00:00 2001
From: Dmitry Nesterov
Date: Wed, 19 Apr 2023 11:18:09 +0300
Subject: [PATCH 2/4] Added sparse test and validation for cross_validate

Added sparse type for X matrix in cross_validate and additional fixes in parameters validation.
---
 sklearn/model_selection/_validation.py       | 19 ++++++++++++++-----
 .../model_selection/tests/test_validation.py | 19 ++++++++++++++++---
 sklearn/tests/test_public_functions.py       |  2 +-
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 9dd5628f394b4..d3b08169bc058 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -32,10 +32,12 @@
 from ..utils.metaestimators import _safe_split
 from ..utils._param_validation import (
     HasMethods,
+    Integral,
     StrOptions,
     validate_params,
 )
 from ..metrics import check_scoring
+from ..metrics import get_scorer_names
 from ..metrics._scorer import _check_multimetric_scoring, _MultimetricScorer
 from ..exceptions import FitFailedWarning
 from ._split import check_cv
@@ -55,15 +57,22 @@
 @validate_params(
     {
         "estimator": [HasMethods("fit")],
-        "X": ["array-like"],
+        "X": ["array-like", "sparse matrix"],
         "y": ["array-like", None],
         "groups": ["array-like", None],
-        "scoring": [str, callable, list, tuple, dict, None],
+        "scoring": [
+            StrOptions(set(get_scorer_names())),
+            callable,
+            list,
+            tuple,
+            dict,
+            None,
+        ],
         "cv": ["cv_object"],
-        "n_jobs": [int, None],
+        "n_jobs": [Integral, None],
         "verbose": ["verbose"],
         "fit_params": [dict, None],
-        "pre_dispatch": [int, str],
+        "pre_dispatch": [Integral, str],
         "return_train_score": ["boolean"],
         "return_estimator": ["boolean"],
         "return_indices": ["boolean"],
@@ -96,7 +105,7 @@
     estimator : estimator object implementing 'fit'
         The object to use to fit the data.
 
-    X : array-like of shape (n_samples, n_features)
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
         The data to fit. Can be for example a list, or an array.
 
     y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py
index a2ea74dd909dd..ea69fa032e27c 100644
--- a/sklearn/model_selection/tests/test_validation.py
+++ b/sklearn/model_selection/tests/test_validation.py
@@ -382,7 +382,7 @@ def test_cross_validate_invalid_scoring_param():
     with pytest.warns(UserWarning, match=warning_message):
         cross_validate(estimator, X, y, scoring={"foo": multiclass_scorer})
 
-    with pytest.raises(ValueError, match="'mse' is not a valid scoring value."):
+    with pytest.raises(ValueError, match=error_message_regexp):
         cross_validate(SVC(), X, y, scoring="mse")
 
 
@@ -405,7 +405,8 @@ def test_cross_validate_nested_estimator():
     assert all(isinstance(estimator, Pipeline) for estimator in estimators)
 
 
-def test_cross_validate():
+@pytest.mark.parametrize("use_sparse", [False, True])
+def test_cross_validate(use_sparse: bool):
     # Compute train and test mse/r2 scores
     cv = KFold()
 
@@ -417,6 +418,10 @@ def test_cross_validate():
     X_clf, y_clf = make_classification(n_samples=30, random_state=0)
     clf = SVC(kernel="linear", random_state=0)
 
+    if use_sparse:
+        X_reg = csr_matrix(X_reg)
+        X_clf = csr_matrix(X_clf)
+
     for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)):
         # It's okay to evaluate regression metrics on classification too
         mse_scorer = check_scoring(est, scoring="neg_mean_squared_error")
@@ -510,7 +515,15 @@ def check_cross_validate_single_metric(clf, X, y, scores, cv):
         clf, X, y, scoring="neg_mean_squared_error", return_estimator=True, cv=cv
     )
     for k, est in enumerate(mse_scores_dict["estimator"]):
-        assert_almost_equal(est.coef_, fitted_estimators[k].coef_)
+        est_coef = est.coef_.copy()
+        if isinstance(est_coef, (csr_matrix, coo_matrix)):
+            est_coef = est_coef.toarray()
+
+        fitted_est_coef = fitted_estimators[k].coef_.copy()
+        if isinstance(fitted_est_coef, (csr_matrix, coo_matrix)):
+            fitted_est_coef = fitted_est_coef.toarray()
+
+        assert_almost_equal(est_coef, fitted_est_coef)
         assert_almost_equal(est.intercept_, fitted_estimators[k].intercept_)
 
 
diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py
index 34951942aace9..dc517e81b26da 100644
--- a/sklearn/tests/test_public_functions.py
+++ b/sklearn/tests/test_public_functions.py
@@ -230,8 +230,8 @@ def _check_function_param_validation(
     "sklearn.metrics.roc_curve",
     "sklearn.metrics.top_k_accuracy_score",
     "sklearn.metrics.zero_one_loss",
-    "sklearn.model_selection.train_test_split",
     "sklearn.model_selection.cross_validate",
+    "sklearn.model_selection.train_test_split",
     "sklearn.preprocessing.add_dummy_feature",
     "sklearn.preprocessing.binarize",
     "sklearn.preprocessing.label_binarize",

From 98b9eb823fcd310cd7a7c69ef687e87934a953f3 Mon Sep 17 00:00:00 2001
From: jeremiedbb
Date: Wed, 19 Apr 2023 18:21:36 +0200
Subject: [PATCH 3/4] cln tests now covered by common tests

---
 sklearn/model_selection/tests/test_validation.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py
index ea69fa032e27c..881ee909724de 100644
--- a/sklearn/model_selection/tests/test_validation.py
+++ b/sklearn/model_selection/tests/test_validation.py
@@ -354,18 +354,10 @@ def test_cross_validate_invalid_scoring_param():
     with pytest.raises(ValueError, match=error_message_regexp):
         cross_validate(estimator, X, y, scoring=[[make_scorer(precision_score)]])
 
-    error_message_regexp = (
-        "The 'scoring' parameter of cross_validate must be .*. Got .* instead.*"
-    )
-
     # Empty dict should raise invalid scoring error
     with pytest.raises(ValueError, match="An empty dict"):
         cross_validate(estimator, X, y, scoring=(dict()))
 
-    # And so should any other invalid entry
-    with pytest.raises(ValueError, match=error_message_regexp):
-        cross_validate(estimator, X, y, scoring=5)
-
     multiclass_scorer = make_scorer(precision_recall_fscore_support)
 
     # Multiclass Scorers that return multiple values are not supported yet
@@ -382,9 +374,6 @@ def test_cross_validate_invalid_scoring_param():
     with pytest.warns(UserWarning, match=warning_message):
         cross_validate(estimator, X, y, scoring={"foo": multiclass_scorer})
 
-    with pytest.raises(ValueError, match=error_message_regexp):
-        cross_validate(SVC(), X, y, scoring="mse")
-
 
 def test_cross_validate_nested_estimator():
     # Non-regression test to ensure that nested
@@ -2121,8 +2110,6 @@ def test_fit_and_score_failing():
     error_message_cross_validate = (
         "The 'error_score' parameter of cross_validate must be .*. Got .* instead."
     )
-    with pytest.raises(ValueError, match=error_message_cross_validate):
-        cross_validate(failing_clf, X, cv=3, error_score="unvalid-string")
 
     with pytest.raises(ValueError, match=error_message_cross_validate):
         cross_val_score(failing_clf, X, cv=3, error_score="unvalid-string")

From 0d2f6dea8fd2182d6997f4a0214a8c41d657618c Mon Sep 17 00:00:00 2001
From: jeremiedbb
Date: Wed, 19 Apr 2023 18:27:29 +0200
Subject: [PATCH 4/4] nitpick

---
 sklearn/model_selection/tests/test_validation.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py
index 881ee909724de..2c275af617e40 100644
--- a/sklearn/model_selection/tests/test_validation.py
+++ b/sklearn/model_selection/tests/test_validation.py
@@ -10,6 +10,7 @@
 import pytest
 import numpy as np
 from scipy.sparse import coo_matrix, csr_matrix
+from scipy.sparse import issparse
 
 from sklearn.exceptions import FitFailedWarning
 from sklearn.model_selection.tests.test_search import FailingClassifier
@@ -505,11 +506,11 @@ def check_cross_validate_single_metric(clf, X, y, scores, cv):
     )
     for k, est in enumerate(mse_scores_dict["estimator"]):
         est_coef = est.coef_.copy()
-        if isinstance(est_coef, (csr_matrix, coo_matrix)):
+        if issparse(est_coef):
             est_coef = est_coef.toarray()
 
         fitted_est_coef = fitted_estimators[k].coef_.copy()
-        if isinstance(fitted_est_coef, (csr_matrix, coo_matrix)):
+        if issparse(fitted_est_coef):
             fitted_est_coef = fitted_est_coef.toarray()
 
         assert_almost_equal(est_coef, fitted_est_coef)