diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 9d9cf3c95b450..25865665ccdc0 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -76,6 +76,22 @@ Changelog Setting a transformer to "passthrough" will pass the features unchanged. :pr:`20860` by :user:`Shubhraneel Pal `. +:mod:`sklearn.tests` +....................... + +- |Enhancement| Added a common test: :func:`test_negative_sample_weight_support`, + which checks all estimators with `sample_weight` parameter in `fit` + in `_tested_estimators()` in case of using negative + `sample_weight` in `fit`. Added a new tag: `"allow_negative_sample_weight"` + for the test. If the estimator supports the negative case + (default tag is `True`) there should be no error in case of negative values + in `sample_weight` in `fit`. Otherwise a given error have to be raised. + The following estimators raised other, misleading error. They are updated: + :class:`BayesianRidge`, :class:`KernelRidge`, + :class:`Ridge`, :class:`RidgeClassifier`, + :class:`ElasticNetCV`, :class:`LassoCV`. + :pr:`21132` by :user:`AndrĂ¡s Simon `. + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 08c2a97d0d7b0..a5e862ca5a715 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -575,6 +575,13 @@ def _sk_visual_block_(self): final_estimator = self.final_estimator return super()._sk_visual_block_(final_estimator) + def _more_tags(self): + for est in self.estimators: + if not est[1]._get_tags()["allow_negative_sample_weight"]: + return {"allow_negative_sample_weight": False} + else: + return {"allow_negative_sample_weight": True} + class StackingRegressor(RegressorMixin, _BaseStacking): """Stack of estimators with a final regressor. @@ -778,3 +785,10 @@ def _sk_visual_block_(self): else: final_estimator = self.final_estimator return super()._sk_visual_block_(final_estimator) + + def _more_tags(self): + for est in self.estimators: + if not est[1]._get_tags()["allow_negative_sample_weight"]: + return {"allow_negative_sample_weight": False} + else: + return {"allow_negative_sample_weight": True} diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 99591cedead3d..20efbecc323fa 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -416,6 +416,13 @@ class labels predicted by each classifier. else: return self._predict(X) + def _more_tags(self): + for est in self.estimators: + if not est[1]._get_tags()["allow_negative_sample_weight"]: + return {"allow_negative_sample_weight": False} + else: + return {"allow_negative_sample_weight": True} + class VotingRegressor(RegressorMixin, _BaseVoting): """Prediction voting regressor for unfitted estimators. @@ -562,3 +569,10 @@ def transform(self, X): """ check_is_fitted(self) return self._predict(X) + + def _more_tags(self): + for est in self.estimators: + if not est[1]._get_tags()["allow_negative_sample_weight"]: + return {"allow_negative_sample_weight": False} + else: + return {"allow_negative_sample_weight": True} diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 8fd6cc0228137..21d2db67c4454 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -288,6 +288,9 @@ def feature_importances_(self): "feature_importances_ attribute" ) from e + def _more_tags(self): + return {"allow_negative_sample_weight": False} + def _samme_proba(estimator, n_classes, X): """Calculate algorithm 4, step 2, equation c) of Zhu et al [1]. diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 19d55a2f86516..5aecc1c428fcc 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -155,7 +155,10 @@ def _get_kernel(self, X, Y=None): return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params) def _more_tags(self): - return {"pairwise": self.kernel == "precomputed"} + return { + "pairwise": self.kernel == "precomputed", + "allow_negative_sample_weight": False, + } # TODO: Remove in 1.1 # mypy error: Decorated property not supported @@ -192,7 +195,9 @@ def fit(self, X, y, sample_weight=None): X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True ) if sample_weight is not None and not isinstance(sample_weight, float): - sample_weight = _check_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight( + sample_weight, X, only_non_negative=True + ) K = self._get_kernel(X) alpha = np.atleast_1d(self.alpha) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 8b5102ecdd403..af9b6746220b3 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -724,6 +724,9 @@ def rmatvec(b): self._set_intercept(X_offset, y_offset, X_scale) return self + def _more_tags(self): + return {"allow_negative_sample_weight": False} + def _check_precomputed_gram_matrix( X, precompute, X_offset, X_scale, rtol=1e-7, atol=1e-5 diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 9aeb97df1aaed..754f60335a2a1 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -240,7 +240,9 @@ def fit(self, X, y, sample_weight=None): X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True) if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, only_non_negative=True + ) X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data( X, @@ -424,6 +426,9 @@ def _log_marginal_likelihood( return score + def _more_tags(self): + return {"allow_negative_sample_weight": False} + ############################################################################### # ARD (Automatic Relevance Determination) regression diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 91b6b1e584469..77c65f72a2cce 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1583,7 +1583,9 @@ def fit(self, X, y, sample_weight=None): if sample_weight is not None: if sparse.issparse(X): raise ValueError("Sample weights do not (yet) support sparse matrices.") - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, only_non_negative=True + ) model = self._get_estimator() @@ -1731,7 +1733,8 @@ def _more_tags(self): "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), - } + }, + "allow_negative_sample_weight": False, } diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index c565c8c6ce403..2139fbc59144f 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -615,5 +615,8 @@ def _more_tags(self): "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), - } + }, + "allow_negative_sample_weight": self.base_estimator._get_tags()[ + "allow_negative_sample_weight" + ], } diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 1dcc81e3b988f..d81b28df1bedf 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -749,7 +749,9 @@ def fit(self, X, y, sample_weight=None): solver = self.solver if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, only_non_negative=True + ) # when X is sparse we only remove offset from y X, y, X_offset, y_offset, X_scale = self._preprocess_data( @@ -807,6 +809,9 @@ def fit(self, X, y, sample_weight=None): return self + def _more_tags(self): + return {"allow_negative_sample_weight": False} + class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): """Linear least squares with l2 regularization. @@ -1953,6 +1958,11 @@ def fit(self, X, y, sample_weight=None): cross-validation takes the sample weights into account when computing the validation score. """ + if sample_weight is not None: + sample_weight = _check_sample_weight( + sample_weight, X, only_non_negative=True + ) + cv = self.cv if cv is None: estimator = _RidgeGCV( @@ -2001,6 +2011,9 @@ def fit(self, X, y, sample_weight=None): return self + def _more_tags(self): + return {"allow_negative_sample_weight": False} + class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): """Ridge regression with built-in cross-validation. diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index ceb1df3420e38..f622b78b593de 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -238,7 +238,12 @@ def predict(self, X): return np.asarray(y).T def _more_tags(self): - return {"multioutput_only": True} + return { + "multioutput_only": True, + "allow_negative_sample_weight": self.estimator._get_tags()[ + "allow_negative_sample_weight" + ], + } class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 9ae25e611463f..d16d86332f134 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -327,5 +327,6 @@ def _more_tags(self): "check_sample_weights_invariance": ( "sample_weight must have positive values" ), - } + }, + "allow_negative_sample_weight": False, } diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 4f6818081c67d..e04ede1426b8d 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -37,6 +37,7 @@ from sklearn.pipeline import make_pipeline from sklearn.utils import IS_PYPY +from sklearn.utils.validation import has_fit_parameter from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags from sklearn.utils._testing import ( SkipTest, @@ -44,6 +45,7 @@ ) from sklearn.utils.estimator_checks import ( _construct_instance, + _create_data_for_fit, _set_checking_parameters, _get_check_estimator_ids, check_class_weight_balanced_linear_classifier, @@ -331,6 +333,30 @@ def test_check_n_features_in_after_fitting(estimator): check_n_features_in_after_fitting(estimator.__class__.__name__, estimator) +@pytest.mark.parametrize( + "estimator", _tested_estimators(), ids=_get_check_estimator_ids +) +def test_negative_sample_weight_support(estimator): + """ + In `_tested_estimators()`, where `fit` has `sample_weight` parameter + all (`1darray`, `2darray`) `X_types` tags are supported in + `_create_data_for_fit`. If new estimators will be added to this + list with other `X_types` tags, then the `_create_data_for_fit` + function have to be updated with the new cases. + """ + + if has_fit_parameter(estimator, "sample_weight"): + data = _create_data_for_fit(estimator=estimator, create_sample_weight=True) + # sample_weight was np.ones(30) + data["sample_weight"][0] = -1 + if not estimator._get_tags()["allow_negative_sample_weight"]: + err_msg = "Negative values in data passed to `sample_weight`" + with pytest.raises(ValueError, match=err_msg): + estimator.fit(**data) + else: + estimator.fit(**data) + + # NOTE: When running `check_dataframe_column_names_consistency` on a meta-estimator that # delegates validation to a base estimator, the check is testing that the base estimator # is checking for column name consistency. diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index a275c5dd1aa84..07b029f69455d 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -19,6 +19,7 @@ "preserves_dtype": [np.float64], "requires_y": False, "pairwise": False, + "allow_negative_sample_weight": True, } diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 37537bc1b0498..e7f38c60a09cb 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -45,6 +45,7 @@ from ..metrics import accuracy_score, adjusted_rand_score, f1_score from ..random_projection import BaseRandomProjection from ..feature_selection import SelectKBest +from ..multioutput import MultiOutputClassifier, MultiOutputRegressor from ..pipeline import make_pipeline from ..exceptions import DataConversionWarning from ..exceptions import NotFittedError @@ -398,6 +399,47 @@ def _construct_instance(Estimator): return estimator +def _create_data_for_fit(estimator, create_sample_weight=False): + """ + Supported `X_types` tags: `1darray`, `2darray` + + Parameters + ---------- + estimator : estimator instance generated by `_construct_instance`. + + create_sample_weight : bool, weather or not to add sample_weights + to the result dict + + Returns + ------- + data : dict, where estimator.fit(**data) can be used for testing + """ + instance_tags = estimator._get_tags() + + data = {} + if "2darray" in instance_tags["X_types"]: + data["X"] = np.ones((30, 2)) + elif "1darray" in instance_tags["X_types"]: + data["X"] = np.ones((30, 1)) + else: + raise ValueError("X_type not supported") + + if instance_tags["requires_y"]: + # 2 classes + if isinstance(estimator, MultiOutputClassifier): + # Y is capital only in this case + data["Y"] = np.concatenate((np.ones((15, 2)), 2 * np.ones((15, 2)))) + elif isinstance(estimator, MultiOutputRegressor): + data["y"] = np.concatenate((np.ones((15, 2)), 2 * np.ones((15, 2)))) + else: + data["y"] = np.concatenate((np.ones(15), 2 * np.ones(15))) + + if create_sample_weight: + data["sample_weight"] = np.ones(30) + + return data + + def _maybe_mark_xfail(estimator, check, pytest): # Mark (estimator, check) pairs as XFAIL if needed (see conditions in # _should_be_skipped_or_marked())