ENH add new common test: `test_negative_sample_weight_support` and `allow_negative_sample_weight` tag #21132


16 changes: 16 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -76,6 +76,22 @@ Changelog
Setting a transformer to "passthrough" will pass the features unchanged.
:pr:`20860` by :user:`Shubhraneel Pal <shubhraneel>`.

:mod:`sklearn.tests`
.......................

- |Enhancement| Added a common test, :func:`test_negative_sample_weight_support`,
  which fits every estimator in `_tested_estimators()` whose `fit` accepts a
  `sample_weight` parameter using negative values in `sample_weight`. Added a
  new estimator tag for the test, `"allow_negative_sample_weight"` (default
  `True`). If an estimator supports negative sample weights, fitting with
  negative values in `sample_weight` must not raise an error; otherwise a
  specific `ValueError` has to be raised. The following estimators previously
  raised other, misleading errors and have been updated:
  :class:`BayesianRidge`, :class:`KernelRidge`,
  :class:`Ridge`, :class:`RidgeClassifier`,
  :class:`ElasticNetCV`, :class:`LassoCV`.
  :pr:`21132` by :user:`András Simon <simonandras>`.
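To make the new contract concrete, here is a minimal, hypothetical sketch (not part of this diff; the class name is invented) of how an estimator opts out of the default tag and produces the expected error:

```python
from sklearn.base import BaseEstimator
from sklearn.utils.validation import _check_sample_weight


class NonNegativeWeightEstimator(BaseEstimator):
    """Hypothetical estimator that rejects negative sample weights."""

    def fit(self, X, y=None, sample_weight=None):
        if sample_weight is not None:
            # Raises ValueError("Negative values in data passed to
            # `sample_weight`") when any weight is negative.
            sample_weight = _check_sample_weight(
                sample_weight, X, only_non_negative=True
            )
        return self

    def _more_tags(self):
        # Opt out of the default (True) declared in _DEFAULT_TAGS.
        return {"allow_negative_sample_weight": False}
```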

Code and Documentation Contributors
-----------------------------------

14 changes: 14 additions & 0 deletions sklearn/ensemble/_stacking.py
@@ -575,6 +575,13 @@ def _sk_visual_block_(self):
final_estimator = self.final_estimator
return super()._sk_visual_block_(final_estimator)

    def _more_tags(self):
        # Negative weights are allowed only if every base estimator allows them.
        for est in self.estimators:
            if not est[1]._get_tags()["allow_negative_sample_weight"]:
                return {"allow_negative_sample_weight": False}
        return {"allow_negative_sample_weight": True}


class StackingRegressor(RegressorMixin, _BaseStacking):
"""Stack of estimators with a final regressor.
@@ -778,3 +785,10 @@ def _sk_visual_block_(self):
else:
final_estimator = self.final_estimator
return super()._sk_visual_block_(final_estimator)

    def _more_tags(self):
        # Negative weights are allowed only if every base estimator allows them.
        for est in self.estimators:
            if not est[1]._get_tags()["allow_negative_sample_weight"]:
                return {"allow_negative_sample_weight": False}
        return {"allow_negative_sample_weight": True}
14 changes: 14 additions & 0 deletions sklearn/ensemble/_voting.py
@@ -416,6 +416,13 @@ class labels predicted by each classifier.
else:
return self._predict(X)

    def _more_tags(self):
        # Negative weights are allowed only if every base estimator allows them.
        for est in self.estimators:
            if not est[1]._get_tags()["allow_negative_sample_weight"]:
                return {"allow_negative_sample_weight": False}
        return {"allow_negative_sample_weight": True}


class VotingRegressor(RegressorMixin, _BaseVoting):
"""Prediction voting regressor for unfitted estimators.
@@ -562,3 +569,10 @@ def transform(self, X):
"""
check_is_fitted(self)
return self._predict(X)

    def _more_tags(self):
        # Negative weights are allowed only if every base estimator allows them.
        for est in self.estimators:
            if not est[1]._get_tags()["allow_negative_sample_weight"]:
                return {"allow_negative_sample_weight": False}
        return {"allow_negative_sample_weight": True}
3 changes: 3 additions & 0 deletions sklearn/ensemble/_weight_boosting.py
@@ -288,6 +288,9 @@ def feature_importances_(self):
"feature_importances_ attribute"
) from e

def _more_tags(self):
return {"allow_negative_sample_weight": False}


def _samme_proba(estimator, n_classes, X):
"""Calculate algorithm 4, step 2, equation c) of Zhu et al [1].
9 changes: 7 additions & 2 deletions sklearn/kernel_ridge.py
@@ -155,7 +155,10 @@ def _get_kernel(self, X, Y=None):
return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params)

    def _more_tags(self):
-        return {"pairwise": self.kernel == "precomputed"}
+        return {
+            "pairwise": self.kernel == "precomputed",
+            "allow_negative_sample_weight": False,
+        }

# TODO: Remove in 1.1
# mypy error: Decorated property not supported
@@ -192,7 +195,9 @@ def fit(self, X, y, sample_weight=None):
X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True
)
        if sample_weight is not None and not isinstance(sample_weight, float):
-            sample_weight = _check_sample_weight(sample_weight, X)
+            sample_weight = _check_sample_weight(
+                sample_weight, X, only_non_negative=True
+            )

K = self._get_kernel(X)
alpha = np.atleast_1d(self.alpha)
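A quick check of the new behavior (a sketch, assuming this branch): `KernelRidge` now fails fast with a consistent message instead of a misleading downstream solver error.

```python
import numpy as np

from sklearn.kernel_ridge import KernelRidge

X = np.arange(10.0).reshape(-1, 1)
y = np.arange(10.0)
sample_weight = np.ones(10)
sample_weight[0] = -1.0  # a single negative weight

try:
    KernelRidge().fit(X, y, sample_weight=sample_weight)
except ValueError as exc:
    print(exc)  # Negative values in data passed to `sample_weight`
```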
3 changes: 3 additions & 0 deletions sklearn/linear_model/_base.py
@@ -724,6 +724,9 @@ def rmatvec(b):
self._set_intercept(X_offset, y_offset, X_scale)
return self

def _more_tags(self):
return {"allow_negative_sample_weight": False}


def _check_precomputed_gram_matrix(
X, precompute, X_offset, X_scale, rtol=1e-7, atol=1e-5
7 changes: 6 additions & 1 deletion sklearn/linear_model/_bayes.py
@@ -240,7 +240,9 @@ def fit(self, X, y, sample_weight=None):
X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True)

        if sample_weight is not None:
-            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
+            sample_weight = _check_sample_weight(
+                sample_weight, X, dtype=X.dtype, only_non_negative=True
+            )

X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
X,
@@ -424,6 +426,9 @@ def _log_marginal_likelihood(

return score

def _more_tags(self):
return {"allow_negative_sample_weight": False}


###############################################################################
# ARD (Automatic Relevance Determination) regression
7 changes: 5 additions & 2 deletions sklearn/linear_model/_coordinate_descent.py
@@ -1583,7 +1583,9 @@ def fit(self, X, y, sample_weight=None):
        if sample_weight is not None:
            if sparse.issparse(X):
                raise ValueError("Sample weights do not (yet) support sparse matrices.")
-            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
+            sample_weight = _check_sample_weight(
+                sample_weight, X, dtype=X.dtype, only_non_negative=True
+            )

model = self._get_estimator()

@@ -1731,7 +1733,8 @@ def _more_tags(self):
"check_sample_weights_invariance": (
"zero sample_weight is not equivalent to removing samples"
),
}
},
"allow_negative_sample_weight": False,
}


5 changes: 4 additions & 1 deletion sklearn/linear_model/_ransac.py
@@ -615,5 +615,8 @@ def _more_tags(self):
"check_sample_weights_invariance": (
"zero sample_weight is not equivalent to removing samples"
),
}
},
"allow_negative_sample_weight": self.base_estimator._get_tags()[
"allow_negative_sample_weight"
],
}
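Here the tag simply mirrors the base estimator (a sketch, assuming this branch, where `LinearRegression` gets the tag set to `False` via `_base.py` above); `MultiOutputClassifier`/`MultiOutputRegressor` below delegate the same way through `self.estimator`:

```python
from sklearn.linear_model import LinearRegression, RANSACRegressor

ransac = RANSACRegressor(base_estimator=LinearRegression())
# Mirrors LinearRegression's tag on this branch.
print(ransac._get_tags()["allow_negative_sample_weight"])  # False
```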
15 changes: 14 additions & 1 deletion sklearn/linear_model/_ridge.py
@@ -749,7 +749,9 @@ def fit(self, X, y, sample_weight=None):
solver = self.solver

        if sample_weight is not None:
-            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
+            sample_weight = _check_sample_weight(
+                sample_weight, X, dtype=X.dtype, only_non_negative=True
+            )

# when X is sparse we only remove offset from y
X, y, X_offset, y_offset, X_scale = self._preprocess_data(
@@ -807,6 +809,9 @@ def fit(self, X, y, sample_weight=None):

return self

def _more_tags(self):
return {"allow_negative_sample_weight": False}


class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge):
"""Linear least squares with l2 regularization.
@@ -1953,6 +1958,11 @@ def fit(self, X, y, sample_weight=None):
cross-validation takes the sample weights into account when computing
the validation score.
"""
if sample_weight is not None:
sample_weight = _check_sample_weight(
sample_weight, X, only_non_negative=True
)

cv = self.cv
if cv is None:
estimator = _RidgeGCV(
@@ -2001,6 +2011,9 @@

return self

def _more_tags(self):
return {"allow_negative_sample_weight": False}


class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV):
"""Ridge regression with built-in cross-validation.
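With these changes (a sketch, assuming this branch), `Ridge`, `RidgeClassifier`, and the CV variants all reject negative weights up front:

```python
import numpy as np

from sklearn.linear_model import Ridge

X = np.ones((4, 2))
y = np.array([1.0, 1.0, 2.0, 2.0])
sample_weight = np.array([1.0, -1.0, 1.0, 1.0])

try:
    Ridge().fit(X, y, sample_weight=sample_weight)
except ValueError as exc:
    print(exc)  # Negative values in data passed to `sample_weight`
```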
7 changes: 6 additions & 1 deletion sklearn/multioutput.py
@@ -238,7 +238,12 @@ def predict(self, X):
return np.asarray(y).T

    def _more_tags(self):
-        return {"multioutput_only": True}
+        return {
+            "multioutput_only": True,
+            "allow_negative_sample_weight": self.estimator._get_tags()[
+                "allow_negative_sample_weight"
+            ],
+        }


class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator):
3 changes: 2 additions & 1 deletion sklearn/neighbors/_kde.py
@@ -327,5 +327,6 @@ def _more_tags(self):
"check_sample_weights_invariance": (
"sample_weight must have positive values"
),
}
},
"allow_negative_sample_weight": False,
}
26 changes: 26 additions & 0 deletions sklearn/tests/test_common.py
@@ -37,13 +37,15 @@
from sklearn.pipeline import make_pipeline

from sklearn.utils import IS_PYPY
from sklearn.utils.validation import has_fit_parameter
from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags
from sklearn.utils._testing import (
SkipTest,
set_random_state,
)
from sklearn.utils.estimator_checks import (
_construct_instance,
_create_data_for_fit,
_set_checking_parameters,
_get_check_estimator_ids,
check_class_weight_balanced_linear_classifier,
@@ -331,6 +333,30 @@ def test_check_n_features_in_after_fitting(estimator):
check_n_features_in_after_fitting(estimator.__class__.__name__, estimator)


@pytest.mark.parametrize(
"estimator", _tested_estimators(), ids=_get_check_estimator_ids
)
def test_negative_sample_weight_support(estimator):
"""
In `_tested_estimators()`, where `fit` has `sample_weight` parameter
all (`1darray`, `2darray`) `X_types` tags are supported in
`_create_data_for_fit`. If new estimators will be added to this
list with other `X_types` tags, then the `_create_data_for_fit`
function have to be updated with the new cases.
"""

if has_fit_parameter(estimator, "sample_weight"):
data = _create_data_for_fit(estimator=estimator, create_sample_weight=True)
# sample_weight was np.ones(30)
data["sample_weight"][0] = -1
if not estimator._get_tags()["allow_negative_sample_weight"]:
err_msg = "Negative values in data passed to `sample_weight`"
with pytest.raises(ValueError, match=err_msg):
estimator.fit(**data)
else:
estimator.fit(**data)
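For reference, `has_fit_parameter` is an existing public helper that gates the check above; a quick illustration:

```python
from sklearn.svm import SVC
from sklearn.utils.validation import has_fit_parameter

print(has_fit_parameter(SVC(), "sample_weight"))  # True
```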


# NOTE: When running `check_dataframe_column_names_consistency` on a meta-estimator that
# delegates validation to a base estimator, the check is testing that the base estimator
# is checking for column name consistency.
1 change: 1 addition & 0 deletions sklearn/utils/_tags.py
@@ -19,6 +19,7 @@
"preserves_dtype": [np.float64],
"requires_y": False,
"pairwise": False,
"allow_negative_sample_weight": True,
}
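Estimators that do not override `_more_tags` pick the default up from here, e.g. (a sketch, assuming this branch):

```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._tags import _safe_tags

# DecisionTreeClassifier does not override the tag, so the default
# (True) from _DEFAULT_TAGS applies.
print(_safe_tags(DecisionTreeClassifier(), key="allow_negative_sample_weight"))
```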


42 changes: 42 additions & 0 deletions sklearn/utils/estimator_checks.py
@@ -45,6 +45,7 @@
from ..metrics import accuracy_score, adjusted_rand_score, f1_score
from ..random_projection import BaseRandomProjection
from ..feature_selection import SelectKBest
from ..multioutput import MultiOutputClassifier, MultiOutputRegressor
from ..pipeline import make_pipeline
from ..exceptions import DataConversionWarning
from ..exceptions import NotFittedError
@@ -398,6 +399,47 @@ def _construct_instance(Estimator):
return estimator


def _create_data_for_fit(estimator, create_sample_weight=False):
    """
    Supported `X_types` tags: `1darray`, `2darray`.

    Parameters
    ----------
    estimator : estimator instance generated by `_construct_instance`.

    create_sample_weight : bool, whether or not to add a `sample_weight`
        entry to the result dict.

    Returns
    -------
    data : dict, such that `estimator.fit(**data)` can be used for testing.
    """
instance_tags = estimator._get_tags()

data = {}
if "2darray" in instance_tags["X_types"]:
data["X"] = np.ones((30, 2))
elif "1darray" in instance_tags["X_types"]:
data["X"] = np.ones((30, 1))
else:
raise ValueError("X_type not supported")

if instance_tags["requires_y"]:
        # two classes
        if isinstance(estimator, MultiOutputClassifier):
            # MultiOutputClassifier.fit names its target argument `Y` (capitalized)
data["Y"] = np.concatenate((np.ones((15, 2)), 2 * np.ones((15, 2))))
elif isinstance(estimator, MultiOutputRegressor):
data["y"] = np.concatenate((np.ones((15, 2)), 2 * np.ones((15, 2))))
else:
data["y"] = np.concatenate((np.ones(15), 2 * np.ones(15)))

if create_sample_weight:
data["sample_weight"] = np.ones(30)

return data
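Example use of the helper (a sketch, assuming this branch):

```python
from sklearn.linear_model import Ridge
from sklearn.utils.estimator_checks import _create_data_for_fit

data = _create_data_for_fit(Ridge(), create_sample_weight=True)
print(sorted(data))  # ['X', 'sample_weight', 'y']
Ridge().fit(**data)  # the dict splats straight into fit
```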


def _maybe_mark_xfail(estimator, check, pytest):
# Mark (estimator, check) pairs as XFAIL if needed (see conditions in
# _should_be_skipped_or_marked())