ENH add support for sample_weight in KBinsDiscretizer(strategy="quantile") by Seladus · Pull Request #22048 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH add support for sample_weight in KBinsDiscretizer(strategy="quantile") #22048

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
3522d2d
Adding sample_weight parameter support in fit method of KBinsDiscretizer
Seladus Dec 21, 2021
4366bdc
Adding support for sample_weights in the case of an array-like n_bins
Seladus Dec 21, 2021
17e429b
Adding test cases for sample_weights parameter in fit method in KBins…
Seladus Dec 21, 2021
fe6076d
Black formatting and clarifications in documentation
Seladus Dec 21, 2021
e7d5003
Minor fix for PEP8 compatibility
Seladus Dec 21, 2021
85fe2b7
Adding fix to correct misunderstanding of the task and to make better…
Seladus Dec 23, 2021
83a0bba
Adding parameter copy=True in check for sample_weights + formatting w…
Seladus Dec 23, 2021
524cb73
Removing unused imports
Seladus Dec 23, 2021
dbe2eb0
Adding entry to the changelog
Seladus Dec 23, 2021
6bff3a3
Adding dtype in bin edges construction
Seladus Dec 23, 2021
ea5f446
Update sklearn/preprocessing/_discretization.py
Seladus Dec 23, 2021
e7e9eee
Application of suggestions : sooner check for valid strategy + interl…
Seladus Dec 23, 2021
e78c263
Moving subsample check for other strategy than quantile in its own if…
Seladus Dec 24, 2021
1a417f4
Change for linter
Seladus Dec 24, 2021
b5677e3
Merge branch 'main' into support_sample_weight_in_kbinsdiscretizer
Seladus Dec 31, 2021
b16321c
Update sklearn/preprocessing/_discretization.py
Seladus Jan 5, 2022
0d3919b
Update _discretization.py
Seladus Jan 5, 2022
30edabc
Adding TODO comment in tests and removing useless call to np.array
Seladus Jan 6, 2022
b600a0d
Changes for linter
Seladus Jan 6, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats_new/v1.1.rst
8000
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,11 @@ Changelog
then `get_output_feature_names` is not defined.
:pr:`21569` by :user:`Aurélien Geron <ageron>`.

- |Enhancement| Added support for `sample_weight` in :class:`preprocessing.KBinsDiscretizer`.
  This allows specifying a weight for each sample to be used while
  fitting. The option is only available when `strategy` is set to `quantile`.
  :pr:`22048` by :user:`Seladus <seladus>`.

- |Fix| :class:`preprocessing.LabelBinarizer` now validates input parameters in
`fit` instead of `__init__`.
:pr:`21434` by :user:`Krum Arnaudov <krumeto>`.
Expand Down
44 changes: 35 additions & 9 deletions sklearn/preprocessing/_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from ..utils.validation import check_is_fitted
from ..utils.validation import check_random_state
from ..utils.validation import _check_feature_names_in
from ..utils.validation import _check_sample_weight
from ..utils.validation import check_scalar
from ..utils.stats import _weighted_percentile
from ..utils import _safe_indexing


Expand Down Expand Up @@ -171,7 +173,7 @@ def __init__(
self.subsample = subsample
self.random_state = random_state

def fit(self, X, y=None):
def fit(self, X, y=None, sample_weight=None):
"""
Fit the estimator.

Expand All @@ -184,6 +186,9 @@ def fit(self, X, y=None):
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.

sample_weight : ndarray of shape (n_samples,)
Contains weight values to be associated with each sample.

Returns
-------
self : object
Expand All @@ -205,6 +210,13 @@ def fit(self, X, y=None):

n_samples, n_features = X.shape

valid_strategy = ("uniform", "quantile", "kmeans")
if self.strategy not in valid_strategy:
raise ValueError(
f"Valid options for 'strategy' are {valid_strategy}. "
f"Got strategy={self.strategy!r} instead."
)

if self.strategy == "quantile" and self.subsample is not None:
if self.subsample == "warn":
if n_samples > 2e5:
Expand All @@ -225,6 +237,7 @@ def fit(self, X, y=None):
n_samples, size=self.subsample, replace=False
)
X = _safe_indexing(X, subsample_idx)

elif self.strategy != "quantile" and isinstance(
self.subsample, numbers.Integral
):
Expand All @@ -233,23 +246,28 @@
'`subsample` must be used with `strategy="quantile"`.'
)

elif self.strategy != "quantile" and sample_weight is not None:
raise ValueError(
"`sample_weight` was provided but it can be only used with"
f"strategy='quantile'. Got strategy={self.strategy!r} instead."
)

valid_encode = ("onehot", "onehot-dense", "ordinal")
if self.encode not in valid_encode:
raise ValueError(
"Valid options for 'encode' are {}. Got encode={!r} instead.".format(
valid_encode, self.encode
)
)
valid_strategy = ("uniform", "quantile", "kmeans")
if self.strategy not in valid_strategy:
raise ValueError(
"Valid options for 'strategy' are {}. "
"Got strategy={!r} instead.".format(valid_strategy, self.strategy)
)

n_features = X.shape[1]
n_bins = self._validate_n_bins(n_features)

if sample_weight is not None:
sample_weight = _check_sample_weight(
sample_weight, X, dtype=X.dtype, copy=True
)

bin_edges = np.zeros(n_features, dtype=object)
for jj in range(n_features):
column = X[:, jj]
Expand All @@ -268,8 +286,16 @@ def fit(self, X, y=None):

elif self.strategy == "quantile":
quantiles = np.linspace(0, 100, n_bins[jj] + 1)
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))

if sample_weight is None:
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
else:
bin_edges[jj] = np.asarray(
[
_weighted_percentile(column, sample_weight, q)
for q in quantiles
],
dtype=np.float64,
)
elif self.strategy == "kmeans":
from ..cluster import KMeans # fixes import loops

Expand Down
76 changes: 64 additions & 12 deletions sklearn/preprocessing/tests/test_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,31 @@


@pytest.mark.parametrize(
"strategy, expected",
"strategy, expected, sample_weight",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test will need to be updated with the change that I asked earlier.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated the test cases that I previously pushed. Unfortunately, I'm doubting on their relevance. Would they be acceptable as is, or should I find more specific cases ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to have 2 specific checks that check the real behaviour of passing weights:

  • Check that passing None and an array of 1 will be equivalent.
  • Check that passing some zero weight would be equivalent to ignoring the sample and check the bins in these conditions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, I fail to understand why, the case where sample_weight=None for n_bins=[2, 3, 3, 3] and the case where sample_weight=[1, 1, 1, 1] for n_bins=[2, 3, 3, 3] are not equivalent (in test method test_fit_transform_n_bins_array).

The behaviors of np.percentile and _weighted_percentile with sample_weight = [1, 1, 1, 1] are supposed to be equivalent, aren't they ?

Copy link
Member
@ogrisel ogrisel Jan 5, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, I just checked:

>>> import numpy as np
>>> from sklearn.utils.stats import _weighted_percentile
>>> data = np.random.randn(100)
>>> np.percentile(data, [25, 50, 75])
array([-0.76701739,  0.0604021 ,  0.79485777])
>>> [_weighted_percentile(data, np.ones_like(data), p) for p in [25, 50, 75]]
[-0.7855144180141034, 0.05197426163774039, 0.7586492302901591]

It seems that this problem has been known for a while but not yet fixed:

Old attempts to fix this problem or related problems:

So for now, we can just comment out this case with a TODO comment with a link to #17370 for explain why this test is commented out.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I commented out the faulty test case in 30edabc (l. 139-151)

[
("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], None),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], None),
("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], None),
(
"quantile",
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
[1, 1, 2, 1],
),
(
"quantile",
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
[1, 1, 1, 1],
),
(
"quantile",
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
[0, 1, 1, 1],
),
],
)
def test_fit_transform(strategy, expected):
def test_fit_transform(strategy, expected, sample_weight):
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
est.fit(X)
est.fit(X, sample_weight=sample_weight)
assert_array_equal(expected, est.transform(X))


Expand Down Expand Up @@ -53,6 +68,20 @@ def test_invalid_n_bins():
est.fit_transform(X)


def test_invalid_sample_weight():
# sample_weight parameter is used with wrong strategy (other than quantile)
strategy = ["uniform", "kmeans"]
sample_weight = [1, 1, 1, 1]
for s in strategy:
est = KBinsDiscretizer(n_bins=3, strategy=s)
err_msg = (
"`sample_weight` was provided but it can be only used with"
f"strategy='quantile'. Got strategy={s!r} instead."
)
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X, sample_weight=sample_weight)


def test_invalid_n_bins_array():
# Bad shape
n_bins = np.full((2, 4), 2.0)
Expand Down Expand Up @@ -92,17 +121,40 @@ def test_invalid_n_bins_array():


@pytest.mark.parametrize(
"strategy, expected",
"strategy, expected, sample_weight",
[
("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], None),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], None),
("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], None),
(
"quantile",
[[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
[1, 1, 3, 1],
),
(
"quantile",
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
[0, 1, 3, 1],
),
# (
# "quantile",
# [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
# [1, 1, 1, 1],
# ),
#
# TODO: This test case above aims to test if the case where an array of
# ones passed in sample_weight parameter is equal to the case when
# sample_weight is None.
# Unfortunately, the behavior of `_weighted_percentile` when
# `sample_weight = [1, 1, 1, 1]` are currently not equivalent.
# This problem has been addressed in issue:
# https://github.com/scikit-learn/scikit-learn/issues/17370
],
)
def test_fit_transform_n_bins_array(strategy, expected):
def test_fit_transform_n_bins_array(strategy, expected, sample_weight):
est = KBinsDiscretizer(
n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
).fit(X)
).fit(X, sample_weight=sample_weight)
assert_array_equal(expected, est.transform(X))

# test the shape of bin_edges_
Expand Down
0