8000 ENH add parameter subsample to KBinsDiscretizer (#21445) · thomasjpfan/scikit-learn@48e5423 · GitHub
[go: up one dir, main page]

Skip to content

Commit 48e5423

Browse files
amy12xx, fbidu, thomasjpfan, glemaitre
authored
ENH add parameter subsample to KBinsDiscretizer (scikit-learn#21445)
Co-authored-by: Felipe Bidu Rodrigues <felipe@felipevr.com>
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent d5ce9c4 commit 48e5423

File tree

3 files changed

+153
-5
lines changed

3 files changed

+153
-5
lines changed

doc/whats_new/v1.1.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,11 @@ Changelog
138138
:mod:`sklearn.preprocessing`
139139
............................
140140

141+
- |Enhancement| Adds a `subsample` parameter to :class:`preprocessing.KBinsDiscretizer`.
142+
This allows specifying a maximum number of samples to be used while fitting
143+
the model. The option is only available when `strategy` is set to `quantile`.
144+
:pr:`21445` by :user:`Felipe Bidu <fbidu>` and :user:`Amanda Dsouza <amy12xx>`.
145+
141146
- |Fix| :class:`preprocessing.LabelBinarizer` now validates input parameters in `fit`
142147
instead of `__init__`.
143148
:pr:`21434` by :user:`Krum Arnaudov <krumeto>`.

sklearn/preprocessing/_discretization.py

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
from ..base import BaseEstimator, TransformerMixin
1616
from ..utils.validation import check_array
1717
from ..utils.validation import check_is_fitted
18+
from ..utils.validation import check_random_state
1819
from ..utils.validation import _check_feature_names_in
20+
from ..utils.validation import check_scalar
21+
from ..utils import _safe_indexing
1922

2023

2124
class KBinsDiscretizer(TransformerMixin, BaseEstimator):
@@ -63,6 +66,27 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
6366
6467
.. versionadded:: 0.24
6568
69+
subsample : int or None (default='warn')
70+
Maximum number of samples, used to fit the model, for computational
71+
efficiency. Used when `strategy="quantile"`.
72+
`subsample=None` means that all the training samples are used when
73+
computing the quantiles that determine the binning thresholds.
74+
Since quantile computation relies on sorting each column of `X` and
75+
that sorting has an `n log(n)` time complexity,
76+
it is recommended to use subsampling on datasets with a
77+
very large number of samples.
78+
79+
.. deprecated:: 1.1
80+
In version 1.3 and onwards, `subsample=2e5` will be the default.
81+
82+
random_state : int, RandomState instance or None, default=None
83+
Determines random number generation for subsampling.
84+
Pass an int for reproducible results across multiple function calls.
85+
See the `subsample` parameter for more details.
86+
See :term:`Glossary <random_state>`.
87+
88+
.. versionadded:: 1.1
89+
6690
Attributes
6791
----------
6892
bin_edges_ : ndarray of ndarray of shape (n_features,)
@@ -136,11 +160,22 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
136160
[ 0.5, 3.5, -1.5, 1.5]])
137161
"""
138162

139-
def __init__(self, n_bins=5, *, encode="onehot", strategy="quantile", dtype=None):
163+
def __init__(
164+
self,
165+
n_bins=5,
166+
*,
167+
encode="onehot",
168+
strategy="quantile",
169+
dtype=None,
170+
subsample="warn",
171+
random_state=None,
172+
):
140173
self.n_bins = n_bins
141174
self.encode = encode
142175
self.strategy = strategy
143176
self.dtype = dtype
177+
self.subsample = subsample
178+
self.random_state = random_state
144179

145180
def fit(self, X, y=None):
146181
"""
@@ -174,6 +209,36 @@ def fit(self, X, y=None):
174209
" instead."
175210
)
176211

212+
n_samples, n_features = X.shape
213+
214+
if self.strategy == "quantile" and self.subsample is not None:
215+
if self.subsample == "warn":
216+
if n_samples > 2e5:
217+
warnings.warn(
218+
"In version 1.3 onwards, subsample=2e5 "
219+
"will be used by default. Set subsample explicitly to "
220+
"silence this warning in the mean time. Set "
221+
"subsample=None to disable subsampling explicitly.",
222+
FutureWarning,
223+
)
224+
else:
225+
self.subsample = check_scalar(
226+
self.subsample, "subsample", numbers.Integral, min_val=1
227+
)
228+
rng = check_random_state(self.random_state)
229+
if n_samples > self.subsample:
230+
subsample_idx = rng.choice(
231+
n_samples, size=self.subsample, replace=False
232+
)
233+
X = _safe_indexing(X, subsample_idx)
234+
elif self.strategy != "quantile" and isinstance(
235+
self.subsample, numbers.Integral
236+
):
237+
raise ValueError(
238+
f"Invalid parameter for `strategy`: {self.strategy}. "
239+
'`subsample` must be used with `strategy="quantile"`.'
240+
)
241+
177242
valid_encode = ("onehot", "onehot-dense", "ordinal")
178243
if self.encode not in valid_encode:
179244
raise ValueError(

sklearn/preprocessing/tests/test_discretization.py

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import scipy.sparse as sp
44
import warnings
55

6+
from sklearn import clone
67
from sklearn.preprocessing import KBinsDiscretizer
78
from sklearn.preprocessing import OneHotEncoder
89
from sklearn.utils._testing import (
@@ -37,16 +38,16 @@ def test_valid_n_bins():
3738
def test_invalid_n_bins():
3839
est = KBinsDiscretizer(n_bins=1)
3940
err_msg = (
40-
"KBinsDiscretizer received an invalid "
41-
"number of bins. Received 1, expected at least 2."
41+
"KBinsDiscretizer received an invalid number of bins. Received 1, expected at"
42+
" least 2."
4243
)
4344
with pytest.raises(ValueError, match=err_msg):
4445
est.fit_transform(X)
4546

4647
est = KBinsDiscretizer(n_bins=1.1)
4748
err_msg = (
48-
"KBinsDiscretizer received an invalid "
49-
"n_bins type. Received float, expected int."
49+
"KBinsDiscretizer received an invalid n_bins type. Received float, expected"
50+
" int."
5051
)
5152
with pytest.raises(ValueError, match=err_msg):
5253
est.fit_transform(X)
@@ -357,3 +358,80 @@ def test_32_equal_64(input_dtype, encode):
357358
Xt_64 = kbd_64.transform(X_input)
358359

359360
assert_allclose_dense_sparse(Xt_32, Xt_64)
361+
362+
363+
# FIXME: remove the `filterwarnings` in 1.3
364+
@pytest.mark.filterwarnings("ignore:In version 1.3 onwards, subsample=2e5")
365+
@pytest.mark.parametrize("subsample", [None, "warn"])
366+
def test_kbinsdiscretizer_subsample_default(subsample):
367+
# Since the size of X is small (< 2e5), subsampling will not take place.
368+
X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
369+
kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
370+
kbd_default.fit(X)
371+
372+
kbd_with_subsampling = clone(kbd_default)
373+
kbd_with_subsampling.set_params(subsample=subsample)
374+
kbd_with_subsampling.fit(X)
375+
376+
for bin_kbd_default, bin_kbd_with_subsampling in zip(
377+
kbd_default.bin_edges_[0], kbd_with_subsampling.bin_edges_[0]
378+
):
379+
np.testing.assert_allclose(bin_kbd_default, bin_kbd_with_subsampling)
380+
assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape
381+
382+
383+
def test_kbinsdiscretizer_subsample_invalid_strategy():
384+
X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
385+
kbd = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="uniform", subsample=3)
386+
387+
err_msg = '`subsample` must be used with `strategy="quantile"`.'
388+
with pytest.raises(ValueError, match=err_msg):
389+
kbd.fit(X)
390+
391+
392+
def test_kbinsdiscretizer_subsample_invalid_type():
393+
X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
394+
kbd = KBinsDiscretizer(
395+
n_bins=10, encode="ordinal", strategy="quantile", subsample="full"
396+
)
397+
398+
msg = (
399+
"subsample must be an instance of <class 'numbers.Integral'>, not "
400+
"<class 'str'>."
401+
)
402+
with pytest.raises(TypeError, match=msg):
403+
kbd.fit(X)
404+
405+
406+
# TODO: Remove in 1.3
407+
def test_kbinsdiscretizer_subsample_warn():
408+
X = np.random.rand(200001, 1).reshape(-1, 1)
409+
kbd = KBinsDiscretizer(n_bins=100, encode="ordinal", strategy="quantile")
410+
411+
msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
412+
with pytest.warns(FutureWarning, match=msg):
413+
kbd.fit(X)
414+
415+
416+
@pytest.mark.parametrize("subsample", [0, int(2e5)])
417+
def test_kbinsdiscretizer_subsample_values(subsample):
418+
X = np.random.rand(220000, 1).reshape(-1, 1)
419+
kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
420+
421+
kbd_with_subsampling = clone(kbd_default)
422+
kbd_with_subsampling.set_params(subsample=subsample)
423+
424+
if subsample == 0:
425+
with pytest.raises(ValueError, match="subsample == 0, must be >= 1."):
426+
kbd_with_subsampling.fit(X)
427+
else:
428+
# TODO: Remove in 1.3
429+
msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
430+
with pytest.warns(FutureWarning, match=msg):
431+
kbd_default.fit(X)
432+
433+
kbd_with_subsampling.fit(X)
434+
assert not np.all(
435+
kbd_default.bin_edges_[0] == kbd_with_subsampling.bin_edges_[0]
436+
)
437+
assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape

0 commit comments

Comments (0)