diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 935be46bba5af..1c6bd5ad5d107 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -371,6 +371,11 @@ Changelog then `get_output_feature_names` is not defined. :pr:`21569` by :user:`Aurélien Geron `. +- |Enhancement| Added support for `sample_weight` in :class:`preprocessing.KBinsDiscretizer`. + This allows specifying weights for each sample to be used while + fitting. The option is only available when `strategy` is set to `quantile`. + :pr:`22048` by :user:`Seladus `. + - |Fix| :class:`preprocessing.LabelBinarizer` now validates input parameters in `fit` instead of `__init__`. :pr:`21434` by :user:`Krum Arnaudov `. diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 5a382a8c93c47..ecae1cf9ee294 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -17,7 +17,9 @@ from ..utils.validation import check_is_fitted from ..utils.validation import check_random_state from ..utils.validation import _check_feature_names_in +from ..utils.validation import _check_sample_weight from ..utils.validation import check_scalar +from ..utils.stats import _weighted_percentile from ..utils import _safe_indexing @@ -171,7 +173,7 @@ def __init__( self.subsample = subsample self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X, y=None, sample_weight=None): """ Fit the estimator. @@ -184,6 +186,9 @@ def fit(self, X, y=None): Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`. + sample_weight : ndarray of shape (n_samples,), default=None + Contains weight values to be associated with each sample. 
+ Returns ------- self : object @@ -205,6 +210,13 @@ def fit(self, X, y=None): n_samples, n_features = X.shape + valid_strategy = ("uniform", "quantile", "kmeans") + if self.strategy not in valid_strategy: + raise ValueError( + f"Valid options for 'strategy' are {valid_strategy}. " + f"Got strategy={self.strategy!r} instead." + ) + if self.strategy == "quantile" and self.subsample is not None: if self.subsample == "warn": if n_samples > 2e5: @@ -225,6 +237,7 @@ def fit(self, X, y=None): n_samples, size=self.subsample, replace=False ) X = _safe_indexing(X, subsample_idx) + elif self.strategy != "quantile" and isinstance( self.subsample, numbers.Integral ): @@ -233,6 +246,12 @@ def fit(self, X, y=None): '`subsample` must be used with `strategy="quantile"`.' ) + elif self.strategy != "quantile" and sample_weight is not None: + raise ValueError( + "`sample_weight` was provided but it can only be used with " + f"strategy='quantile'. Got strategy={self.strategy!r} instead." + ) + valid_encode = ("onehot", "onehot-dense", "ordinal") if self.encode not in valid_encode: raise ValueError( "Valid options for 'encode' are {}. Got encode={!r} instead.".format( valid_encode, self.encode ) ) - valid_strategy = ("uniform", "quantile", "kmeans") - if self.strategy not in valid_strategy: - raise ValueError( - "Valid options for 'strategy' are {}. " - "Got strategy={!r} instead.".format(valid_strategy, self.strategy) - ) n_features = X.shape[1] n_bins = self._validate_n_bins(n_features) + if sample_weight is not None: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, copy=True + ) + bin_edges = np.zeros(n_features, dtype=object) for jj in range(n_features): column = X[:, jj] @@ -268,8 +286,16 @@ def fit(self, X, y=None): elif self.strategy == "quantile": quantiles = np.linspace(0, 100, n_bins[jj] + 1) - bin_edges[jj] = np.asarray(np.percentile(column, quantiles)) - + if sample_weight is None: + bin_edges[jj] = np.asarray(np.percentile(column, quantiles)) + else: + bin_edges[jj] = np.asarray( + [ + _weighted_percentile(column, sample_weight, q) + for q in quantiles + ], + dtype=np.float64, + ) elif self.strategy == "kmeans": from ..cluster import KMeans # fixes import loops diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index fa8240893f7c3..36c086f0efff3 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -16,16 +16,31 @@ @pytest.mark.parametrize( - "strategy, expected", + "strategy, expected, sample_weight", [ - ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]), - ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]), - ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]), + ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], None), + ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], None), + ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], None), + ( + "quantile", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + [1, 1, 2, 1], + ), + ( + "quantile", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + [1, 1, 1, 1], + ), + ( + "quantile", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], + [0, 1, 1, 1], + ), ], ) -def test_fit_transform(strategy, expected): +def test_fit_transform(strategy, expected, sample_weight): est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy) - est.fit(X) + est.fit(X, sample_weight=sample_weight) assert_array_equal(expected, est.transform(X)) @@ -53,6 +68,20 @@ def test_invalid_n_bins(): est.fit_transform(X) +def test_invalid_sample_weight(): + # sample_weight parameter is used with wrong strategy (other than quantile) + strategy = ["uniform", "kmeans"] + sample_weight = [1, 1, 1, 1] + for s in strategy: + est = KBinsDiscretizer(n_bins=3, strategy=s) + err_msg = ( + "`sample_weight` was provided but it can only be used with " + f"strategy='quantile'. Got strategy={s!r} instead." + ) + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X, sample_weight=sample_weight) + + def test_invalid_n_bins_array(): # Bad shape n_bins = np.full((2, 4), 2.0) @@ -92,17 +121,40 @@ def test_invalid_n_bins_array(): @pytest.mark.parametrize( - "strategy, expected", + "strategy, expected, sample_weight", [ - ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]), - ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]), - ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]), + ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], None), + ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], None), + ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], None), + ( + "quantile", + [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + [1, 1, 3, 1], + ), + ( + "quantile", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], + [0, 1, 3, 1], + ), + # ( + # "quantile", + # [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + # [1, 1, 1, 1], + # ), + # + # TODO: The test case above aims to check that passing an array of + # ones as the sample_weight parameter is equivalent to the case when + # sample_weight is None. 
+ # Unfortunately, the behavior of `_weighted_percentile` when + # `sample_weight = [1, 1, 1, 1]` is not equivalent to the unweighted case. + # This problem has been addressed in issue: + # https://github.com/scikit-learn/scikit-learn/issues/17370 ], ) -def test_fit_transform_n_bins_array(strategy, expected): +def test_fit_transform_n_bins_array(strategy, expected, sample_weight): est = KBinsDiscretizer( n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy - ).fit(X) + ).fit(X, sample_weight=sample_weight) assert_array_equal(expected, est.transform(X)) # test the shape of bin_edges_