ENH add support for sample_weight in KBinsDiscretizer(strategy="quantile") by Seladus · Pull Request #22048 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH add support for sample_weight in KBinsDiscretizer(strategy="quantile") #22048

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
3522d2d
Adding sample_weight parameter support in fit method of KBinsDiscretizer
Seladus Dec 21, 2021
4366bdc
Adding support for sample_weights in the case of an array-like n_bins
Seladus Dec 21, 2021
17e429b
Adding test cases for sample_weights parameter in fit method in KBins…
Seladus Dec 21, 2021
fe6076d
Black formatting and clarifications in documentation
Seladus Dec 21, 2021
e7d5003
Minor fix for PEP8 compatibility
Seladus Dec 21, 2021
85fe2b7
Adding fix to correct misunderstanding of the task and to make better…
Seladus Dec 23, 2021
83a0bba
Adding parameter copy=True in check for sample_weights + formatting w…
Seladus Dec 23, 2021
524cb73
Removing unused imports
Seladus Dec 23, 2021
dbe2eb0
Adding entry to the changelog
Seladus Dec 23, 2021
6bff3a3
Adding dtype in bin edges construction
Seladus Dec 23, 2021
ea5f446
Update sklearn/preprocessing/_discretization.py
Seladus Dec 23, 2021
e7e9eee
Application of suggestions : sooner check for valid strategy + interl…
Seladus Dec 23, 2021
e78c263
Moving subsample check for other strategy than quantile in its own if…
Seladus Dec 24, 2021
1a417f4
Change for linter
Seladus Dec 24, 2021
b5677e3
Merge branch 'main' into support_sample_weight_in_kbinsdiscretizer
Seladus Dec 31, 2021
b16321c
Update sklearn/preprocessing/_discretization.py
Seladus Jan 5, 2022
0d3919b
Update _discretization.py
Seladus Jan 5, 2022
30edabc
Adding TODO comment in tests and removing useless call to np.array
Seladus Jan 6, 2022
b600a0d
Changes for linter
Seladus Jan 6, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats_new/v1.1.rst
8000
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,11 @@ Changelog
then `get_output_feature_names` is not defined.
:pr:`21569` by :user:`Aurélien Geron <ageron>`.

- |Enhancement| Added support for `sample_weight` in :class:`preprocessing.KBinsDiscretizer`.
  This allows specifying a weight for each sample to be used while
  fitting. The option is only available when `strategy` is set to `quantile`.
  :pr:`22048` by :user:`Seladus <seladus>`.

- |Fix| :class:`preprocessing.LabelBinarizer` now validates input parameters in
`fit` instead of `__init__`.
:pr:`21434` by :user:`Krum Arnaudov <krumeto>`.
Expand Down
44 changes: 35 additions & 9 deletions sklearn/preprocessing/_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from ..utils.validation import check_is_fitted
from ..utils.validation import check_random_state
from ..utils.validation import _check_feature_names_in
from ..utils.validation import _check_sample_weight
from ..utils.validation import check_scalar
from ..utils.stats import _weighted_percentile
from ..utils import _safe_indexing


Expand Down Expand Up @@ -171,7 +173,7 @@ def __init__(
self.subsample = subsample
self.random_state = random_state

def fit(self, X, y=None):
def fit(self, X, y=None, sample_weight=None):
"""
Fit the estimator.

Expand All @@ -184,6 +186,9 @@ def fit(self, X, y=None):
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.

sample_weight : ndarray of shape (n_samples,)
Contains weight values to be associated with each sample.

Returns
-------
self : object
Expand All @@ -205,6 +210,13 @@ def fit(self, X, y=None):

n_samples, n_features = X.shape

valid_strategy = ("uniform", "quantile", "kmeans")
if self.strategy not in valid_strategy:
raise ValueError(
f"Valid options for 'strategy' are {valid_strategy}. "
f"Got strategy={self.strategy!r} instead."
)

if self.strategy == "quantile" and self.subsample is not None:
if self.subsample == "warn":
if n_samples > 2e5:
Expand All @@ -225,6 +237,7 @@ def fit(self, X, y=None):
n_samples, size=self.subsample, replace=False
)
X = _safe_indexing(X, subsample_idx)

elif self.strategy != "quantile" and isinstance(
self.subsample, numbers.Integral
):
Expand All @@ -233,23 +246,28 @@
'`subsample` must be used with `strategy="quantile"`.'
)

elif self.strategy != "quantile" and sample_weight is not None:
raise ValueError(
"`sample_weight` was provided but it can be only used with"
f"strategy='quantile'. Got strategy={self.strategy!r} instead."
)

valid_encode = ("onehot", "onehot-dense", "ordinal")
if self.encode not in valid_encode:
raise ValueError(
"Valid options for 'encode' are {}. Got encode={!r} instead.".format(
valid_encode, self.encode
)
)
valid_strategy = ("uniform", "quantile", "kmeans")
if self.strategy not in valid_strategy:
raise ValueError(
"Valid options for 'strategy' are {}. "
"Got strategy={!r} instead.".format(valid_strategy, self.strategy)
)

n_features = X.shape[1]
n_bins = self._validate_n_bins(n_features)

if sample_weight is not None:
sample_weight = _check_sample_weight(
sample_weight, X, dtype=X.dtype, copy=True
)

bin_edges = np.zeros(n_features, dtype=object)
for jj in range(n_features):
column = X[:, jj]
Expand All @@ -268,8 +286,16 @@ def fit(self, X, y=None):

elif self.strategy == "quantile":
quantiles = np.linspace(0, 100, n_bins[jj] + 1)
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))

if sample_weight is None:
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
else:
bin_edges[jj] = np.asarray(
[
_weighted_percentile(column, sample_weight, q)
for q in quantiles
],
dtype=np.float64,
)
elif self.strategy == "kmeans":
from ..cluster import KMeans # fixes import loops

Expand Down
76 changes: 64 additions & 12 deletions sklearn/preprocessing/tests/test_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,31 @@


@pytest.mark.parametrize(
"strategy, expected",
"strategy, expected, sample_weight",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test will need to be updated with the change that I asked earlier.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated the test cases that I previously pushed. Unfortunately, I'm doubting on their relevance. Would they be acceptable as is, or should I find more specific cases ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to have 2 specific checks that check the real behaviour of passing weights:

  • Check that passing None and an array of 1 will be equivalent.
  • Check that passing some zero weight would be equivalent to ignoring the sample and check the bins in these conditions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, I fail to understand why, the case where sample_weight=None for n_bins=[2, 3, 3, 3] and the case where sample_weight=[1, 1, 1, 1] for n_bins=[2, 3, 3, 3] are not equivalent (in test method test_fit_transform_n_bins_array).

The behaviors of np.percentile and _weighted_percentile with sample_weight = [1, 1, 1, 1] are supposed to be equivalent, aren't they ?

Copy link
Member
@ogrisel ogrisel Jan 5, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, I just checked:

>>> import numpy as np
>>> from sklearn.utils.stats import _weighted_percentile
>>> data = np.random.randn(100)
>>> np.percentile(data, [25, 50, 75])
array([-0.76701739,  0.0604021 ,  0.79485777])
>>> [_weighted_percentile(data, np.ones_like(data), p) for p in [25, 50, 75]]
[-0.7855144180141034, 0.05197426163774039, 0.7586492302901591]

It seems that this problem has been known for a while but not yet fixed:

Old attempts to fix this problem or related problems:

So for now, we can just comment out this case with a TODO comment with a link to #17370 for explain why this test is commented out.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I commented out the faulty test case in 30edabc (l. 139-151)

[
("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], None),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], None),
("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], None),
(
"quantile",
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
[1, 1, 2, 1],
),
(
"quantile",
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
[1, 1, 1, 1],
),
(
"quantile",
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
[0, 1, 1, 1],
),
],
)
def test_fit_transform(strategy, expected):
def test_fit_transform(strategy, expected, sample_weight):
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
est.fit(X)
est.fit(X, sample_weight=sample_weight)
assert_array_equal(expected, est.transform(X))


Expand Down Expand Up @@ -53,6 +68,20 @@ def test_invalid_n_bins():
est.fit_transform(X)


def test_invalid_sample_weight():
# sample_weight parameter is used with wrong strategy (other than quantile)
strategy = ["uniform", "kmeans"]
sample_weight = [1, 1, 1, 1]
for s in strategy:
est = KBinsDiscretizer(n_bins=3, strategy=s)
err_msg = (
"`sample_weight` was provided but it can be only used with"
f"strategy='quantile'. Got strategy={s!r} instead."
)
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X, sample_weight=sample_weight)


def test_invalid_n_bins_array():
# Bad shape
n_bins = np.full((2, 4), 2.0)
Expand Down Expand Up @@ -92,17 +121,40 @@ def test_invalid_n_bins_array():


@pytest.mark.parametrize(
"strategy, expected",
"strategy, expected, sample_weight",
[
("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], None),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], None),
("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], None),
(
"quantile",
[[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
[1, 1, 3, 1],
),
(
"quantile",
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
[0, 1, 3, 1],
),
# (
# "quantile",
# [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
# [1, 1, 1, 1],
# ),
#
# TODO: This test case above aims to test if the case where an array of
# ones passed in sample_weight parameter is equal to the case when
# sample_weight is None.
# Unfortunately, the behavior of `_weighted_percentile` when
# `sample_weight = [1, 1, 1, 1]` are currently not equivalent.
# This problem has been addressed in issue:
# https://github.com/scikit-learn/scikit-learn/issues/17370
],
)
def test_fit_transform_n_bins_array(strategy, expected):
def test_fit_transform_n_bins_array(strategy, expected, sample_weight):
est = KBinsDiscretizer(
n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
).fit(X)
).fit(X, sample_weight=sample_weight)
assert_array_equal(expected, est.transform(X))

# test the shape of bin_edges_
Expand Down
0