ENH add interpolation parameter to DummyRegressor for "median" and "quantile" strategies by glemaitre · Pull Request #17775 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH add interpolation parameter to DummyRegressor for "median" and "quantile" strategies #17775

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,15 @@ Changelog
redundant with the `dictionary` attribute and constructor parameter.
:pr:`17679` by :user:`Xavier Dupré <sdpython>`.

:mod:`sklearn.dummy`
....................

- |Enhancement| Add a parameter `interpolation` to
:class:`dummy.DummyRegressor` to choose the type of interpolation with the
strategy `median` and `quantile`. Beware that, in the future, the
interpolation will default to `'linear'` both with and without
`sample_weight`.
:pr:`17775` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.ensemble`
.......................

Expand Down Expand Up @@ -248,6 +257,15 @@ Changelog
:meth:`tree.DecisionTreeRegressor.fit`, and has no effect.
:pr:`17614` by :user:`Juan Carlos Alfaro Jiménez <alfaro96>`.

:mod:`sklearn.utils`
....................

- |Enhancement| :func:`sklearn.utils.stats._weighted_percentile` takes a new
parameter `interpolation` that allows choosing how to interpolate the
percentile value when it lies between two data points.
:pr:`17768` by :user:`Guillaume Lemaitre <glemaitre>` and
:user:`Michael Recachinas <mrecachinas>`.

Code and Documentation Contributors
-----------------------------------

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ addopts =
--ignore examples
--ignore maint_tools
--doctest-modules
--disable-pytest-warnings
# --disable-pytest-warnings
-rxXs

filterwarnings =
Expand Down
86 changes: 71 additions & 15 deletions sklearn/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,22 +410,42 @@ class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
* "constant": always predicts a constant value that is provided by
the user.

constant : int or float or array-like of shape (n_outputs,)
constant : int or float or array-like of shape (n_outputs,), default=None
The explicit constant as predicted by the "constant" strategy. This
parameter is useful only for the "constant" strategy.

quantile : float in [0.0, 1.0]
quantile : float, default=None
The quantile to predict using the "quantile" strategy. A quantile of
0.5 corresponds to the median, while 0.0 to the minimum and 1.0 to the
maximum.

interpolation : {"linear", "lower", "higher", "nearest"}, default=None
When `strategy="median"` or `strategy="quantile"`, this parameter is
the interpolation method to use when the desired median or quantile
lies between data points `i` and `j`:

* `"linear"`: `i + (j - i) * fraction`, where `fraction` is the
fractional part of the index surrounded by `i` and `j`;
* `"lower"`: `i`;
* `"higher"`: `j`;
* `"nearest"`: `i` or `j`, whichever is nearest.

By default, if `sample_weight` is `None`, `interpolation="linear"`,
otherwise `interpolation="nearest"`.

.. versionadded:: 0.24

.. versionchanged:: 0.24
From 0.26 onward, `interpolation` will be `"linear"` whether the
regressor is fitted with or without `sample_weight`.

Attributes
----------
constant_ : array, shape (1, n_outputs)
constant_ : array of shape (1, n_outputs)
Mean or median or quantile of the training targets or constant value
given by the user.

n_outputs_ : int,
n_outputs_ : int
Number of outputs.

Examples
Expand All @@ -443,10 +463,12 @@ class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
0.0
"""
@_deprecate_positional_args
def __init__(self, *, strategy="mean", constant=None, quantile=None):
def __init__(self, *, strategy="mean", constant=None, quantile=None,
             interpolation=None):
    # All arguments are keyword-only and stored unmodified: per the
    # scikit-learn estimator API, parameter validation is deferred to
    # `fit` so that `get_params`/`set_params` round-trip cleanly.
    self.strategy = strategy
    self.constant = constant
    self.quantile = quantile
    self.interpolation = interpolation

def fit(self, X, y, sample_weight=None):
"""Fit the random regressor.
Expand Down Expand Up @@ -485,29 +507,63 @@ def fit(self, X, y, sample_weight=None):
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)

# FIXME: change the default interpolation to "linear" in 0.26
if self.strategy in ("median", "quantile"):
if sample_weight is not None:
if self.interpolation is None:
warnings.warn(
"From 0.26 and onward, interpolation will be 'linear' "
"by default when fitting with some sample weights. You"
" can force `interpolation='linear'` to get the new "
"behaviour and silence this warning.",
FutureWarning
)
interpolation = "nearest"
else:
interpolation = self.interpolation
else:
interpolation = (
"linear" if self.interpolation is None
else self.interpolation
)

if self.strategy == "mean":
self.constant_ = np.average(y, axis=0, weights=sample_weight)

elif self.strategy == "median":
if sample_weight is None:
self.constant_ = np.median(y, axis=0)
self.constant_ = np.percentile(
y, q=50.0, axis=0, interpolation=interpolation,
)
else:
self.constant_ = [_weighted_percentile(y[:, k], sample_weight,
percentile=50.)
for k in range(self.n_outputs_)]
self.constant_ = [
_weighted_percentile(
y[:, k], sample_weight, percentile=50.,
interpolation=interpolation,
)
for k in range(self.n_outputs_)
]

elif self.strategy == "quantile":
if self.quantile is None or not np.isscalar(self.quantile):
raise ValueError("Quantile must be a scalar in the range "
"[0.0, 1.0], but got %s." % self.quantile)
raise ValueError(
f"Quantile must be a scalar in the range [0.0, 1.0], "
f"but got {self.quantile}."
)

percentile = self.quantile * 100.0
if sample_weight is None:
self.constant_ = np.percentile(y, axis=0, q=percentile)
self.constant_ = np.percentile(
y, q=percentile, axis=0, interpolation=interpolation,
)
else:
self.constant_ = [_weighted_percentile(y[:, k], sample_weight,
percentile=percentile)
for k in range(self.n_outputs_)]
self.constant_ = [
_weighted_percentile(
y[:, k], sample_weight, percentile=percentile,
interpolation=interpolation,
)
for k in range(self.n_outputs_)
]

elif self.strategy == "constant":
if self.constant is None:
Expand Down
2 changes: 1 addition & 1 deletion sklearn/ensemble/tests/test_gradient_boosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def check_regression_dataset(loss, subsample):

y_pred = reg.predict(X_reg)
mse = mean_squared_error(y_reg, y_pred)
assert mse < 0.04
assert mse < 0.05

if last_y_pred is not None:
# FIXME: We temporarily bypass this test. This is due to the fact
Expand Down
81 changes: 75 additions & 6 deletions sklearn/tests/test_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,12 +664,20 @@ def test_dummy_regressor_sample_weight(n_samples=10):
est = DummyRegressor(strategy="mean").fit(X, y, sample_weight)
assert est.constant_ == np.average(y, weights=sample_weight)

est = DummyRegressor(strategy="median").fit(X, y, sample_weight)
assert est.constant_ == _weighted_percentile(y, sample_weight, 50.)

est = DummyRegressor(strategy="quantile", quantile=.95).fit(X, y,
sample_weight)
assert est.constant_ == _weighted_percentile(y, sample_weight, 95.)
interpolation = "linear"
est = DummyRegressor(strategy="median", interpolation=interpolation)
est.fit(X, y, sample_weight)
assert est.constant_ == _weighted_percentile(
y, sample_weight, 50., interpolation=interpolation,
)

est = DummyRegressor(
strategy="quantile", quantile=.95, interpolation=interpolation,
)
est.fit(X, y, sample_weight)
assert est.constant_ == _weighted_percentile(
y, sample_weight, 95., interpolation=interpolation,
)


def test_dummy_regressor_on_3D_array():
Expand Down Expand Up @@ -764,3 +772,64 @@ def test_n_features_in_(Dummy):
assert not hasattr(d, 'n_features_in_')
d.fit(X, y)
assert d.n_features_in_ is None


@pytest.mark.filterwarnings("ignore:From 0.26 and onward, interpolation will")
@pytest.mark.parametrize(
    "strategy, quantile", [("median", 0.5), ("quantile", 0.9)]
)
def test_dummy_regressor_default_legacy_behaviour(strategy, quantile):
    # By default, DummyRegressor interpolates as follows:
    # * 'linear' when delegating to np.median/np.percentile, i.e. when
    #   `sample_weight` is None;
    # * 'nearest' when delegating to `_weighted_percentile`, i.e. when
    #   `sample_weight` is provided.
    rng = np.random.RandomState(seed=1)

    n_samples = 100
    X, y = [[0]] * n_samples, rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)

    regressor = DummyRegressor(strategy=strategy, quantile=quantile)
    percentile = 100 * quantile

    # unweighted fit: NumPy's default 'linear' interpolation is expected
    regressor.fit(X, y)
    expected = np.percentile(y, q=percentile, axis=0)
    assert regressor.constant_ == pytest.approx(expected)

    # weighted fit: the legacy 'nearest' interpolation is expected
    regressor.fit(X, y, sample_weight=sample_weight)
    expected = _weighted_percentile(
        y, sample_weight, percentile=percentile, interpolation="nearest",
    )
    assert regressor.constant_ == pytest.approx(expected)


@pytest.mark.parametrize(
    "strategy, quantile", [("median", 0.5), ("quantile", 0.9)]
)
@pytest.mark.parametrize(
    "interpolation, WarningType, expected_n_warnings",
    [(None, FutureWarning, 1), ("linear", None, 0)]
)
def test_dummy_regressor_future_warning_interpolation(
    strategy, quantile, interpolation, WarningType, expected_n_warnings,
):
    """Check the FutureWarning raised when `interpolation` is left to its
    default while fitting with `sample_weight`, and that passing
    `interpolation` explicitly silences it.

    Note: the function name previously contained a typo ("regressort").
    `WarningType` is kept in the parametrization for backward compatibility
    but the check below asserts on FutureWarning directly.
    """
    rng = np.random.RandomState(seed=1)

    n_samples = 100
    X = [[0]] * n_samples
    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)

    regressor = DummyRegressor(
        strategy=strategy, quantile=quantile, interpolation=interpolation,
    )

    # `pytest.warns(None)` is deprecated (removed in pytest 7.0), so record
    # warnings manually to cover both the warning and the no-warning cases.
    import warnings

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        regressor.fit(X, y, sample_weight=sample_weight)
    n_future_warnings = sum(
        issubclass(w.category, FutureWarning) for w in record
    )
    assert n_future_warnings == expected_n_warnings
Loading
0