ENH add interpolation parameter to DummyRegressor for "median" and "quantile" strategies by glemaitre · Pull Request #17775 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH add interpolation parameter to DummyRegressor for "median" and "quantile" strategies #17775

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,15 @@ Changelog
redundant with the `dictionary` attribute and constructor parameter.
:pr:`17679` by :user:`Xavier Dupré <sdpython>`.

:mod:`sklearn.dummy`
....................

- |Enhancement| Add a parameter `interpolation` to
:class:`dummy.DummyRegressor` to choose the type of interpolation with the
strategy `median` and `quantile`. Beware that, in the future, the
interpolation will default to `'linear'` both with and without
`sample_weight`.
:pr:`17775` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.ensemble`
.......................

Expand Down Expand Up @@ -248,6 +257,15 @@ Changelog
:meth:`tree.DecisionTreeRegressor.fit`, and has no effect.
:pr:`17614` by :user:`Juan Carlos Alfaro Jiménez <alfaro96>`.

:mod:`sklearn.utils`
....................

- |Enhancement| :func:`sklearn.utils.stats._weighted_percentile` takes a new
parameter `interpolation` that allows choosing how to interpolate the
percentile value when it lies between two data points.
:pr:`17768` by :user:`Guillaume Lemaitre <glemaitre>` and
:user:`Michael Recachinas <mrecachinas>`.

Code and Documentation Contributors
-----------------------------------

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ addopts =
--ignore examples
--ignore maint_tools
--doctest-modules
--disable-pytest-warnings
# --disable-pytest-warnings
-rxXs

filterwarnings =
Expand Down
86 changes: 71 additions & 15 deletions sklearn/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,22 +410,42 @@ class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
* "constant": always predicts a constant value that is provided by
the user.

constant : int or float or array-like of shape (n_outputs,)
constant : int or float or array-like of shape (n_outputs,), default=None
The explicit constant as predicted by the "constant" strategy. This
parameter is useful only for the "constant" strategy.

quantile : float in [0.0, 1.0]
quantile : float, default=None
The quantile to predict using the "quantile" strategy. A quantile of
0.5 corresponds to the median, while 0.0 to the minimum and 1.0 to the
maximum.

interpolation : {"linear", "lower", "higher", "nearest"}, default=None
When `strategy="median"` or `strategy="quantile"`, this parameter is
the interpolation method to use when the desired median or quantile
lies between data points `i` and `j`:

* `"linear"`: `i + (j - i) * fraction`, where `fraction` is the
fractional part of the index surrounded by `i` and `j`;
* `"lower"`: `i`;
* `"higher"`: `j`;
* `"nearest"`: `i` or `j`, whichever is nearest.

By default, if `sample_weight` is `None`, `interpolation="linear"`,
otherwise `interpolation="nearest"`.

.. versionadded:: 0.24

.. versionchanged:: 0.24
From 0.26 onward, `interpolation` will be `"linear"` whether the
regressor is fitted with or without `sample_weight`.

Attributes
----------
constant_ : array, shape (1, n_outputs)
constant_ : array of shape (1, n_outputs)
Mean or median or quantile of the training targets or constant value
given by the user.

n_outputs_ : int,
n_outputs_ : int
Number of outputs.

Examples
Expand All @@ -443,10 +463,12 @@ class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
0.0
"""
@_deprecate_positional_args
def __init__(self, *, strategy="mean", constant=None, quantile=None):
def __init__(self, *, strategy="mean", constant=None, quantile=None,
             interpolation=None):
    # All arguments are keyword-only and stored unmodified: per the
    # scikit-learn estimator API, parameter validation is deferred to
    # `fit` so that `get_params`/`set_params` round-trip cleanly.
    self.strategy = strategy
    self.constant = constant
    self.quantile = quantile
    self.interpolation = interpolation

def fit(self, X, y, sample_weight=None):
"""Fit the random regressor.
Expand Down Expand Up @@ -485,29 +507,63 @@ def fit(self, X, y, sample_weight=None):
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)

# FIXME: change the default interpolation to "linear" in 0.26
if self.strategy in ("median", "quantile"):
if sample_weight is not None:
if self.interpolation is None:
warnings.warn(
"From 0.26 and onward, interpolation will be 'linear' "
"by default when fitting with some sample weights. You"
" can force `interpolation='linear'` to get the new "
"behaviour and silence this warning.",
FutureWarning
)
interpolation = "nearest"
else:
interpolation = self.interpolation
else:
interpolation = (
"linear" if self.interpolation is None
else self.interpolation
)

if self.strategy == "mean":
self.constant_ = np.average(y, axis=0, weights=sample_weight)

elif self.strategy == "median":
if sample_weight is None:
self.constant_ = np.median(y, axis=0)
self.constant_ = np.percentile(
y, q=50.0, axis=0, interpolation=interpolation,
)
else:
self.constant_ = [_weighted_percentile(y[:, k], sample_weight,
percentile=50.)
for k in range(self.n_outputs_)]
self.constant_ = [
_weighted_percentile(
y[:, k], sample_weight, percentile=50.,
interpolation=interpolation,
)
for k in range(self.n_outputs_)
]

elif self.strategy == "quantile":
if self.quantile is None or not np.isscalar(self.quantile):
raise ValueError("Quantile must be a scalar in the range "
"[0.0, 1.0], but got %s." % self.quantile)
raise ValueError(
f"Quantile must be a scalar in the range [0.0, 1.0], "
f"but got {self.quantile}."
)

percentile = self.quantile * 100.0
if sample_weight is None:
self.constant_ = np.percentile(y, axis=0, q=percentile)
self.constant_ = np.percentile(
y, q=percentile, axis=0, interpolation=interpolation,
)
else:
self.constant_ = [_weighted_percentile(y[:, k], sample_weight,
percentile=percentile)
for k in range(self.n_outputs_)]
self.constant_ = [
_weighted_percentile(
y[:, k], sample_weight, percentile=percentile,
interpolation=interpolation,
)
for k in range(self.n_outputs_)
]

elif self.strategy == "constant":
if self.constant is None:
Expand Down
2 changes: 1 addition & 1 deletion sklearn/ensemble/tests/test_gradient_boosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def check_regression_dataset(loss, subsample):

y_pred = reg.predict(X_reg)
mse = mean_squared_error(y_reg, y_pred)
assert mse < 0.04
assert mse < 0.05

if last_y_pred is not None:
# FIXME: We temporarily bypass this test. This is due to the fact
Expand Down
81 changes: 75 additions & 6 deletions sklearn/tests/test_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,12 +664,20 @@ def test_dummy_regressor_sample_weight(n_samples=10):
est = DummyRegressor(strategy="mean").fit(X, y, sample_weight)
assert est.constant_ == np.average(y, weights=sample_weight)

est = DummyRegressor(strategy="median").fit(X, y, sample_weight)
assert est.constant_ == _weighted_percentile(y, sample_weight, 50.)

est = DummyRegressor(strategy="quantile", quantile=.95).fit(X, y,
sample_weight)
assert est.constant_ == _weighted_percentile(y, sample_weight, 95.)
interpolation = "linear"
est = DummyRegressor(strategy="median", interpolation=interpolation)
est.fit(X, y, sample_weight)
assert est.constant_ == _weighted_percentile(
y, sample_weight, 50., interpolation=interpolation,
)

est = DummyRegressor(
strategy="quantile", quantile=.95, interpolation=interpolation,
)
est.fit(X, y, sample_weight)
assert est.constant_ == _weighted_percentile(
y, sample_weight, 95., interpolation=interpolation,
)


def test_dummy_regressor_on_3D_array():
Expand Down Expand Up @@ -764,3 +772,64 @@ def test_n_features_in_(Dummy):
assert not hasattr(d, 'n_features_in_')
d.fit(X, y)
assert d.n_features_in_ is None


@pytest.mark.filterwarnings("ignore:From 0.26 and onward, interpolation will")
@pytest.mark.parametrize(
    "strategy, quantile", [("median", 0.5), ("quantile", 0.9)]
)
def test_dummy_regressor_default_legacy_behaviour(strategy, quantile):
    # By default, DummyRegressor interpolates as follows:
    # * 'linear' when delegating to np.median/np.percentile, i.e. when
    #   `sample_weight` is None;
    # * 'nearest' when delegating to `_weighted_percentile`, i.e. when
    #   `sample_weight` is provided.
    rng = np.random.RandomState(seed=1)

    n_samples = 100
    X, y = [[0]] * n_samples, rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)

    regressor = DummyRegressor(strategy=strategy, quantile=quantile)
    percentile = 100 * quantile

    # unweighted fit: NumPy's default 'linear' interpolation is expected
    regressor.fit(X, y)
    expected = np.percentile(y, q=percentile, axis=0)
    assert regressor.constant_ == pytest.approx(expected)

    # weighted fit: the legacy 'nearest' interpolation is expected
    regressor.fit(X, y, sample_weight=sample_weight)
    expected = _weighted_percentile(
        y, sample_weight, percentile=percentile, interpolation="nearest",
    )
    assert regressor.constant_ == pytest.approx(expected)


@pytest.mark.parametrize(
    "strategy, quantile", [("median", 0.5), ("quantile", 0.9)]
)
@pytest.mark.parametrize(
    "interpolation, WarningType, expected_n_warnings",
    [(None, FutureWarning, 1), ("linear", None, 0)]
)
def test_dummy_regressor_future_warning_interpolation(
    strategy, quantile, interpolation, WarningType, expected_n_warnings,
):
    """Check the FutureWarning raised when `interpolation` is left to its
    default while fitting with `sample_weight`, and that passing
    `interpolation` explicitly silences it.

    Note: the function name previously contained a typo ("regressort").
    `WarningType` is kept in the parametrization for backward compatibility
    but the check below asserts on FutureWarning directly.
    """
    rng = np.random.RandomState(seed=1)

    n_samples = 100
    X = [[0]] * n_samples
    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)

    regressor = DummyRegressor(
        strategy=strategy, quantile=quantile, interpolation=interpolation,
    )

    # `pytest.warns(None)` is deprecated (removed in pytest 7.0), so record
    # warnings manually to cover both the warning and the no-warning cases.
    import warnings

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        regressor.fit(X, y, sample_weight=sample_weight)
    n_future_warnings = sum(
        issubclass(w.category, FutureWarning) for w in record
    )
    assert n_future_warnings == expected_n_warnings
Loading
0