8000 API Deprecates values in partial_dependence in favor of pdp_values by thomasjpfan · Pull Request #21809 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

API Deprecates values in partial_dependence in favor of pdp_values #21809

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Feb 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/whats_new/v1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,14 @@ Changelog
- |Enhancement| Added the parameter `fill_value` to :class:`impute.IterativeImputer`.
:pr:`25232` by :user:`Thijs van Weezel <ValueInvestorThijs>`.

:mod:`sklearn.inspection`
.........................

- |API| :func:`inspection.partial_dependence` returns a :class:`utils.Bunch` with
new key: `pdp_values`. The `values` key is deprecated in favor of `pdp_values`
and will be removed in 1.5.
:pr:`21809` by `Thomas Fan`_.

:mod:`sklearn.linear_model`
...........................

Expand Down
40 changes: 29 additions & 11 deletions sklearn/inspection/_partial_dependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,16 +365,26 @@ def partial_dependence(
Only available when ``kind='both'``.

values : seq of 1d ndarrays
The values with which the grid has been created.

.. deprecated:: 1.3
The key `values` has been deprecated in 1.3 and will be removed
in 1.5 in favor of `pdp_values`. See `pdp_values` for details
about the `values` attribute.

pdp_values : seq of 1d ndarrays
The values with which the grid has been created. The generated
grid is a cartesian product of the arrays in ``values``.
``len(values) == len(features)``. The size of each array
``values[j]`` is either ``grid_resolution``, or the number of
grid is a cartesian product of the arrays in ``pdp_values``.
``len(pdp_values) == len(features)``. The size of each array
``pdp_values[j]`` is either ``grid_resolution``, or the number of
unique values in ``X[:, j]``, whichever is smaller.

.. versionadded:: 1.3

``n_outputs`` corresponds to the number of classes in a multi-class
setting, or to the number of tasks for multi-output regression.
For classical regression and binary classification ``n_outputs==1``.
``n_values_feature_j`` corresponds to the size ``values[j]``.
``n_values_feature_j`` corresponds to the size ``pdp_values[j]``.

See Also
--------
Expand Down Expand Up @@ -547,14 +557,22 @@ def partial_dependence(
averaged_predictions = averaged_predictions.reshape(
-1, *[val.shape[0] for val in values]
)
pdp_results = Bunch()

msg = (
"Key: 'values', is deprecated in 1.3 and will be removed in 1.5. "
"Please use 'pdp_values' instead."
)
pdp_results._set_deprecated(
values, new_key="pdp_values", deprecated_key="values", warning_message=msg
)

if kind == "average":
return Bunch(average=averaged_predictions, values=values)
pdp_results["average"] = averaged_predictions
elif kind == "individual":
return Bunch(individual=predictions, values=values)
pdp_results["individual"] = predictions
else: # kind='both'
return Bunch(
average=averaged_predictions,
individual=predictions,
values=values,
)
pdp_results["average"] = averaged_predictions
pdp_results["individual"] = predictions

return pdp_results
6 changes: 3 additions & 3 deletions sklearn/inspection/_plot/partial_dependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -1256,7 +1256,7 @@ def plot(
else:
pd_results_ = []
for kind_plot, pd_result in zip(kind, self.pd_results):
current_results = {"values": pd_result["values"]}
current_results = {"pdp_values": pd_result["pdp_values"]}

if kind_plot in ("individual", "both"):
preds = pd_result.individual
Expand All @@ -1274,7 +1274,7 @@ def plot(
# get global min and max average predictions of PD grouped by plot type
pdp_lim = {}
for kind_plot, pdp in zip(kind, pd_results_):
values = pdp["values"]
values = pdp["pdp_values"]
preds = pdp.average if kind_plot == "average" else pdp.individual
min_pd = preds[self.target_idx].min()
max_pd = preds[self.target_idx].max()
Expand Down Expand Up @@ -1402,7 +1402,7 @@ def plot(
):
avg_preds = None
preds = None
feature_values = pd_result["values"]
feature_values = pd_result["pdp_values"]
if kind_plot == "individual":
preds = pd_result.individual
elif kind_plot == "average":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_plot_partial_dependence(grid_resolution, pyplot, clf_diabetes, diabetes
target_idx = disp.target_idx

line_data = line.get_data()
assert_allclose(line_data[0], avg_preds["values"][0])
assert_allclose(line_data[0], avg_preds["pdp_values"][0])
assert_allclose(line_data[1], avg_preds.average[target_idx].ravel())

# two feature position
Expand Down Expand Up @@ -243,7 +243,7 @@ def test_plot_partial_dependence_str_features(
assert line.get_alpha() == 0.8

line_data = line.get_data()
assert_allclose(line_data[0], avg_preds["values"][0])
assert_allclose(line_data[0], avg_preds["pdp_values"][0])
assert_allclose(line_data[1], avg_preds.average[target_idx].ravel())

# contour
Expand Down Expand Up @@ -279,7 +279,7 @@ def test_plot_partial_dependence_custom_axes(pyplot, clf_diabetes, diabetes):
target_idx = disp.target_idx

line_data = line.get_data()
assert_allclose(line_data[0], avg_preds["values"][0])
assert_allclose(line_data[0], avg_preds["pdp_values"][0])
assert_allclose(line_data[1], avg_preds.average[target_idx].ravel())

# contour
Expand Down Expand Up @@ -466,7 +466,7 @@ def test_plot_partial_dependence_multiclass(pyplot):
disp_target_0.pd_results, disp_symbol.pd_results
):
assert_allclose(int_result.average, symbol_result.average)
assert_allclose(int_result["values"], symbol_result["values"])
assert_allclose(int_result["pdp_values"], symbol_result["pdp_values"])

# check that the pd plots are different for another target
disp_target_1 = PartialDependenceDisplay.from_estimator(
Expand Down
47 changes: 38 additions & 9 deletions sklearn/inspection/tests/test_partial_dependence.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Testing for the partial dependence module.
"""
import warnings

import numpy as np
import pytest
Expand Down Expand Up @@ -108,7 +109,7 @@ def test_output_shape(Estimator, method, data, grid_resolution, features, kind):
kind=kind,
grid_resolution=grid_resolution,
)
pdp, axes = result, result["values"]
pdp, axes = result, result["pdp_values"]

expected_pdp_shape = (n_targets, *[grid_resolution for _ in range(len(features))])
expected_ice_shape = (
Expand Down Expand Up @@ -434,7 +435,7 @@ def test_partial_dependence_easy_target(est, power):
est, features=[target_variable], X=X, grid_resolution=1000, kind="average"
)

new_X = pdp["values"][0].reshape(-1, 1)
new_X = pdp["pdp_values"][0].reshape(-1, 1)
new_y = pdp["average"][0]
# add polynomial features if needed
new_X = PolynomialFeatures(degree=power).fit_transform(new_X)
Expand Down Expand Up @@ -654,7 +655,7 @@ def test_partial_dependence_sample_weight():

pdp = partial_dependence(clf, X, features=[1], kind="average")

assert np.corrcoef(pdp["average"], pdp["values"])[0, 1] > 0.99
assert np.corrcoef(pdp["average"], pdp["pdp_values"])[0, 1] > 0.99


def test_hist_gbdt_sw_not_supported():
Expand Down Expand Up @@ -692,8 +693,8 @@ def test_partial_dependence_pipeline():
)
assert_allclose(pdp_pipe["average"], pdp_clf["average"])
assert_allclose(
pdp_pipe["values"][0],
pdp_clf["values"][0] * scaler.scale_[features] + scaler.mean_[features],
pdp_pipe["pdp_values"][0],
pdp_clf["pdp_values"][0] * scaler.scale_[features] + scaler.mean_[features],
)


Expand Down Expand Up @@ -761,11 +762,11 @@ def test_partial_dependence_dataframe(estimator, preprocessor, features):
if preprocessor is not None:
scaler = preprocessor.named_transformers_["standardscaler"]
assert_allclose(
pdp_pipe["values"][1],
pdp_clf["values"][1] * scaler.scale_[1] + scaler.mean_[1],
pdp_pipe["pdp_values"][1],
pdp_clf["pdp_values"][1] * scaler.scale_[1] + scaler.mean_[1],
)
else:
assert_allclose(pdp_pipe["values"][1], pdp_clf["values"][1])
assert_allclose(pdp_pipe["pdp_values"][1], pdp_clf["pdp_values"][1])


@pytest.mark.parametrize(
Expand Down Expand Up @@ -796,7 +797,7 @@ def test_partial_dependence_feature_type(features, expected_pd_shape):
pipe, df, features=features, grid_resolution=10, kind="average"
)
assert pdp_pipe["average"].shape == expected_pd_shape
assert len(pdp_pipe["values"]) == len(pdp_pipe["average"].shape) - 1
assert len(pdp_pipe["pdp_values"]) == len(pdp_pipe["average"].shape) - 1


@pytest.mark.parametrize(
Expand Down Expand Up @@ -836,3 +837,31 @@ def test_kind_average_and_average_of_individual(Estimator, data):
pdp_ind = partial_dependence(est, X=X, features=[1, 2], kind="individual")
avg_ind = np.mean(pdp_ind["individual"], axis=1)
assert_allclose(avg_ind, pdp_avg["average"])


# TODO(1.5): Remove once the deprecated `values` bunch key is dropped in 1.5
def test_partial_dependence_bunch_values_deprecated():
    """Accessing the deprecated `values` key warns; `pdp_values` does not."""

    clf = LogisticRegression()
    (X, y), _ = binary_classification_data
    clf.fit(X, y)

    bunch = partial_dependence(clf, X=X, features=[1, 2], kind="average")

    expected_msg = (
        "Key: 'values', is deprecated in 1.3 and will be "
        "removed in 1.5. Please use 'pdp_values' instead"
    )

    # The new key must not trigger any FutureWarning.
    with warnings.catch_warnings():
        warnings.simplefilter("error", FutureWarning)
        new_key_result = bunch["pdp_values"]

    # The old key warns, but still resolves to the very same object.
    with pytest.warns(FutureWarning, match=expected_msg):
        old_key_result = bunch["values"]

    assert old_key_result is new_key_result
19 changes: 19 additions & 0 deletions sklearn/utils/_bunch.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import warnings


class Bunch(dict):
"""Container object exposing keys as attributes.

Expand All @@ -24,6 +27,22 @@ class Bunch(dict):
def __init__(self, **kwargs):
    super().__init__(kwargs)

    # Map from deprecated key to the warning message emitted when that key
    # is accessed. Stored directly in __dict__ so it does not show up as a
    # Bunch key/attribute itself.
    self.__dict__["_deprecated_key_to_warnings"] = {}

def __getitem__(self, key):
    """Return ``self[key]``, first warning if *key* is deprecated.

    Keys registered through `_set_deprecated` emit their stored
    FutureWarning message before the lookup is performed.
    """
    # `.get` (rather than direct indexing) guards against instances that
    # were pickled before `_deprecated_key_to_warnings` existed in
    # `__dict__` and therefore lack the attribute after unpickling.
    if key in self.__dict__.get("_deprecated_key_to_warnings", {}):
        warnings.warn(
            self._deprecated_key_to_warnings[key],
            FutureWarning,
        )
    return super().__getitem__(key)

def _set_deprecated(self, value, *, new_key, deprecated_key, warning_message):
    """Store *value* under both keys, registering *deprecated_key*.

    Looking up *deprecated_key* afterwards emits *warning_message* as a
    FutureWarning; looking up *new_key* stays silent.
    """
    self.__dict__["_deprecated_key_to_warnings"][deprecated_key] = warning_message
    self[new_key] = value
    self[deprecated_key] = value

def __setattr__(self, key, value):
    # Attribute assignment writes through to the underlying dict:
    # ``bunch.x = 1`` is equivalent to ``bunch["x"] = 1``.
    self[key] = value

Expand Down
32 changes: 32 additions & 0 deletions sklearn/utils/tests/test_bunch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import warnings

import numpy as np
import pytest

from sklearn.utils import Bunch


def test_bunch_attribute_deprecation():
    """`__getitem__` on a deprecated Bunch key raises a FutureWarning."""
    bunch = Bunch()
    arr = np.asarray([1, 2, 3])
    warning_msg = (
        "Key: 'values', is deprecated in 1.3 and will be "
        "removed in 1.5. Please use 'pdp_values' instead"
    )
    bunch._set_deprecated(
        arr, new_key="pdp_values", deprecated_key="values", warning_message=warning_msg
    )

    # Fetching through the new key must be silent.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        fetched = bunch["pdp_values"]

    assert fetched is arr

    # Fetching through the deprecated key warns but returns the same object.
    with pytest.warns(FutureWarning, match=warning_msg):
        fetched = bunch["values"]

    assert fetched is arr
0