FIX `permutation_importance` with polars dataframe raises warning on feature names by Charlie-XIAO · Pull Request #28513 · scikit-learn/scikit-learn
Open · Charlie-XIAO wants to merge 26 commits into main from perm-imp-polars

Changes from all commits · 26 commits
75ef9e1  FIX feature name warning in permutation_importance with polars DataFrame (Charlie-XIAO, Feb 22, 2024)
b05d4d0  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Feb 23, 2024)
f514b83  fix deprecation warning convert array of dim>0 to scalar (Charlie-XIAO, Feb 23, 2024)
dc49e93  changelog added (Charlie-XIAO, Feb 23, 2024)
004a82f  changelog updated (Charlie-XIAO, Feb 23, 2024)
6006dbf  try to use adapters (Charlie-XIAO, Feb 28, 2024)
f2f3b3d  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Feb 29, 2024)
ce24263  utilize _safe_indexing to make adapter use a bit cleaner (Charlie-XIAO, Feb 29, 2024)
50eb852  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Feb 29, 2024)
aff405a  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Mar 21, 2024)
2b9e23b  merge conflicts (Charlie-XIAO, Mar 21, 2024)
fc736ce  try to fix on Linux_Docker debian_atlas_32bit (Charlie-XIAO, Mar 22, 2024)
48e7b30  add a comment (not 100% sure it is true) (Charlie-XIAO, Mar 22, 2024)
40952c7  Merge branch 'main' into perm-imp-polars (glemaitre, Apr 9, 2024)
6e30747  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Apr 9, 2024)
d1976b7  partial resolution of suggestions (Charlie-XIAO, Apr 9, 2024)
139cc60  move the _get_adapter_from_container function (Charlie-XIAO, Apr 10, 2024)
a3a3f02  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Apr 10, 2024)
e53cd87  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, May 22, 2024)
05b005e  move changelog entry (Charlie-XIAO, May 22, 2024)
642f925  hide complexity in _safe_assign (Charlie-XIAO, May 23, 2024)
d19dfdd  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, May 23, 2024)
ae34a23  case when value is scalar (Charlie-XIAO, May 23, 2024)
4816627  case when value is scalar (Charlie-XIAO, May 23, 2024)
32990fd  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, May 24, 2024)
0fc76e3  just ignore sparse_csr_array (Charlie-XIAO, May 24, 2024)
doc/whats_new/v1.6.rst (7 additions, 0 deletions)

@@ -81,6 +81,13 @@ Changelog
   whether a given estimator is of category clusterer.
   :pr:`28936` by :user:`Christian Veenhuis <ChVeen>`.
 
+:mod:`sklearn.inspection`
+..........................
+
+- |Fix| :func:`inspection.permutation_importance` no longer warns about missing feature
+  names when the estimator is fitted on a polars DataFrame and `X` is also a polars
+  dataframe. :pr:`28513` by :user:`Yao Xiao <Charlie-XIAO>`.
+
 Thanks to everyone who has contributed to the maintenance and improvement of
 the project since version 1.5, including:
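As a concrete illustration of the fixed behavior (a minimal sketch, assuming a scikit-learn build that contains this fix and polars installed; the toy data and model are illustrative, not taken from the PR):

    import warnings

    import numpy as np
    import polars as pl

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance

    # Fitting on a polars DataFrame makes the estimator record feature names.
    X = pl.DataFrame({
        "sepal": [5.1, 4.9, 6.2, 5.9] * 5,
        "petal": [1.4, 1.4, 4.5, 5.1] * 5,
    })
    y = np.array([0, 0, 1, 1] * 5)
    clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

    # Before this fix, X was converted to a plain numpy array internally, so
    # the scorer warned that "X does not have valid feature names". Turning
    # warnings into errors shows the polars path is now warning-free.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        result = permutation_importance(clf, X, y, n_repeats=3, random_state=0)
    print(result.importances_mean)  # one value per column of X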
2 changes: 1 addition & 1 deletion sklearn/impute/_iterative.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def _impute_one_feature(
)

# update the feature
_safe_assign(
X_filled = _safe_assign(
X_filled,
imputed_values,
row_indexer=missing_row_mask,
Expand Down
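This one-line change reflects a contract change in the private `_safe_assign` helper: it now returns the (possibly new) container instead of only mutating in place, because a polars DataFrame cannot be modified through positional assignment. A rough sketch of the idea (my simplification, not sklearn's actual helper):

    import numpy as np
    import polars as pl

    def safe_assign_sketch(X, values, column_indexer):
        # Container-agnostic column assignment whose result must be rebound
        # by the caller, as in `X_filled = _safe_assign(...)` above.
        if isinstance(X, pl.DataFrame):
            # polars: immutable-style API, build and return a new frame
            name = X.columns[column_indexer]
            return X.with_columns(pl.Series(name, np.asarray(values)))
        if hasattr(X, "iloc"):
            # pandas: in-place works; return X anyway for a uniform contract
            X.iloc[:, column_indexer] = values
            return X
        X[:, column_indexer] = values  # numpy
        return X

The same rebinding appears in sklearn/inspection/_partial_dependence.py just below.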
2 changes: 1 addition & 1 deletion sklearn/inspection/_partial_dependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def _partial_dependence_brute(
X_eval = X.copy()
for new_values in grid:
for i, variable in enumerate(features):
_safe_assign(X_eval, new_values[i], column_indexer=variable)
X_eval = _safe_assign(X_eval, new_values[i], column_indexer=variable)

try:
# Note: predictions is of shape
Expand Down
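A tiny demonstration (assuming polars is installed; the toy frame is mine) of why `X_eval` must be rebound on every iteration: for polars, the "assignment" leaves the original frame untouched and yields a new one.

    import polars as pl

    X_eval = pl.DataFrame({"a": [1.0, 2.0], "b": [10.0, 20.0]})
    X_new = X_eval.with_columns(pl.Series("a", [5.0, 5.0]))

    assert X_eval["a"].to_list() == [1.0, 2.0]  # original is unchanged
    assert X_new["a"].to_list() == [5.0, 5.0]   # the update lives in the new frame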
sklearn/inspection/_permutation_importance.py (16 additions, 7 deletions)

@@ -8,6 +8,7 @@
 from ..metrics import check_scoring, get_scorer_names
 from ..model_selection._validation import _aggregate_score_dicts
 from ..utils import Bunch, _safe_indexing, check_array, check_random_state
+from ..utils._indexing import _safe_assign
 from ..utils._param_validation import (
     HasMethods,
     Integral,
@@ -16,7 +17,9 @@
     StrOptions,
     validate_params,
 )
+from ..utils._set_output import _get_adapter_from_container
 from ..utils.parallel import Parallel, delayed
+from ..utils.validation import _is_pandas_df, _is_polars_df
 
 
 def _weights_scorer(scorer, estimator, X, y, sample_weight):
@@ -35,6 +38,7 @@ def _calculate_permutation_scores(
     n_repeats,
     scorer,
     max_samples,
+    adapter,
 ):
     """Calculate score when `col_idx` is permuted."""
     random_state = check_random_state(random_state)
@@ -56,19 +60,20 @@ def _calculate_permutation_scores(
         y = _safe_indexing(y, row_indices, axis=0)
         if sample_weight is not None:
             sample_weight = _safe_indexing(sample_weight, row_indices, axis=0)
+    elif adapter is not None:
+        X_permuted = adapter.copy(X)
     else:
         X_permuted = X.copy()
 
     scores = []
     shuffling_idx = np.arange(X_permuted.shape[0])
     for _ in range(n_repeats):
         random_state.shuffle(shuffling_idx)
-        if hasattr(X_permuted, "iloc"):
-            col = X_permuted.iloc[shuffling_idx, col_idx]
-            col.index = X_permuted.index
-            X_permuted[X_permuted.columns[col_idx]] = col
-        else:
-            X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
+        col = _safe_indexing(
+            _safe_indexing(X_permuted, col_idx, axis=1),
+            shuffling_idx,
+        )
+        X_permuted = _safe_assign(X_permuted, col, column_indexer=col_idx)
         scores.append(_weights_scorer(scorer, estimator, X_permuted, y, sample_weight))
 
     if isinstance(scores[0], dict):
@@ -262,7 +267,10 @@ def permutation_importance(
     >>> result.importances_std
     array([0.2211..., 0.       , 0.       ])
     """
-    if not hasattr(X, "iloc"):
+    if _is_pandas_df(X) or _is_polars_df(X):
+        adapter = _get_adapter_from_container(X)
+    else:
+        adapter = None
         X = check_array(X, force_all_finite="allow-nan", dtype=None)
 
     # Precompute random seed from the random state to be used
@@ -292,6 +300,7 @@
             n_repeats,
             scorer,
             max_samples,
+            adapter,
         )
         for col_idx in range(X.shape[1])
     )
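Putting the pieces together, the permutation loop now copies and shuffles the container without dropping dataframes down to bare numpy arrays. Sketched with public pandas/polars APIs rather than sklearn's private `_safe_indexing`/`_safe_assign`/adapter helpers (the dispatch below is an approximation of the idea, not the actual implementation):

    import numpy as np

    def shuffle_one_column(X, col_idx, rng):
        # Return a copy of X with column `col_idx` permuted, preserving the
        # container type and therefore its feature names.
        idx = rng.permutation(X.shape[0])
        if hasattr(X, "with_columns"):  # polars-like frame
            import polars as pl

            name = X.columns[col_idx]
            return X.with_columns(pl.Series(name, X[name].to_numpy()[idx]))
        if hasattr(X, "iloc"):  # pandas-like frame
            X = X.copy()
            X.iloc[:, col_idx] = X.iloc[idx, col_idx].to_numpy()
            return X
        X = X.copy()  # plain numpy array
        X[:, col_idx] = X[idx, col_idx]
        return X

Because the permuted copy keeps its dataframe type, the scorer sees the same feature names the estimator was fitted with, so no feature-name warning is raised.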
sklearn/inspection/tests/test_permutation_importance.py (31 additions, 33 deletions)

@@ -66,23 +66,25 @@ def test_permutation_importance_correlated_feature_regression(
 
 @pytest.mark.parametrize("n_jobs", [1, 2])
 @pytest.mark.parametrize("max_samples", [0.5, 1.0])
-def test_permutation_importance_correlated_feature_regression_pandas(
-    n_jobs, max_samples
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_permutation_importance_correlated_feature_regression_dataframe(
+    n_jobs, max_samples, dataframe_lib
 ):
-    pd = pytest.importorskip("pandas")
-
     # Make sure that feature highly correlated to the target have a higher
     # importance
     rng = np.random.RandomState(42)
     n_repeats = 5
 
     dataset = load_iris()
     X, y = dataset.data, dataset.target
-    y_with_little_noise = (y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1)
+    y_with_little_noise = y + rng.normal(scale=0.001, size=y.shape[0])
 
     # Adds feature correlated with y as the last column
-    X = pd.DataFrame(X, columns=dataset.feature_names)
-    X["correlated_feature"] = y_with_little_noise
+    X = _convert_container(X, dataframe_lib, columns_name=dataset.feature_names)
+    if dataframe_lib == "pandas":
+        X["correlated_feature"] = y_with_little_noise
+    else:
+        X = X.with_columns(correlated_feature=y_with_little_noise)
 
     clf = RandomForestClassifier(n_estimators=10, random_state=42)
     clf.fit(X, y)
@@ -215,13 +217,14 @@ def test_permutation_importance_mixed_types():
     assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1])
 
 
-def test_permutation_importance_mixed_types_pandas():
-    pd = pytest.importorskip("pandas")
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_permutation_importance_mixed_types_dataframe(dataframe_lib):
     rng = np.random.RandomState(42)
     n_repeats = 5
 
     # Last column is correlated with y
-    X = pd.DataFrame({"col1": [1.0, 2.0, 3.0, np.nan], "col2": ["a", "b", "a", "b"]})
+    X = [[1.0, "a"], [2.0, "b"], [3.0, "a"], [np.nan, "b"]]
+    X = _convert_container(X, dataframe_lib, columns_name=["col1", "col2"])
     y = np.array([0, 1, 0, 1])
 
     num_preprocess = make_pipeline(SimpleImputer(), StandardScaler())
@@ -300,41 +303,36 @@ def test_permutation_importance_equivalence_sequential_parallel(max_samples):
 
 @pytest.mark.parametrize("n_jobs", [None, 1, 2])
 @pytest.mark.parametrize("max_samples", [0.5, 1.0])
-def test_permutation_importance_equivalence_array_dataframe(n_jobs, max_samples):
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_permutation_importance_equivalence_array_dataframe(
+    n_jobs, max_samples, dataframe_lib
+):
     # This test checks that the column shuffling logic has the same behavior
     # both a dataframe and a simple numpy array.
-    pd = pytest.importorskip("pandas")
 
     # regression test to make sure that sequential and parallel calls will
     # output the same results.
     X, y = make_regression(n_samples=100, n_features=5, random_state=0)
-    X_df = pd.DataFrame(X)
 
     # Add a categorical feature that is statistically linked to y:
    binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
-    cat_column = binner.fit_transform(y.reshape(-1, 1))
-
-    # Concatenate the extra column to the numpy array: integers will be
-    # cast to float values
+    cat_column = binner.fit_transform(y.reshape(-1, 1)).astype(str)
     X = np.hstack([X, cat_column])
-    assert X.dtype.kind == "f"
 
-    # Insert extra column as a non-numpy-native dtype (while keeping backward
-    # compat for old pandas versions):
-    if hasattr(pd, "Categorical"):
-        cat_column = pd.Categorical(cat_column.ravel())
-    else:
-        cat_column = cat_column.ravel()
-    new_col_idx = len(X_df.columns)
-    X_df[new_col_idx] = cat_column
-    assert X_df[new_col_idx].dtype == cat_column.dtype
-
-    # Stich an arbitrary index to the dataframe:
-    X_df.index = np.arange(len(X_df)).astype(str)
+    # cat_column is converted to categorical dtype, which is non-numpy-native
+    X_df = _convert_container(
+        X,
+        dataframe_lib,
+        columns_name=[f"col_{i}" for i in range(X.shape[1])],
+        categorical_feature_names=[f"col_{X.shape[1] - 1}"],
+    )
 
     rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
     rf.fit(X, y)
 
+    rf_df = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
+    rf_df.fit(X_df, y)
+
     n_repeats = 3
     importance_array = permutation_importance(
         rf,
@@ -352,10 +350,10 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs, max_samples)
     imp_max = importance_array["importances"].max()
     assert imp_max - imp_min > 0.3
 
-    # Now check that importances computed on dataframe matche the values
+    # Now check that importances computed on dataframe match the values
     # of those computed on the array with the same data.
     importance_dataframe = permutation_importance(
-        rf,
+        rf_df,
         X_df,
         y,
         n_repeats=n_repeats,
@@ -368,7 +366,7 @@
 
 
-@pytest.mark.parametrize("input_type", ["array", "dataframe"])
+@pytest.mark.parametrize("input_type", ["array", "pandas", "polars"])
 def test_permutation_importance_large_memmaped_data(input_type):
     # Smoke, non-regression test for:
     # https://github.com/scikit-learn/scikit-learn/issues/15810
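The parametrized tests lean on scikit-learn's private `_convert_container` test utility to build the same data as either a pandas or a polars DataFrame. A minimal stand-in (my simplification, covering only what these tests need) looks roughly like:

    import pytest

    def convert_container_sketch(data, lib, columns_name):
        # Build a pandas or polars DataFrame from `data`, skipping the test
        # when the requested library is not installed.
        if lib == "pandas":
            pd = pytest.importorskip("pandas")
            return pd.DataFrame(data, columns=columns_name)
        if lib == "polars":
            pl = pytest.importorskip("polars")
            return pl.DataFrame(data, schema=columns_name, orient="row")
        raise ValueError(f"unknown dataframe library: {lib!r}")

With a helper like this, one test body can cover both libraries, which is how the `dataframe_lib` parametrization above exercises the pandas and polars code paths with identical assertions.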