FIX `permutation_importance` with polars dataframe raises warning on feature names by Charlie-XIAO · Pull Request #28513 · scikit-learn/scikit-learn
Open · Charlie-XIAO wants to merge 26 commits into main from perm-imp-polars

Changes from all commits · 26 commits
75ef9e1  FIX feature name warning in permutation_importance with polars DataFrame (Charlie-XIAO, Feb 22, 2024)
b05d4d0  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Feb 23, 2024)
f514b83  fix deprecation warning convert array of dim>0 to scalar (Charlie-XIAO, Feb 23, 2024)
dc49e93  changelog added (Charlie-XIAO, Feb 23, 2024)
004a82f  changelog updated (Charlie-XIAO, Feb 23, 2024)
6006dbf  try to use adapters (Charlie-XIAO, Feb 28, 2024)
f2f3b3d  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Feb 29, 2024)
ce24263  utilize _safe_indexing to make adapter use a bit cleaner (Charlie-XIAO, Feb 29, 2024)
50eb852  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Feb 29, 2024)
aff405a  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Mar 21, 2024)
2b9e23b  merge conflicts (Charlie-XIAO, Mar 21, 2024)
fc736ce  try to fix on Linux_Docker debian_atlas_32bit (Charlie-XIAO, Mar 22, 2024)
48e7b30  add a comment (not 100% sure it is true) (Charlie-XIAO, Mar 22, 2024)
40952c7  Merge branch 'main' into perm-imp-polars (glemaitre, Apr 9, 2024)
6e30747  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Apr 9, 2024)
d1976b7  partial resolution of suggestions (Charlie-XIAO, Apr 9, 2024)
139cc60  move the _get_adapter_from_container function (Charlie-XIAO, Apr 10, 2024)
a3a3f02  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, Apr 10, 2024)
e53cd87  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, May 22, 2024)
05b005e  move changelog entry (Charlie-XIAO, May 22, 2024)
642f925  hide complexity in _safe_assign (Charlie-XIAO, May 23, 2024)
d19dfdd  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, May 23, 2024)
ae34a23  case when value is scalar (Charlie-XIAO, May 23, 2024)
4816627  case when value is scalar (Charlie-XIAO, May 23, 2024)
32990fd  Merge remote-tracking branch 'upstream/main' into perm-imp-polars (Charlie-XIAO, May 24, 2024)
0fc76e3  just ignore sparse_csr_array (Charlie-XIAO, May 24, 2024)
doc/whats_new/v1.6.rst (7 additions, 0 deletions)

@@ -81,6 +81,13 @@ Changelog
   whether a given estimator is of category clusterer.
   :pr:`28936` by :user:`Christian Veenhuis <ChVeen>`.
 
+:mod:`sklearn.inspection`
+..........................
+
+- |Fix| :func:`inspection.permutation_importance` no longer warns about missing feature
+  names when the estimator is fitted on a polars DataFrame and `X` is also a polars
+  dataframe. :pr:`28513` by :user:`Yao Xiao <Charlie-XIAO>`.
+
 Thanks to everyone who has contributed to the maintenance and improvement of
 the project since version 1.5, including:
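As a concrete illustration of the fixed behavior (a minimal sketch, assuming a scikit-learn build that contains this fix and polars installed; the toy data and model are illustrative, not taken from the PR):

    import warnings

    import numpy as np
    import polars as pl

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance

    # Fitting on a polars DataFrame makes the estimator record feature names.
    X = pl.DataFrame({
        "sepal": [5.1, 4.9, 6.2, 5.9] * 5,
        "petal": [1.4, 1.4, 4.5, 5.1] * 5,
    })
    y = np.array([0, 0, 1, 1] * 5)
    clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

    # Before this fix, X was converted to a plain numpy array internally, so
    # the scorer warned that "X does not have valid feature names". Turning
    # warnings into errors shows the polars path is now warning-free.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        result = permutation_importance(clf, X, y, n_repeats=3, random_state=0)
    print(result.importances_mean)  # one value per column of X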
2 changes: 1 addition & 1 deletion sklearn/impute/_iterative.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def _impute_one_feature(
)

# update the feature
_safe_assign(
X_filled = _safe_assign(
X_filled,
imputed_values,
row_indexer=missing_row_mask,
Expand Down
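This one-line change reflects a contract change in the private `_safe_assign` helper: it now returns the (possibly new) container instead of only mutating in place, because a polars DataFrame cannot be modified through positional assignment. A rough sketch of the idea (my simplification, not sklearn's actual helper):

    import numpy as np
    import polars as pl

    def safe_assign_sketch(X, values, column_indexer):
        # Container-agnostic column assignment whose result must be rebound
        # by the caller, as in `X_filled = _safe_assign(...)` above.
        if isinstance(X, pl.DataFrame):
            # polars: immutable-style API, build and return a new frame
            name = X.columns[column_indexer]
            return X.with_columns(pl.Series(name, np.asarray(values)))
        if hasattr(X, "iloc"):
            # pandas: in-place works; return X anyway for a uniform contract
            X.iloc[:, column_indexer] = values
            return X
        X[:, column_indexer] = values  # numpy
        return X

The same rebinding appears in sklearn/inspection/_partial_dependence.py just below.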
2 changes: 1 addition & 1 deletion sklearn/inspection/_partial_dependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def _partial_dependence_brute(
X_eval = X.copy()
for new_values in grid:
for i, variable in enumerate(features):
_safe_assign(X_eval, new_values[i], column_indexer=variable)
X_eval = _safe_assign(X_eval, new_values[i], column_indexer=variable)

try:
# Note: predictions is of shape
Expand Down
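A tiny demonstration (assuming polars is installed; the toy frame is mine) of why `X_eval` must be rebound on every iteration: for polars, the "assignment" leaves the original frame untouched and yields a new one.

    import polars as pl

    X_eval = pl.DataFrame({"a": [1.0, 2.0], "b": [10.0, 20.0]})
    X_new = X_eval.with_columns(pl.Series("a", [5.0, 5.0]))

    assert X_eval["a"].to_list() == [1.0, 2.0]  # original is unchanged
    assert X_new["a"].to_list() == [5.0, 5.0]   # the update lives in the new frame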
sklearn/inspection/_permutation_importance.py (16 additions, 7 deletions)

@@ -8,6 +8,7 @@
 from ..metrics import check_scoring, get_scorer_names
 from ..model_selection._validation import _aggregate_score_dicts
 from ..utils import Bunch, _safe_indexing, check_array, check_random_state
+from ..utils._indexing import _safe_assign
 from ..utils._param_validation import (
     HasMethods,
     Integral,
@@ -16,7 +17,9 @@
     StrOptions,
     validate_params,
 )
+from ..utils._set_output import _get_adapter_from_container
 from ..utils.parallel import Parallel, delayed
+from ..utils.validation import _is_pandas_df, _is_polars_df
 
 
 def _weights_scorer(scorer, estimator, X, y, sample_weight):
@@ -35,6 +38,7 @@ def _calculate_permutation_scores(
     n_repeats,
     scorer,
     max_samples,
+    adapter,
 ):
     """Calculate score when `col_idx` is permuted."""
     random_state = check_random_state(random_state)
@@ -56,19 +60,20 @@ def _calculate_permutation_scores(
         y = _safe_indexing(y, row_indices, axis=0)
         if sample_weight is not None:
             sample_weight = _safe_indexing(sample_weight, row_indices, axis=0)
+    elif adapter is not None:
+        X_permuted = adapter.copy(X)
     else:
         X_permuted = X.copy()
 
     scores = []
     shuffling_idx = np.arange(X_permuted.shape[0])
     for _ in range(n_repeats):
         random_state.shuffle(shuffling_idx)
-        if hasattr(X_permuted, "iloc"):
-            col = X_permuted.iloc[shuffling_idx, col_idx]
-            col.index = X_permuted.index
-            X_permuted[X_permuted.columns[col_idx]] = col
-        else:
-            X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
+        col = _safe_indexing(
+            _safe_indexing(X_permuted, col_idx, axis=1),
+            shuffling_idx,
+        )
+        X_permuted = _safe_assign(X_permuted, col, column_indexer=col_idx)
         scores.append(_weights_scorer(scorer, estimator, X_permuted, y, sample_weight))
 
     if isinstance(scores[0], dict):
@@ -262,7 +267,10 @@ def permutation_importance(
     >>> result.importances_std
     array([0.2211..., 0.       , 0.       ])
     """
-    if not hasattr(X, "iloc"):
+    if _is_pandas_df(X) or _is_polars_df(X):
+        adapter = _get_adapter_from_container(X)
+    else:
+        adapter = None
         X = check_array(X, force_all_finite="allow-nan", dtype=None)
 
     # Precompute random seed from the random state to be used
@@ -292,6 +300,7 @@
             n_repeats,
             scorer,
             max_samples,
+            adapter,
         )
         for col_idx in range(X.shape[1])
     )
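Putting the pieces together, the permutation loop now copies and shuffles the container without dropping dataframes down to bare numpy arrays. Sketched with public pandas/polars APIs rather than sklearn's private `_safe_indexing`/`_safe_assign`/adapter helpers (the dispatch below is an approximation of the idea, not the actual implementation):

    import numpy as np

    def shuffle_one_column(X, col_idx, rng):
        # Return a copy of X with column `col_idx` permuted, preserving the
        # container type and therefore its feature names.
        idx = rng.permutation(X.shape[0])
        if hasattr(X, "with_columns"):  # polars-like frame
            import polars as pl

            name = X.columns[col_idx]
            return X.with_columns(pl.Series(name, X[name].to_numpy()[idx]))
        if hasattr(X, "iloc"):  # pandas-like frame
            X = X.copy()
            X.iloc[:, col_idx] = X.iloc[idx, col_idx].to_numpy()
            return X
        X = X.copy()  # plain numpy array
        X[:, col_idx] = X[idx, col_idx]
        return X

Because the permuted copy keeps its dataframe type, the scorer sees the same feature names the estimator was fitted with, so no feature-name warning is raised.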
sklearn/inspection/tests/test_permutation_importance.py (31 additions, 33 deletions)

@@ -66,23 +66,25 @@ def test_permutation_importance_correlated_feature_regression(
 
 @pytest.mark.parametrize("n_jobs", [1, 2])
 @pytest.mark.parametrize("max_samples", [0.5, 1.0])
-def test_permutation_importance_correlated_feature_regression_pandas(
-    n_jobs, max_samples
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_permutation_importance_correlated_feature_regression_dataframe(
+    n_jobs, max_samples, dataframe_lib
 ):
-    pd = pytest.importorskip("pandas")
-
     # Make sure that feature highly correlated to the target have a higher
     # importance
     rng = np.random.RandomState(42)
     n_repeats = 5
 
     dataset = load_iris()
     X, y = dataset.data, dataset.target
-    y_with_little_noise = (y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1)
+    y_with_little_noise = y + rng.normal(scale=0.001, size=y.shape[0])
 
     # Adds feature correlated with y as the last column
-    X = pd.DataFrame(X, columns=dataset.feature_names)
-    X["correlated_feature"] = y_with_little_noise
+    X = _convert_container(X, dataframe_lib, columns_name=dataset.feature_names)
+    if dataframe_lib == "pandas":
+        X["correlated_feature"] = y_with_little_noise
+    else:
+        X = X.with_columns(correlated_feature=y_with_little_noise)
 
     clf = RandomForestClassifier(n_estimators=10, random_state=42)
     clf.fit(X, y)
@@ -215,13 +217,14 @@ def test_permutation_importance_mixed_types():
     assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1])
 
 
-def test_permutation_importance_mixed_types_pandas():
-    pd = pytest.importorskip("pandas")
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_permutation_importance_mixed_types_dataframe(dataframe_lib):
     rng = np.random.RandomState(42)
     n_repeats = 5
 
     # Last column is correlated with y
-    X = pd.DataFrame({"col1": [1.0, 2.0, 3.0, np.nan], "col2": ["a", "b", "a", "b"]})
+    X = [[1.0, "a"], [2.0, "b"], [3.0, "a"], [np.nan, "b"]]
+    X = _convert_container(X, dataframe_lib, columns_name=["col1", "col2"])
     y = np.array([0, 1, 0, 1])
 
     num_preprocess = make_pipeline(SimpleImputer(), StandardScaler())
@@ -300,41 +303,36 @@ def test_permutation_importance_equivalence_sequential_parallel(max_samples):
 
 @pytest.mark.parametrize("n_jobs", [None, 1, 2])
 @pytest.mark.parametrize("max_samples", [0.5, 1.0])
-def test_permutation_importance_equivalence_array_dataframe(n_jobs, max_samples):
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_permutation_importance_equivalence_array_dataframe(
+    n_jobs, max_samples, dataframe_lib
+):
     # This test checks that the column shuffling logic has the same behavior
     # both a dataframe and a simple numpy array.
-    pd = pytest.importorskip("pandas")
 
     # regression test to make sure that sequential and parallel calls will
     # output the same results.
     X, y = make_regression(n_samples=100, n_features=5, random_state=0)
-    X_df = pd.DataFrame(X)
 
     # Add a categorical feature that is statistically linked to y:
    binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
-    cat_column = binner.fit_transform(y.reshape(-1, 1))
-
-    # Concatenate the extra column to the numpy array: integers will be
-    # cast to float values
+    cat_column = binner.fit_transform(y.reshape(-1, 1)).astype(str)
     X = np.hstack([X, cat_column])
-    assert X.dtype.kind == "f"
 
-    # Insert extra column as a non-numpy-native dtype (while keeping backward
-    # compat for old pandas versions):
-    if hasattr(pd, "Categorical"):
-        cat_column = pd.Categorical(cat_column.ravel())
-    else:
-        cat_column = cat_column.ravel()
-    new_col_idx = len(X_df.columns)
-    X_df[new_col_idx] = cat_column
-    assert X_df[new_col_idx].dtype == cat_column.dtype
-
-    # Stich an arbitrary index to the dataframe:
-    X_df.index = np.arange(len(X_df)).astype(str)
+    # cat_column is converted to categorical dtype, which is non-numpy-native
+    X_df = _convert_container(
+        X,
+        dataframe_lib,
+        columns_name=[f"col_{i}" for i in range(X.shape[1])],
+        categorical_feature_names=[f"col_{X.shape[1] - 1}"],
+    )
 
     rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
     rf.fit(X, y)
 
+    rf_df = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
+    rf_df.fit(X_df, y)
+
     n_repeats = 3
     importance_array = permutation_importance(
         rf,
@@ -352,10 +350,10 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs, max_samples)
     imp_max = importance_array["importances"].max()
     assert imp_max - imp_min > 0.3
 
-    # Now check that importances computed on dataframe matche the values
+    # Now check that importances computed on dataframe match the values
     # of those computed on the array with the same data.
     importance_dataframe = permutation_importance(
-        rf,
+        rf_df,
         X_df,
         y,
         n_repeats=n_repeats,
@@ -368,7 +366,7 @@
 
 
-@pytest.mark.parametrize("input_type", ["array", "dataframe"])
+@pytest.mark.parametrize("input_type", ["array", "pandas", "polars"])
 def test_permutation_importance_large_memmaped_data(input_type):
     # Smoke, non-regression test for:
     # https://github.com/scikit-learn/scikit-learn/issues/15810
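The parametrized tests lean on scikit-learn's private `_convert_container` test utility to build the same data as either a pandas or a polars DataFrame. A minimal stand-in (my simplification, covering only what these tests need) looks roughly like:

    import pytest

    def convert_container_sketch(data, lib, columns_name):
        # Build a pandas or polars DataFrame from `data`, skipping the test
        # when the requested library is not installed.
        if lib == "pandas":
            pd = pytest.importorskip("pandas")
            return pd.DataFrame(data, columns=columns_name)
        if lib == "polars":
            pl = pytest.importorskip("polars")
            return pl.DataFrame(data, schema=columns_name, orient="row")
        raise ValueError(f"unknown dataframe library: {lib!r}")

With a helper like this, one test body can cover both libraries, which is how the `dataframe_lib` parametrization above exercises the pandas and polars code paths with identical assertions.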