diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
index 53b0eb017fc57..8e1cf535ff312 100644
--- a/doc/whats_new/v1.6.rst
+++ b/doc/whats_new/v1.6.rst
@@ -81,6 +81,13 @@ Changelog
   whether a given estimator is of category clusterer.
   :pr:`28936` by :user:`Christian Veenhuis`.
 
+:mod:`sklearn.inspection`
+..........................
+
+- |Fix| :func:`inspection.permutation_importance` no longer warns about missing feature
+  names when the estimator is fitted on a polars DataFrame and `X` is also a polars
+  DataFrame. :pr:`28513` by :user:`Yao Xiao`.
+
 Thanks to everyone who has contributed to the maintenance and improvement of
 the project since version 1.5, including:
 
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py
index 41f903061c34d..8860d55bdb873 100644
--- a/sklearn/impute/_iterative.py
+++ b/sklearn/impute/_iterative.py
@@ -457,7 +457,7 @@ def _impute_one_feature(
         )
 
         # update the feature
-        _safe_assign(
+        X_filled = _safe_assign(
             X_filled,
             imputed_values,
             row_indexer=missing_row_mask,
diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py
index b6ca19c407f34..24b5e3bf53d4f 100644
--- a/sklearn/inspection/_partial_dependence.py
+++ b/sklearn/inspection/_partial_dependence.py
@@ -292,7 +292,7 @@ def _partial_dependence_brute(
         X_eval = X.copy()
     for new_values in grid:
         for i, variable in enumerate(features):
-            _safe_assign(X_eval, new_values[i], column_indexer=variable)
+            X_eval = _safe_assign(X_eval, new_values[i], column_indexer=variable)
 
         try:
             # Note: predictions is of shape
diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 659db143153cc..bbb9c3b630d96 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -8,6 +8,7 @@
 from ..metrics import check_scoring, get_scorer_names
 from ..model_selection._validation import _aggregate_score_dicts
 from ..utils import Bunch, _safe_indexing, check_array, check_random_state
+from ..utils._indexing import _safe_assign
 from ..utils._param_validation import (
     HasMethods,
     Integral,
@@ -16,7 +17,9 @@
     StrOptions,
     validate_params,
 )
+from ..utils._set_output import _get_adapter_from_container
 from ..utils.parallel import Parallel, delayed
+from ..utils.validation import _is_pandas_df, _is_polars_df
 
 
 def _weights_scorer(scorer, estimator, X, y, sample_weight):
@@ -35,6 +38,7 @@ def _calculate_permutation_scores(
     n_repeats,
     scorer,
     max_samples,
+    adapter,
 ):
     """Calculate score when `col_idx` is permuted."""
     random_state = check_random_state(random_state)
@@ -56,6 +60,8 @@ def _calculate_permutation_scores(
         y = _safe_indexing(y, row_indices, axis=0)
         if sample_weight is not None:
             sample_weight = _safe_indexing(sample_weight, row_indices, axis=0)
+    elif adapter is not None:
+        X_permuted = adapter.copy(X)
     else:
         X_permuted = X.copy()
 
@@ -63,12 +69,11 @@ def _calculate_permutation_scores(
     shuffling_idx = np.arange(X_permuted.shape[0])
     for _ in range(n_repeats):
         random_state.shuffle(shuffling_idx)
-        if hasattr(X_permuted, "iloc"):
-            col = X_permuted.iloc[shuffling_idx, col_idx]
-            col.index = X_permuted.index
-            X_permuted[X_permuted.columns[col_idx]] = col
-        else:
-            X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
+        col = _safe_indexing(
+            _safe_indexing(X_permuted, col_idx, axis=1),
+            shuffling_idx,
+        )
+        X_permuted = _safe_assign(X_permuted, col, column_indexer=col_idx)
         scores.append(_weights_scorer(scorer, estimator, X_permuted, y, sample_weight))
 
     if isinstance(scores[0], dict):
@@ -262,7 +267,10 @@ def permutation_importance(
     >>> result.importances_std
     array([0.2211..., 0.       , 0.       ])
     """
-    if not hasattr(X, "iloc"):
+    if _is_pandas_df(X) or _is_polars_df(X):
+        adapter = _get_adapter_from_container(X)
+    else:
+        adapter = None
         X = check_array(X, force_all_finite="allow-nan", dtype=None)
 
     # Precompute random seed from the random state to be used
@@ -292,6 +300,7 @@ def permutation_importance(
             n_repeats,
             scorer,
             max_samples,
+            adapter,
         )
         for col_idx in range(X.shape[1])
     )
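Note on the change above: `permutation_importance` previously converted any non-pandas `X`
to a numpy array, so a polars dataframe lost its feature names and every scorer call on a
polars-fitted estimator warned about them being missing. A minimal sketch of the fixed
behavior, assuming this patch is applied and polars is installed (illustrative only, not
part of the patch):

    import warnings

    import numpy as np
    import polars as pl

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance

    rng = np.random.RandomState(0)
    X = pl.DataFrame({"a": rng.randn(100), "b": rng.randn(100)})
    y = (X["a"].to_numpy() > 0).astype(int)
    clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

    with warnings.catch_warnings():
        # before this change, each scorer call warned that X "does not have
        # valid feature names" because X had been converted to a numpy array
        warnings.simplefilter("error", UserWarning)
        result = permutation_importance(clf, X, y, n_repeats=2, random_state=0)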
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 478a10515aa01..4a695745fa0bb 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -66,11 +66,10 @@ def test_permutation_importance_correlated_feature_regression(
 
 @pytest.mark.parametrize("n_jobs", [1, 2])
 @pytest.mark.parametrize("max_samples", [0.5, 1.0])
-def test_permutation_importance_correlated_feature_regression_pandas(
-    n_jobs, max_samples
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_permutation_importance_correlated_feature_regression_dataframe(
+    n_jobs, max_samples, dataframe_lib
 ):
-    pd = pytest.importorskip("pandas")
-
     # Make sure that feature highly correlated to the target have a higher
     # importance
     rng = np.random.RandomState(42)
@@ -78,11 +77,14 @@ def test_permutation_importance_correlated_feature_regression_dataframe(
     dataset = load_iris()
     X, y = dataset.data, dataset.target
-    y_with_little_noise = (y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1)
+    y_with_little_noise = y + rng.normal(scale=0.001, size=y.shape[0])
 
     # Adds feature correlated with y as the last column
-    X = pd.DataFrame(X, columns=dataset.feature_names)
-    X["correlated_feature"] = y_with_little_noise
+    X = _convert_container(X, dataframe_lib, columns_name=dataset.feature_names)
+    if dataframe_lib == "pandas":
+        X["correlated_feature"] = y_with_little_noise
+    else:
+        X = X.with_columns(correlated_feature=y_with_little_noise)
 
     clf = RandomForestClassifier(n_estimators=10, random_state=42)
     clf.fit(X, y)
 
@@ -215,13 +217,14 @@ def test_permutation_importance_mixed_types():
     assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1])
 
 
-def test_permutation_importance_mixed_types_pandas():
-    pd = pytest.importorskip("pandas")
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_permutation_importance_mixed_types_dataframe(dataframe_lib):
     rng = np.random.RandomState(42)
     n_repeats = 5
 
     # Last column is correlated with y
-    X = pd.DataFrame({"col1": [1.0, 2.0, 3.0, np.nan], "col2": ["a", "b", "a", "b"]})
+    X = [[1.0, "a"], [2.0, "b"], [3.0, "a"], [np.nan, "b"]]
+    X = _convert_container(X, dataframe_lib, columns_name=["col1", "col2"])
     y = np.array([0, 1, 0, 1])
 
     num_preprocess = make_pipeline(SimpleImputer(), StandardScaler())
@@ -300,41 +303,36 @@ def test_permutation_importance_equivalence_sequential_parallel(max_samples):
 
 @pytest.mark.parametrize("n_jobs", [None, 1, 2])
 @pytest.mark.parametrize("max_samples", [0.5, 1.0])
-def test_permutation_importance_equivalence_array_dataframe(n_jobs, max_samples):
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_permutation_importance_equivalence_array_dataframe(
+    n_jobs, max_samples, dataframe_lib
+):
     # This test checks that the column shuffling logic has the same behavior
     # both a dataframe and a simple numpy array.
-    pd = pytest.importorskip("pandas")
 
     # regression test to make sure that sequential and parallel calls will
     # output the same results.
     X, y = make_regression(n_samples=100, n_features=5, random_state=0)
-    X_df = pd.DataFrame(X)
 
     # Add a categorical feature that is statistically linked to y:
     binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
-    cat_column = binner.fit_transform(y.reshape(-1, 1))
-
-    # Concatenate the extra column to the numpy array: integers will be
-    # cast to float values
+    cat_column = binner.fit_transform(y.reshape(-1, 1)).astype(str)
     X = np.hstack([X, cat_column])
-    assert X.dtype.kind == "f"
 
-    # Insert extra column as a non-numpy-native dtype (while keeping backward
-    # compat for old pandas versions):
-    if hasattr(pd, "Categorical"):
-        cat_column = pd.Categorical(cat_column.ravel())
-    else:
-        cat_column = cat_column.ravel()
-    new_col_idx = len(X_df.columns)
-    X_df[new_col_idx] = cat_column
-    assert X_df[new_col_idx].dtype == cat_column.dtype
-
-    # Stich an arbitrary index to the dataframe:
-    X_df.index = np.arange(len(X_df)).astype(str)
+    # cat_column is converted to categorical dtype, which is non-numpy-native
+    X_df = _convert_container(
+        X,
+        dataframe_lib,
+        columns_name=[f"col_{i}" for i in range(X.shape[1])],
+        categorical_feature_names=[f"col_{X.shape[1] - 1}"],
+    )
 
     rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
     rf.fit(X, y)
 
+    rf_df = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
+    rf_df.fit(X_df, y)
+
     n_repeats = 3
     importance_array = permutation_importance(
         rf,
@@ -352,10 +350,10 @@ def test_permutation_importance_equivalence_array_dataframe(
     imp_max = importance_array["importances"].max()
     assert imp_max - imp_min > 0.3
 
-    # Now check that importances computed on dataframe matche the values
+    # Now check that importances computed on dataframe match the values
     # of those computed on the array with the same data.
     importance_dataframe = permutation_importance(
-        rf,
+        rf_df,
         X_df,
         y,
         n_repeats=n_repeats,
@@ -368,7 +366,7 @@ def test_permutation_importance_equivalence_array_dataframe(
     )
 
 
-@pytest.mark.parametrize("input_type", ["array", "dataframe"])
+@pytest.mark.parametrize("input_type", ["array", "pandas", "polars"])
 def test_permutation_importance_large_memmaped_data(input_type):
     # Smoke, non-regression test for:
     # https://github.com/scikit-learn/scikit-learn/issues/15810
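The tests above build the same data as either a pandas or a polars dataframe through the
private `_convert_container` helper from `sklearn.utils._testing`. A rough sketch of how
it is used, within a pytest run so that a missing library is skipped rather than failing
(illustrative only, not part of the patch):

    import numpy as np
    from sklearn.utils._testing import _convert_container

    X = np.array([[1.0, 2.0], [3.0, 4.0]])
    for lib in ("pandas", "polars"):
        # returns a pandas.DataFrame or polars.DataFrame with the given columns
        df = _convert_container(X, lib, columns_name=["col_0", "col_1"])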
diff --git a/sklearn/utils/_indexing.py b/sklearn/utils/_indexing.py
index ca2327f2bb109..35c6ce968b498 100644
--- a/sklearn/utils/_indexing.py
+++ b/sklearn/utils/_indexing.py
@@ -13,6 +13,8 @@
 from .validation import (
     _is_arraylike_not_scalar,
     _is_pandas_df,
+    _is_pandas_df_or_series,
+    _is_polars_df,
     _is_polars_df_or_series,
     _use_interchange_protocol,
     check_array,
@@ -174,7 +176,7 @@ def _determine_key_type(key, accept_slice=True):
 
 
 def _safe_indexing(X, indices, *, axis=0):
-    """Return rows, items or columns of X using indices.
+    """Return rows, items or columns of `X` using indices.
 
     .. warning::
 
@@ -184,7 +186,7 @@ def _safe_indexing(X, indices, *, axis=0):
     Parameters
     ----------
-    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
+    X : array-like, sparse matrix, dataframe, series
         Data from which to sample rows, items or columns. `list` are
         only supported when `axis=0`.
 
     indices : bool, int, str, slice, array-like
@@ -270,40 +272,144 @@ def _safe_assign(X, values, *, row_indexer=None, column_indexer=None):
-    """Safe assignment to a numpy array, sparse matrix, or pandas dataframe.
+    """Safe assignment to a 2-dimensional `X` by indices.
+
+    This returns the updated `X` after assigning `values`. However, there is no
+    guarantee whether `X` will be modified in-place.
 
     Parameters
     ----------
-    X : {ndarray, sparse-matrix, dataframe}
+    X : ndarray, sparse matrix, dataframe
         Array to be modified. It is expected to be 2-dimensional.
 
-    values : ndarray
+    values : scalar or array-like
         The values to be assigned to `X`.
 
-    row_indexer : array-like, dtype={int, bool}, default=None
+        - If both `row_indexer` and `column_indexer` are scalars, it should be scalar.
+        - If one of `row_indexer` or `column_indexer` is scalar, it should be
+          1-dimensional with the corresponding shape or scalar that will be broadcasted.
+        - Otherwise, it should be 2-dimensional with the corresponding shape or scalar
+          that will be broadcasted.
+
+    row_indexer : scalar or array-like, dtype={int, bool}, default=None
         A 1-dimensional array to select the rows of interest. If `None`, all
         rows are selected.
 
-    column_indexer : array-like, dtype={int, bool}, default=None
+    column_indexer : scalar or array-like, dtype={int, bool}, default=None
         A 1-dimensional array to select the columns of interest. If `None`, all
         columns are selected.
     """
-    row_indexer = slice(None, None, None) if row_indexer is None else row_indexer
+    if len(X.shape) != 2:
+        raise ValueError(
+            f"Only 2D containers are supported; got shape {X.shape} instead."
+        )
+
+    if _is_pandas_df(X):
+        return _safe_assign_pandas(X, values, row_indexer, column_indexer)
+    elif _is_polars_df(X):
+        return _safe_assign_polars(X, values, row_indexer, column_indexer)
+    else:  # numpy array or sparse container
+        return _safe_assign_array(X, values, row_indexer, column_indexer)
+
+
+def _safe_assign_array(X, values, row_indexer, column_indexer):
+    """Helper for `_safe_assign` when `X` is numpy array or sparse container."""
+    if np.isscalar(column_indexer) and np.isscalar(row_indexer):
+        X[row_indexer, column_indexer] = values
+        return X
+
+    if np.isscalar(column_indexer):
+        column_indexer = [column_indexer]
+        if not np.isscalar(values):
+            values = np.asarray(values)[:, None]
+    elif np.isscalar(row_indexer):
+        row_indexer = [row_indexer]
+        if not np.isscalar(values):
+            values = np.asarray(values)[None, :]
+
+    if row_indexer is None and column_indexer is None:
+        X[:, :] = values
+    elif row_indexer is None:  # `column_indexer` is not None
+        X[:, column_indexer] = values
+    elif column_indexer is None:  # `row_indexer` is not None
+        X[row_indexer, :] = values
+    else:  # both `row_indexer` and `column_indexer` are not None
+        X[np.ix_(row_indexer, column_indexer)] = values
+    return X
+
+
+def _safe_assign_pandas(X, values, row_indexer, column_indexer):
+    """Helper for `_safe_assign` when `X` is pandas dataframe."""
+    row_indexer = slice(None) if row_indexer is None else row_indexer
+    column_indexer = slice(None) if column_indexer is None else column_indexer
+
+    # pandas may match index when using `iloc` to set values on certain platforms;
+    # we want to avoid this behavior and behave as if `values` is just an array
+    if _is_pandas_df_or_series(values):
+        values = values.values
+
+    with warnings.catch_warnings():
+        # pandas >= 1.5 raises a warning when using iloc to set values in a column
+        # that does not have the same type as the column being set. It happens
+        # for instance when setting a categorical column with a string.
+        # In the future the behavior won't change and the warning should disappear.
+        # TODO(1.3): check if the warning is still raised or remove the filter.
+        warnings.simplefilter("ignore", FutureWarning)
+        X.iloc[row_indexer, column_indexer] = values
+    return X
+
+
+def _safe_assign_polars(X, values, row_indexer, column_indexer):
+    """Helper for `_safe_assign` when `X` is polars dataframe."""
+    pl = sys.modules["polars"]
+    row_indices = np.arange(X.shape[0])[
+        slice(None) if row_indexer is None else row_indexer
+    ]
+
+    if np.isscalar(column_indexer):
+        if row_indexer is None and isinstance(values, pl.Series):
+            # Assigning a categorical series to a column of a dataframe does not
+            # work with the normal assignment method below; in fact with the current
+            # implementation it does not work except for the case of setting a whole
+            # column which is this special case
+            X.replace_column(column_indexer, values)
+        elif np.isscalar(row_indexer):
+            X[int(row_indices), column_indexer] = values
+        else:
+            X[row_indices, column_indexer] = values
+        return X
+
+    row_indices = np.atleast_1d(row_indices)
     column_indexer = (
-        slice(None, None, None) if column_indexer is None else column_indexer
+        slice(None) if column_indexer is None else np.atleast_1d(column_indexer)
     )
+    column_indices = np.asarray(X.columns)[column_indexer]
 
-    if hasattr(X, "iloc"):  # pandas dataframe
-        with warnings.catch_warnings():
-            # pandas >= 1.5 raises a warning when using iloc to set values in a column
-            # that does not have the same type as the column being set. It happens
-            # for instance when setting a categorical column with a string.
-            # In the future the behavior won't change and the warning should disappear.
-            # TODO(1.3): check if the warning is still raised or remove the filter.
-            warnings.simplefilter("ignore", FutureWarning)
-            X.iloc[row_indexer, column_indexer] = values
-    else:  # numpy array or sparse matrix
-        X[row_indexer, column_indexer] = values
+    if np.isscalar(values):
+
+        def get_expr(vals, col):
+            expr = pl
+            for row_ind in row_indices:
+                expr = expr.when(index=row_ind).then(values)
+            return expr.otherwise(pl.col(col)).alias(col)
+
+        expressions = (get_expr(None, col) for col in column_indices)
+    else:
+
+        def get_expr(vals, col):
+            expr = pl
+            # TODO: `strict=True` when minimum supported Python version is 3.10
+            for val, row_ind in zip(vals, row_indices):
+                expr = expr.when(index=row_ind).then(val)
+            return expr.otherwise(pl.col(col)).alias(col)
+
+        # TODO: `strict=True` when minimum supported Python version is 3.10
+        expressions = (
+            get_expr(vals, col)
+            for vals, col in zip(np.atleast_2d(values).T, column_indices)
+        )
+
+    return X.with_columns(X.with_row_index().select(expressions))
 
 
 def _get_column_indices_for_bool_or_int(key, n_columns):
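With this refactoring `_safe_assign` returns the updated container instead of promising
in-place mutation, because polars dataframes cannot be written through a generic indexer;
this is why the callers in `_iterative.py`, `_partial_dependence.py` and
`_permutation_importance.py` above now rebind the result. A small sketch of the new
contract (illustrative only; `_safe_assign` is a private helper):

    import numpy as np
    import polars as pl

    from sklearn.utils._indexing import _safe_assign

    X = np.zeros((3, 3))
    # numpy arrays are updated in place and returned
    X = _safe_assign(X, 1.0, row_indexer=[0, 2], column_indexer=1)

    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    # polars dataframes may be rebuilt rather than mutated, so rebinding
    # the result is required
    df = _safe_assign(df, 10, row_indexer=0, column_indexer=0)

For row-targeted polars updates, `_safe_assign_polars` builds `when/then/otherwise`
expressions keyed on a temporary row index. An equivalent standalone pattern, assuming a
recent polars where `when` accepts keyword constraints and `with_row_index` exists
(illustrative only):

    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    expr = (
        pl.when(index=0).then(10)
        .when(index=2).then(30)
        .otherwise(pl.col("a"))
        .alias("a")
    )
    # add the row index, rewrite column "a" row by row, then drop the index
    df = df.with_row_index().with_columns(expr).drop("index")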
diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py
index 42757dbb00fae..01d96446e97f9 100644
--- a/sklearn/utils/_set_output.py
+++ b/sklearn/utils/_set_output.py
@@ -106,6 +106,20 @@ def hstack(self, Xs):
             Stacked containers.
         """
 
+    def copy(self, X):
+        """Create a copy of the container.
+
+        Parameters
+        ----------
+        X : container
+            Container to copy.
+
+        Returns
+        -------
+        X_copy : container
+            Copy of the container.
+        """
+
 
 class PandasAdapter:
     container_lib = "pandas"
@@ -148,6 +162,9 @@ def hstack(self, Xs):
         pd = check_library_installed("pandas")
         return pd.concat(Xs, axis=1)
 
+    def copy(self, X):
+        return X.copy()
+
 
 class PolarsAdapter:
     container_lib = "polars"
@@ -179,6 +196,9 @@ def hstack(self, Xs):
         pl = check_library_installed("polars")
         return pl.concat(Xs, how="horizontal")
 
+    def copy(self, X):
+        return X.clone()
+
 
 class ContainerAdaptersManager:
     def __init__(self):
@@ -198,11 +218,7 @@ def register(self, adapter):
 
 
 def _get_adapter_from_container(container):
-    """Get the adapter that knows how to handle such container.
-
-    See :class:`sklearn.utils._set_output.ContainerAdapterProtocol` for more
-    details.
-    """
+    """Get the adapter that knows how to handle such container."""
     module_name = container.__class__.__module__.split(".")[0]
     try:
         return ADAPTERS_MANAGER.adapters[module_name]
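The new adapter `copy` hook gives callers a library-agnostic way to clone a container:
pandas exposes `DataFrame.copy` while polars exposes `DataFrame.clone`, and
`_calculate_permutation_scores` above uses the hook to copy `X` before shuffling. A
sketch of the dispatch (illustrative only; these are private utilities):

    import pandas as pd
    from sklearn.utils._set_output import _get_adapter_from_container

    X = pd.DataFrame({"a": [1, 2, 3]})
    adapter = _get_adapter_from_container(X)  # resolves to the pandas adapter
    X_copy = adapter.copy(X)                  # delegates to X.copy()
    assert X_copy is not X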
+ """ + class PandasAdapter: container_lib = "pandas" @@ -148,6 +162,9 @@ def hstack(self, Xs): pd = check_library_installed("pandas") return pd.concat(Xs, axis=1) + def copy(self, X): + return X.copy() + class PolarsAdapter: container_lib = "polars" @@ -179,6 +196,9 @@ def hstack(self, Xs): pl = check_library_installed("polars") return pl.concat(Xs, how="horizontal") + def copy(self, X): + return X.clone() + class ContainerAdaptersManager: def __init__(self): @@ -198,11 +218,7 @@ def register(self, adapter): def _get_adapter_from_container(container): - """Get the adapter that knows how to handle such container. - - See :class:`sklearn.utils._set_output.ContainerAdapterProtocol` for more - details. - """ + """Get the adapter that knows how to handle such container.""" module_name = container.__class__.__module__.split(".")[0] try: return ADAPTERS_MANAGER.adapters[module_name] diff --git a/sklearn/utils/tests/test_indexing.py b/sklearn/utils/tests/test_indexing.py index c2cdf24817cac..a8b43a76ccafb 100644 --- a/sklearn/utils/tests/test_indexing.py +++ b/sklearn/utils/tests/test_indexing.py @@ -376,38 +376,86 @@ def test_safe_indexing_list_axis_1_unsupported(indices): _safe_indexing(X, indices, axis=1) -@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"]) -def test_safe_assign(array_type): +@pytest.mark.parametrize("array_type", ["array", "sparse_csr", "pandas", "polars"]) +@pytest.mark.parametrize( + "column_indexer", [None, 2, [0, 2], [True, False, True, False, False]] +) +@pytest.mark.parametrize( + "row_indexer", [None, 2, [0, 2], [True, False, True, False, False]] +) +# Whether the value to assign is a scalar to be broadcasted or an array +@pytest.mark.parametrize("scalar_value", [True, False]) +def test_safe_assign(array_type, column_indexer, row_indexer, scalar_value): """Check that `_safe_assign` works as expected.""" rng = np.random.RandomState(0) - X_array = rng.randn(10, 5) - - row_indexer = [1, 2] - values = rng.randn(len(row_indexer), X_array.shape[1]) - X = _convert_container(X_array, array_type) - _safe_assign(X, values, row_indexer=row_indexer) - - assigned_portion = _safe_indexing(X, row_indexer, axis=0) - assert_allclose_dense_sparse( - assigned_portion, _convert_container(values, array_type) - ) - - column_indexer = [1, 2] - values = rng.randn(X_array.shape[0], len(column_indexer)) - X = _convert_container(X_array, array_type) - _safe_assign(X, values, column_indexer=column_indexer) - - assigned_portion = _safe_indexing(X, column_indexer, axis=1) - assert_allclose_dense_sparse( - assigned_portion, _convert_container(values, array_type) - ) + size = 5 + select_size = 2 # size after indexing if integer or boolean indexer + X_array = rng.randn(size, size) + + if np.isscalar(row_indexer) and np.isscalar(column_indexer): + values = rng.randn() + target_values = values + elif np.isscalar(row_indexer) or np.isscalar(column_indexer): + if np.isscalar(row_indexer): + target_size = size if column_indexer is None else select_size + else: + target_size = size if row_indexer is None else select_size + if scalar_value: + values = rng.randn() + target_values = np.full(target_size, values) + else: + values = rng.randn(target_size) + target_values = values + else: + target_shape = ( + size if row_indexer is None else select_size, + size if column_indexer is None else select_size, + ) + if scalar_value: + values = rng.randn() + target_values = np.full(target_shape, values) + else: + values = rng.randn(*target_shape) + target_values = values - row_indexer, column_indexer 
diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py
index 360b081a2a0fb..ece633fd1ba49 100644
--- a/sklearn/utils/tests/test_set_output.py
+++ b/sklearn/utils/tests/test_set_output.py
@@ -454,7 +454,7 @@ def patched_import_module(name):
 
 
 def test_get_adapter_from_container():
-    """Check the behavior fo `_get_adapter_from_container`."""
+    """Check the behavior of `_get_adapter_from_container`."""
     pd = pytest.importorskip("pandas")
     X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
     adapter = _get_adapter_from_container(X)