diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 87b0441bade5f..b90f159fb664a 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -438,6 +438,10 @@ Changelog
 - |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through
   missing values by default. :pr:`19069` by `Thomas Fan`_.
 
+- |Feature| Transformers in :mod:`sklearn.preprocessing` have an `array_out`
+  keyword argument in :term:`transform` that can be set to `'pandas'` to
+  output DataFrames. :pr:`20100` by `Thomas Fan`_.
+
 - |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler`
   and similar scalers detect near-constant features to avoid scaling them to
   very large values. This problem happens in particular when using a scaler on
diff --git a/sklearn/base.py b/sklearn/base.py
index e8b51df634a1f..88ba88cb610d8 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -23,6 +23,7 @@
 from .utils.validation import check_array
 from .utils.validation import _num_features
 from .utils._estimator_html_repr import estimator_html_repr
+from .utils._array_out import _get_feature_names
 
 
 def clone(estimator, *, safe=True):
@@ -376,6 +377,33 @@ def _check_n_features(self, X, reset):
                 f"X has {n_features} features, but {self.__class__.__name__} "
                 f"is expecting {self.n_features_in_} features as input.")
 
+    def _check_feature_names(self, X, reset=True):
+        """Set the `feature_names_in_` attribute, or check against it.
+
+        Parameters
+        ----------
+        X : array-like
+            The input samples.
+        reset : bool, default=True
+            If True, the `feature_names_in_` attribute is set to the
+            feature names of `X`.
+            Else, the attribute must already exist and the function checks
+            that it is equal to the feature names of `X`.
+        """
+        feature_names = _get_feature_names(X)
+        if reset:
+            self.feature_names_in_ = feature_names
+            return
+
+        if (not hasattr(self, 'feature_names_in_') or
+                self.feature_names_in_ is None or
+                feature_names is None):
+            return
+
+        if any(feature_names != self.feature_names_in_):
+            raise ValueError("The input's feature names do not match the "
+                             "feature_names_in_ attribute.")
+
     def _validate_data(self, X, y='no_validation', reset=True,
                        validate_separately=False, **check_params):
         """Validate input data and set or check the `n_features_in_` attribute.
@@ -418,6 +446,7 @@ def _validate_data(self, X, y='no_validation', reset=True,
         out : {ndarray, sparse matrix} or tuple of these
             The validated input. A tuple is returned if `y` is not None.
         """
+        self._check_feature_names(X, reset=reset)
 
         if y is None:
             if self._get_tags()['requires_y']:
@@ -678,7 +707,7 @@ def get_submatrix(self, i, data):
 class TransformerMixin:
     """Mixin class for all transformers in scikit-learn."""
 
-    def fit_transform(self, X, y=None, **fit_params):
+    def fit_transform(self, X, y=None, array_out="default", **fit_params):
        """
         Fit to data, then transform it.
 
@@ -694,6 +723,11 @@ def fit_transform(self, X, y=None, **fit_params):
             default=None
             Target values (None for unsupervised transformations).
 
+        array_out : {"default", "pandas"}, default="default"
+            Specify the output array type. If "pandas", a pandas DataFrame is
+            returned. If "default", an array-like without feature names is
+            returned.
+
         **fit_params : dict
             Additional fit parameters.
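(Reviewer note: a minimal sketch of the feature-name check added above, assuming pandas is installed; `StandardScaler` stands in for any transformer whose `transform` validates with `reset=False`.)

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

X = pd.DataFrame(np.arange(6.0).reshape(3, 2), columns=["a", "b"])
scaler = StandardScaler().fit(X)

# Fitting on a dataframe records the column names.
print(scaler.feature_names_in_)  # Index(['a', 'b'], dtype='object')

# Transforming a dataframe with reordered columns now raises, because
# the names no longer match `feature_names_in_`.
scaler.transform(X[["b", "a"]])  # ValueError
```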
@@ -706,10 +740,20 @@ def fit_transform(self, X, y=None, **fit_params): # method is possible for a given clustering algorithm if y is None: # fit method of arity 1 (unsupervised transformation) - return self.fit(X, **fit_params).transform(X) + fitted = self.fit(X, **fit_params) else: # fit method of arity 2 (supervised transformation) - return self.fit(X, y, **fit_params).transform(X) + fitted = self.fit(X, y, **fit_params) + + if array_out == "default": + return fitted.transform(X) + + # array_out != "default" + transform_params = inspect.signature(fitted.transform).parameters + if "array_out" not in transform_params: + raise ValueError("Transform does not support array_out") + + return fitted.transform(X, array_out=array_out) class DensityMixin: diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 393693fc87d2d..874321fa3ed70 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -29,6 +29,7 @@ from ..utils.validation import (check_is_fitted, check_random_state, _check_sample_weight, FLOAT_DTYPES) +from ..utils._array_out import _array_out_wrap from ._encoders import OneHotEncoder @@ -440,7 +441,8 @@ def partial_fit(self, X, y=None): self.data_range_ = data_range return self - def transform(self, X): + @_array_out_wrap("one_to_one") + def transform(self, X, array_out="default"): """Scale features of X according to feature_range. Parameters @@ -448,6 +450,11 @@ def transform(self, X): X : array-like of shape (n_samples, n_features) Input data that will be transformed. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- Xt : ndarray of shape (n_samples, n_features) @@ -884,7 +891,8 @@ def partial_fit(self, X, y=None, sample_weight=None): return self - def transform(self, X, copy=None): + @_array_out_wrap("one_to_one") + def transform(self, X, copy=None, array_out="default"): """Perform standardization by centering and scaling Parameters @@ -893,6 +901,10 @@ def transform(self, X, copy=None): The data used to scale along the features axis. copy : bool, default=None Copy the input X or not. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. Returns ------- @@ -1103,7 +1115,8 @@ def partial_fit(self, X, y=None): self.scale_ = _handle_zeros_in_scale(max_abs, copy=True) return self - def transform(self, X): + @_array_out_wrap("one_to_one") + def transform(self, X, array_out="default"): """Scale the data Parameters @@ -1111,6 +1124,11 @@ def transform(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The data that should be scaled. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) @@ -1403,7 +1421,8 @@ def fit(self, X, y=None): return self - def transform(self, X): + @_array_out_wrap("one_to_one") + def transform(self, X, array_out="default"): """Center and scale the data. Parameters @@ -1411,6 +1430,11 @@ def transform(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The data used to scale along the specified axis. 
+ array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) @@ -1755,7 +1779,8 @@ def fit(self, X, y=None): self._validate_data(X, accept_sparse='csr') return self - def transform(self, X, copy=None): + @_array_out_wrap("one_to_one") + def transform(self, X, copy=None, array_out="default"): """Scale each non zero row of X to unit norm Parameters @@ -1767,6 +1792,11 @@ def transform(self, X, copy=None): copy : bool, default=None Copy the input X or not. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) @@ -1909,7 +1939,8 @@ def fit(self, X, y=None): self._validate_data(X, accept_sparse='csr') return self - def transform(self, X, copy=None): + @_array_out_wrap("one_to_one") + def transform(self, X, copy=None, array_out="default"): """Binarize each element of X. Parameters @@ -1922,6 +1953,11 @@ def transform(self, X, copy=None): copy : bool Copy the input X or not. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) @@ -2033,7 +2069,8 @@ def fit(self, K, y=None): self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples return self - def transform(self, K, copy=True): + @_array_out_wrap("one_to_one") + def transform(self, K, copy=True, array_out="default"): """Center kernel matrix. Parameters @@ -2044,6 +2081,11 @@ def transform(self, K, copy=True): copy : bool, default=True Set to False to perform inplace computation. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- K_new : ndarray of shape (n_samples1, n_samples2) @@ -2500,7 +2542,8 @@ def _transform(self, X, inverse=False): return X - def transform(self, X): + @_array_out_wrap("one_to_one") + def transform(self, X, array_out="default"): """Feature-wise transformation of the data. Parameters @@ -2511,6 +2554,11 @@ def transform(self, X): ``csc_matrix``. Additionally, the sparse matrix needs to be nonnegative if `ignore_implicit_zeros` is False. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) @@ -2792,7 +2840,8 @@ def fit(self, X, y=None): self._fit(X, y=y, force_transform=False) return self - def fit_transform(self, X, y=None): + @_array_out_wrap("one_to_one") + def fit_transform(self, X, y=None, array_out="default"): return self._fit(X, y, force_transform=True) def _fit(self, X, y=None, force_transform=False): @@ -2825,7 +2874,8 @@ def _fit(self, X, y=None, force_transform=False): return X - def transform(self, X): + @_array_out_wrap("one_to_one") + def transform(self, X, array_out="default"): """Apply the power transform to each feature using the fitted lambdas. 
Parameters @@ -2833,6 +2883,11 @@ def transform(self, X): X : array-like of shape (n_samples, n_features) The data to be transformed using a power transformation. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- X_trans : ndarray of shape (n_samples, n_features) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d7565ff2fb4b3..2dfe8b2cdc686 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -15,6 +15,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import check_array from ..utils.validation import check_is_fitted +from ..utils._array_out import _array_out_wrap class KBinsDiscretizer(TransformerMixin, BaseEstimator): @@ -269,7 +270,8 @@ def _validate_n_bins(self, n_features): .format(KBinsDiscretizer.__name__, indices)) return n_bins - def transform(self, X): + @_array_out_wrap(lambda self: self._encoder.get_feature_names) + def transform(self, X, array_out="default"): """ Discretize the data. @@ -278,6 +280,11 @@ def transform(self, X): X : array-like of shape (n_samples, n_features) Data to be discretized. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64} diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 385b4ed83d3eb..d745c4b42022c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,6 +10,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, is_scalar_nan from ..utils.validation import check_is_fitted +from ..utils._array_out import _array_out_wrap from ..utils._mask import _get_mask from ..utils._encode import _encode, _check_unknown, _unique @@ -75,6 +76,7 @@ def _get_feature(self, X, feature_idx): return X[:, feature_idx] def _fit(self, X, handle_unknown='error', force_all_finite=True): + self._check_feature_names(X) X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite) @@ -112,6 +114,7 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True): def _transform(self, X, handle_unknown='error', force_all_finite=True, warn_on_unknown=False): + self._check_feature_names(X, reset=False) X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite) @@ -426,7 +429,7 @@ def fit(self, X, y=None): self.drop_idx_ = self._compute_drop_idx() return self - def fit_transform(self, X, y=None): + def fit_transform(self, X, y=None, array_out="default"): """ Fit OneHotEncoder to X, then transform X. @@ -441,6 +444,11 @@ def fit_transform(self, X, y=None): Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- X_out : {ndarray, sparse matrix} of shape \ @@ -449,9 +457,10 @@ def fit_transform(self, X, y=None): returned. 
""" self._validate_keywords() - return super().fit_transform(X, y) + return super().fit_transform(X, y, array_out=array_out) - def transform(self, X): + @_array_out_wrap(lambda self: self.get_feature_names) + def transform(self, X, array_out="default"): """ Transform X using one-hot encoding. @@ -460,6 +469,11 @@ def transform(self, X): X : array-like of shape (n_samples, n_features) The data to encode. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- X_out : {ndarray, sparse matrix} of shape \ @@ -815,7 +829,8 @@ def fit(self, X, y=None): return self - def transform(self, X): + @_array_out_wrap("one_to_one") + def transform(self, X, array_out="default"): """ Transform X to ordinal codes. @@ -824,6 +839,11 @@ def transform(self, X): X : array-like of shape (n_samples, n_features) The data to encode. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- X_out : ndarray of shape (n_samples, n_features) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 44ac0d2175c4c..2553075f4bc72 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -14,6 +14,7 @@ from ..utils import check_array from ..utils.fixes import linspace from ..utils.validation import check_is_fitted, FLOAT_DTYPES +from ..utils._array_out import _array_out_wrap from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -200,7 +201,8 @@ def fit(self, X, y=None): return self - def transform(self, X): + @_array_out_wrap(lambda self: self.get_feature_names) + def transform(self, X, array_out="default"): """Transform data to polynomial features. Parameters @@ -221,6 +223,11 @@ def transform(self, X): will be converted back to CSC prior to being returned, hence the preference of CSR. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. + Returns ------- XP : {ndarray, sparse matrix} of shape (n_samples, NP) @@ -663,7 +670,8 @@ def fit(self, X, y=None): self.n_features_out_ = n_out - n_features * (1 - self.include_bias) return self - def transform(self, X): + @_array_out_wrap(lambda self: self.get_feature_names) + def transform(self, X, array_out="default"): """Transform each feature data to B-splines. Parameters @@ -671,6 +679,11 @@ def transform(self, X): X : array-like of shape (n_samples, n_features) The data to transform. + array_out : {"default", "pandas"}, default="default" + Specify the output array type. If "pandas", a pandas DataFrame is + returned. If "default", an array-like without feature names is + returned. 
+
         Returns
         -------
         XBS : ndarray of shape (n_samples, n_features * n_splines)
diff --git a/sklearn/tests/test_array_out_transformers.py b/sklearn/tests/test_array_out_transformers.py
new file mode 100644
index 0000000000000..ddc58edec3130
--- /dev/null
+++ b/sklearn/tests/test_array_out_transformers.py
@@ -0,0 +1,147 @@
+import numpy as np
+import pytest
+from numpy.testing import assert_array_equal
+
+from sklearn.utils._testing import ignore_warnings
+from sklearn.base import clone
+from sklearn.utils import all_estimators
+from sklearn.utils._testing import set_random_state
+from sklearn.utils._testing import SkipTest
+from sklearn.utils._testing import raises
+from sklearn.utils.estimator_checks import _enforce_estimator_tags_x
+from sklearn.utils.estimator_checks import _enforce_estimator_tags_y
+from sklearn.utils.estimator_checks import _pairwise_estimator_convert_X
+from sklearn.utils.estimator_checks import _set_checking_parameters
+from sklearn.utils.estimator_checks import _construct_instance
+from sklearn.utils.estimator_checks import _get_check_estimator_ids
+from sklearn.utils._tags import _safe_tags
+
+
+def check_array_out_pandas(estimator_orig):
+    """Check that array_out controls the output of a transformer."""
+    try:
+        import pandas as pd
+    except ImportError:
+        raise SkipTest("pandas is not installed: not testing for "
+                       "array_out with pandas")
+
+    tags = _safe_tags(estimator_orig)
+
+    supported_tags = {"2darray", "categorical"}
+    X_types_set = set(tags["X_types"])
+    if not (supported_tags & X_types_set) or tags["no_validation"]:
+        return
+
+    estimator = clone(estimator_orig)
+    set_random_state(estimator)
+    _set_checking_parameters(estimator)
+
+    if "warm_start" in estimator.get_params():
+        estimator.set_params(warm_start=False)
+
+    n_samples, n_features = 150, 8
+
+    rng = np.random.RandomState(0)
+    X = rng.normal(size=(n_samples, n_features))
+    X = _enforce_estimator_tags_x(estimator, X)
+    X = _pairwise_estimator_convert_X(X, estimator)
+
+    if "categorical" in X_types_set:
+        # simple transformation for categorical data
+        X = (((X - X.min()) * 3) // 3).astype(np.int32)
+
+    y = None
+    if tags["requires_y"]:
+        y = rng.randint(low=0, high=2, size=n_samples)
+        y = _enforce_estimator_tags_y(estimator, y)
+
+    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
+    X_train_df = pd.DataFrame(X, columns=feature_names)
+
+    # Call `fit` on a dataframe
+    estimator.fit(X_train_df, y=y)
+    assert all(estimator.feature_names_in_ == feature_names)
+
+    has_transform = hasattr(estimator, "transform")
+    has_fit_transform = hasattr(estimator, "fit_transform")
+
+    test_index = [2*i for i in range(8)]
+    X_test_df = pd.DataFrame(X[rng.permutation(8), :],
+                             columns=feature_names, index=test_index)
+
+    if has_transform:
+        X_trans_df = estimator.transform(X_test_df, array_out='pandas')
+        assert isinstance(X_trans_df, pd.DataFrame)
+        assert_array_equal(X_trans_df.index, test_index)
+
+    if has_fit_transform:
+        X_trans_df = estimator.fit_transform(X_train_df, y=y,
+                                             array_out='pandas')
+        assert isinstance(X_trans_df, pd.DataFrame)
+        assert_array_equal(X_trans_df.index, X_train_df.index)
+
+    if has_transform:
+        # Check that `transform` fails on a dataframe with different names.
+        X_test_invalid_df = pd.DataFrame(X[rng.permutation(8), :],
+                                         columns=feature_names[::-1])
+        match = ("The input's feature names do not match the "
+                 "feature_names_in_ attribute.")
+        with raises(ValueError, match=match):
+            estimator.transform(X_test_invalid_df)
+
+    # Fit on ndarray (without feature names)
+    estimator.fit(X)
+
+    if has_transform:
+        X_trans_df = estimator.transform(X_test_df, array_out='pandas')
+        assert isinstance(X_trans_df, pd.DataFrame)
+        assert_array_equal(X_trans_df.index, test_index)
+
+    if has_fit_transform:
+        # Check that `fit_transform` also works
+        X_trans_df = estimator.fit_transform(X, y=y, array_out='pandas')
+        assert isinstance(X_trans_df, pd.DataFrame)
+        assert_array_equal(X_trans_df.index, range(150))
+
+
+ARRAY_OUT_TO_IGNORE = {
+    'cluster',
+    'compose',
+    'cross_decomposition',
+    'decomposition',
+    'discriminant_analysis',
+    'ensemble',
+    'feature_extraction',
+    'feature_selection',
+    'impute',
+    'isotonic',
+    'kernel_approximation',
+    'manifold',
+    'neighbors',
+    'neural_network',
+    'pipeline',
+    'random_projection',
+}
+
+
+def all_transformers_2d():
+    with ignore_warnings(category=UserWarning):
+        estimators = all_estimators(type_filter="transformer")
+
+    for name, Estimator in estimators:
+        module = Estimator.__module__.split(".")[1]
+        if module in ARRAY_OUT_TO_IGNORE:
+            continue
+
+        try:
+            estimator = _construct_instance(Estimator)
+        except SkipTest:
+            continue
+        yield name, estimator
+
+
+@ignore_warnings(category=UserWarning)
+@pytest.mark.parametrize("name, estimator", all_transformers_2d(),
+                         ids=_get_check_estimator_ids)
+def test_array_out_pandas(name, estimator):
+    check_array_out_pandas(estimator)
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
index cc10f11fcd574..10800bc68e073 100644
--- a/sklearn/tests/test_docstring_parameters.py
+++ b/sklearn/tests/test_docstring_parameters.py
@@ -269,7 +269,8 @@ def test_fit_docstring_attributes(name, Estimator):
     est.fit(X, y)
 
     skipped_attributes = {'x_scores_',  # For PLS, TODO remove in 1.1
-                          'y_scores_'}  # For PLS, TODO remove in 1.1
+                          'y_scores_',  # For PLS, TODO remove in 1.1
+                          'feature_names_in_'}  # Ignore for now
 
     module = est.__module__.split(".")[1]
     if module in N_FEATURES_MODULES_TO_IGNORE:
diff --git a/sklearn/utils/_array_out.py b/sklearn/utils/_array_out.py
new file mode 100644
index 0000000000000..74600263fed56
--- /dev/null
+++ b/sklearn/utils/_array_out.py
@@ -0,0 +1,122 @@
+from functools import wraps
+from functools import partial
+from inspect import signature
+
+import scipy.sparse as sp_sparse
+
+
+def _get_feature_names(X):
+    """Get feature names of a dataframe."""
+    if hasattr(X, "iloc"):  # duck-type dataframe
+        return getattr(X, "columns", None)
+
+
+def _get_index(X):
+    """Get the index of a dataframe."""
+    if hasattr(X, "iloc"):  # duck-type dataframe
+        return getattr(X, "index", None)
+
+
+def _check_get_feature_names_out(get_feature_names_out, estimator,
+                                 n_features_out):
+    """Check and convert get_feature_names_out into a callable."""
+    if callable(get_feature_names_out):
+        get_feature_names_out_callable = get_feature_names_out(estimator)
+    elif get_feature_names_out == 'one_to_one':
+        def get_feature_names_out_callable(names):
+            return names
+    else:
+        # get_feature_names_out == 'class_name'
+        class_name = estimator.__class__.__name__.lower()
+
+        def get_feature_names_out_callable():
+            return [f"{class_name}{i}" for i in range(n_features_out)]
+
+    # The returned callable takes zero or one argument. For one argument,
+    # it is passed the input feature names.
+    parameters = signature(get_feature_names_out_callable).parameters
+
+    if parameters:
+        feature_names_in = getattr(estimator, "feature_names_in_", None)
+        get_feature_names_out_callable = partial(
+            get_feature_names_out_callable, feature_names_in)
+    return get_feature_names_out_callable
+
+
+def _make_array_out(X_out, index, get_feature_names_out, *,
+                    array_out="default"):
+    """Construct an array container according to `array_out`.
+
+    Parameters
+    ----------
+    X_out : {ndarray, sparse matrix} of shape (n_samples, n_features_out)
+        Output data to be wrapped.
+
+    index : array-like of shape (n_samples,)
+        Index of output data.
+
+    get_feature_names_out : callable
+        Returns the feature names out. If the callable returns None, then
+        the feature names will be ["X0", "X1", ...].
+
+    array_out : {"default", "pandas"}, default="default"
+        Specify the output array type. If "pandas", a pandas DataFrame is
+        returned. If "default", an array-like without feature names is
+        returned.
+
+    Returns
+    -------
+    out : {ndarray, sparse matrix, dataframe} of shape \
+            (n_samples, n_features_out)
+        Wrapped array with feature names.
+    """
+    if array_out not in {'default', 'pandas'}:
+        raise ValueError("array_out must be 'default' or 'pandas'")
+
+    if array_out == "default":
+        return X_out
+
+    feature_names_out = get_feature_names_out()
+    if feature_names_out is None:
+        feature_names_out = [f'X{i}' for i in range(X_out.shape[1])]
+
+    # array_out == "pandas"
+    import pandas as pd
+    if sp_sparse.issparse(X_out):
+        make_dataframe = pd.DataFrame.sparse.from_spmatrix
+    else:
+        make_dataframe = pd.DataFrame
+
+    return make_dataframe(X_out, columns=feature_names_out, index=index)
+
+
+def _array_out_wrap(get_feature_names_out):
+    """Wrap a transform method to support `array_out`.
+
+    Parameters
+    ----------
+    get_feature_names_out : callable or {"one_to_one", "class_name"}
+        Called to get the feature names out. If `"one_to_one"`, then
+        `feature_names_in_` will be used as the feature names out. If
+        `"class_name"`, then the lower-cased class name will be used as the
+        prefix for the feature names out.
+    """
+    def _wrapper_transform(transform):
+
+        @wraps(transform)
+        def inner_transform(*args, **kwargs):
+            array_out = kwargs.get("array_out", "default")
+            X_out = transform(*args, **kwargs)
+
+            if array_out == "default":
+                return X_out
+
+            estimator, X_orig = args[0], args[1]
+            index = _get_index(X_orig)
+            get_features = _check_get_feature_names_out(
+                get_feature_names_out, estimator, X_out.shape[1])
+
+            return _make_array_out(X_out, index, get_features,
+                                   array_out=array_out)
+        return inner_transform
+    return _wrapper_transform
diff --git a/sklearn/utils/tests/test_array_out.py b/sklearn/utils/tests/test_array_out.py
new file mode 100644
index 0000000000000..d737fdd8213cf
--- /dev/null
+++ b/sklearn/utils/tests/test_array_out.py
@@ -0,0 +1,105 @@
+import numpy as np
+import pytest
+from scipy.sparse import csr_matrix
+from numpy.testing import assert_array_equal
+
+from sklearn.utils._array_out import _get_feature_names
+from sklearn.utils._array_out import _get_index
+from sklearn.utils._array_out import _make_array_out
+from sklearn.utils._testing import assert_allclose_dense_sparse
+
+
+@pytest.mark.parametrize("X", [
+    np.array([[1, 2, 3], [4, 5, 6]]),
+    [[1, 2, 3], [4, 5, 6]],
+    csr_matrix([[1, 0, 0], [0, 0, 1]])
+], ids=['ndarray', 'list', 'sparse'])
+def test_feature_names_no_names(X):
+    assert _get_feature_names(X) is None
+
+
+@pytest.mark.parametrize("X", [
+    np.array([[1, 2, 3], [4, 5, 6]]),
+    [[1, 2, 3], [4, 5, 6]],
+    csr_matrix([[1, 0, 0], [0, 0, 1]])
+], ids=['ndarray', 'list', 'sparse'])
+def test_get_index(X):
+    assert _get_index(X) is None
+
+
+@pytest.mark.parametrize("feature_names", [
+    ["feat_0", "feat_1", "feat_2"],
+    [1, 0, 2],
+])
+def test_feature_names_pandas(feature_names):
+    pd = pytest.importorskip("pandas")
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    X = pd.DataFrame(X, columns=feature_names)
+
+    names = _get_feature_names(X)
+    assert_array_equal(names, feature_names)
+
+
+@pytest.mark.parametrize("index", [
+    [0, 1], ["a", "b"]
+])
+def test_get_index_pandas(index):
+    pd = pytest.importorskip("pandas")
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    X = pd.DataFrame(X, index=index)
+
+    index_names = _get_index(X)
+    assert_array_equal(index_names, index)
+
+
+@pytest.mark.parametrize("X_out", [
+    np.array([[1, 2, 3], [2, 3, 4]]),
+    csr_matrix([[1, 0, 0], [0, 0, 1]])
+], ids=['ndarray', 'sparse'])
+def test_make_array_out_default(X_out):
+    out = _make_array_out(X_out, None, lambda: None, array_out="default")
+    assert out is X_out
+
+
+@pytest.mark.parametrize("X_out", [
+    np.array([[1, 2, 3], [2, 3, 4]]),
+    csr_matrix([[1, 0, 0], [0, 0, 1]])
+], ids=['ndarray', 'sparse'])
+def test_make_array_out_error(X_out):
+    msg = "array_out must be 'default' or 'pandas'"
+    with pytest.raises(ValueError, match=msg):
+        _make_array_out(X_out, None, lambda: None, array_out="bad")
+
+
+@pytest.mark.parametrize("is_sparse", [True, False])
+@pytest.mark.parametrize("out_features, expected_columns", [
+    (['feat_1', 'feat_2'], ["feat_1", "feat_2"]),
+    ([0, 1], [0, 1]),
+    (None, ["X0", "X1"]),
+])
+@pytest.mark.parametrize("index, expected_index", [
+    ([2, 3, 1], [2, 3, 1]),
+    (["a", "c", "d"], ["a", "c", "d"]),
+    (None, [0, 1, 2]),
+])
+def test_make_array_out_pandas(is_sparse, out_features, expected_columns,
+                               index, expected_index):
+    pd = pytest.importorskip("pandas")
+
+    X_out = np.array([[0, 1], [1, 0], [2, 3]])
+    if is_sparse:
+        X_out = csr_matrix(X_out)
+
+    df_out = _make_array_out(X_out, index, lambda: out_features,
+                             array_out="pandas")
+
+    assert isinstance(df_out,
pd.DataFrame) + assert_array_equal(df_out.index, expected_index) + assert_array_equal(df_out.columns, expected_columns) + + if is_sparse: + unwrapped = df_out.sparse.to_coo() + else: + unwrapped = df_out.to_numpy() + + assert_allclose_dense_sparse(X_out, unwrapped)
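(Reviewer note: the end-to-end behavior this patch aims for, as a sketch assuming pandas is installed; `MinMaxScaler` uses the `"one_to_one"` mode of `_array_out_wrap`.)

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame(np.random.RandomState(0).normal(size=(4, 2)),
                  columns=["height", "width"], index=[10, 11, 12, 13])

out = MinMaxScaler().fit_transform(df, array_out="pandas")
assert isinstance(out, pd.DataFrame)
assert list(out.columns) == ["height", "width"]  # one-to-one names
assert list(out.index) == [10, 11, 12, 13]       # input index preserved
```

When the estimator was fitted without feature names (e.g. on an ndarray), `_make_array_out` falls back to the positional names `["X0", "X1", ...]`.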
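(Reviewer note: for transformers whose output features are not one-to-one with the input, the decorator receives a callable; `OneHotEncoder` defers to `get_feature_names`, which is passed `feature_names_in_` when available. A sketch, assuming the current `get_feature_names` naming scheme and sorted category order.)

```python
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({"color": ["red", "blue", "red"]})
out = OneHotEncoder(sparse=False).fit_transform(df, array_out="pandas")
print(list(out.columns))  # ['color_blue', 'color_red']
```

With the default `sparse=True`, the sparse output would instead be wrapped via `pd.DataFrame.sparse.from_spmatrix`.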
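(Reviewer note: `TransformerMixin.fit_transform` only forwards `array_out` when the fitted estimator's `transform` accepts it, so modules not covered by this PR fail loudly instead of silently dropping the keyword. A sketch, assuming `Nystroem` still inherits the mixin's `fit_transform` and its `transform` has not gained `array_out`.)

```python
import numpy as np
from sklearn.kernel_approximation import Nystroem

X = np.random.RandomState(0).normal(size=(4, 2))
try:
    Nystroem(n_components=2).fit_transform(X, array_out="pandas")
except ValueError as exc:
    print(exc)  # Transform does not support array_out
```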