ENH Adds feature_names_out to preprocessing module by thomasjpfan · Pull Request #21079 · scikit-learn/scikit-learn · GitHub
ENH Adds feature_names_out to preprocessing module #21079

Merged
Changes from all commits
28 commits
6bd9df5
ENH Adds feature_names_out to preprocessing module
thomasjpfan Sep 17, 2021
fcd7e70
DOC Adds whats new
thomasjpfan Sep 17, 2021
88ea6ad
DOC Adds whats new
thomasjpfan Sep 17, 2021
e87920b
Merge branch 'main' into feature_names_out_preprocessing
ogrisel Sep 24, 2021
a704c82
DOC remove redundant code in GPR example (#21133)
zhaofeng-shu33 Sep 24, 2021
7eba091
DOC Ensures that RandomizedSearchCV passes numpydoc validation (#21131)
nikJ13 Sep 24, 2021
321e682
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Sep 24, 2021
19e4ff1
CLN Address comments
thomasjpfan Sep 24, 2021
02193bc
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Oct 1, 2021
bcadc08
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Oct 15, 2021
ed04543
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Oct 19, 2021
82d19c7
DOC Fix grammar
thomasjpfan Oct 19, 2021
0e7c6db
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Oct 20, 2021
ff31d1b
CLN Address comments
thomasjpfan Oct 20, 2021
48a8aac
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Oct 20, 2021
01a32d3
REV Revert unneeded change
thomasjpfan Oct 20, 2021
ef93b22
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Oct 26, 2021
61cde09
CLN Merge conflict
thomasjpfan Oct 26, 2021
a5c7ef4
Merge branch 'main' into feature_names_out_preprocessing
ogrisel Nov 5, 2021
b9831d3
Merge branch 'main' into feature_names_out_preprocessing
ogrisel Nov 5, 2021
a3147d8
Merge branch 'main' into feature_names_out_preprocessing
ogrisel Dec 2, 2021
2cd55e9
Make KernelCenterer inherit from _ClassNamePrefixFeaturesOutMixin
ogrisel Dec 6, 2021
dbbe2d8
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Dec 6, 2021
3587c45
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Dec 6, 2021
35f32aa
Merge remote-tracking branch 'upstream/main' into feature_names_out_p…
thomasjpfan Jan 4, 2022
8778438
DOC Adjust merge
thomasjpfan Jan 4, 2022
87bde75
Merge branch 'main' of https://github.com/scikit-learn/scikit-learn i…
lesteve Feb 4, 2022
683f849
Merge branch 'main' of https://github.com/scikit-learn/scikit-learn i…
lesteve Feb 7, 2022
6 changes: 6 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -550,6 +550,12 @@ Changelog
`fit` instead of `__init__`.
:pr:`21434` by :user:`Krum Arnaudov <krumeto>`.

- |API| Adds :meth:`get_feature_names_out` to
:class:`preprocessing.Normalizer`,
:class:`preprocessing.KernelCenterer`,
:class:`preprocessing.OrdinalEncoder`, and
:class:`preprocessing.Binarizer`. :pr:`21079` by `Thomas Fan`_.

:mod:`sklearn.random_projection`
................................

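For context (not part of the diff), a minimal usage sketch of the new one-to-one behavior described in this changelog entry, assuming scikit-learn >= 1.1 and pandas are installed; the column names and data are made up for illustration:

# Sketch: the one-to-one preprocessing transformers now echo the fitted
# input column names from get_feature_names_out.
import pandas as pd
from sklearn.preprocessing import Binarizer, Normalizer

X = pd.DataFrame({"age": [20.0, 30.0], "income": [1.0, 2.0]})

print(Normalizer().fit(X).get_feature_names_out())              # ['age' 'income']
print(Binarizer(threshold=1.5).fit(X).get_feature_names_out())  # ['age' 'income']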
22 changes: 18 additions & 4 deletions sklearn/preprocessing/_data.py
@@ -16,7 +16,12 @@
from scipy import optimize
from scipy.special import boxcox

from ..base import BaseEstimator, TransformerMixin, _OneToOneFeatureMixin
from ..base import (
BaseEstimator,
TransformerMixin,
_OneToOneFeatureMixin,
_ClassNamePrefixFeaturesOutMixin,
)
from ..utils import check_array
from ..utils.deprecation import deprecated
from ..utils.extmath import _incremental_mean_and_var, row_norms
@@ -1825,7 +1830,7 @@ def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False):
return X


class Normalizer(TransformerMixin, BaseEstimator):
class Normalizer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
"""Normalize samples individually to unit norm.

Each sample (i.e. each row of the data matrix) with at least one
@@ -1996,7 +2001,7 @@ def binarize(X, *, threshold=0.0, copy=True):
return X


class Binarizer(TransformerMixin, BaseEstimator):
class Binarizer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
"""Binarize data (set feature values to 0 or 1) according to a threshold.

Values greater than the threshold map to 1, while values less than
@@ -2119,7 +2124,7 @@ def _more_tags(self):
return {"stateless": True}


class KernelCenterer(TransformerMixin, BaseEstimator):
class KernelCenterer(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
r"""Center an arbitrary kernel matrix :math:`K`.

Let us define a kernel :math:`K` such that:
@@ -2258,6 +2263,15 @@ def transform(self, K, copy=True):

return K

@property
def _n_features_out(self):
"""Number of transformed output features."""
# Used by _ClassNamePrefixFeaturesOutMixin. This model preserves the
# number of input features but this is not a one-to-one mapping in the
# usual sense. Hence the choice not to use _OneToOneFeatureMixin to
# implement get_feature_names_out for this class.
return self.n_features_in_

def _more_tags(self):
return {"pairwise": True}

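To illustrate the `_n_features_out` property above (a sketch, not part of the diff): `KernelCenterer` consumes an n_samples x n_samples kernel matrix, so `get_feature_names_out` generates class-name-prefixed names via `_ClassNamePrefixFeaturesOutMixin` instead of echoing input feature names. Assumes scikit-learn >= 1.1:

# Sketch: output names are "<lowercased class name><index>", one per kernel column.
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import KernelCenterer

rng = np.random.RandomState(0)
K = linear_kernel(rng.random_sample((5, 3)))  # 5 x 5 kernel matrix

centerer = KernelCenterer().fit(K)
print(centerer.get_feature_names_out())
# ['kernelcenterer0' 'kernelcenterer1' 'kernelcenterer2' 'kernelcenterer3' 'kernelcenterer4']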
4 changes: 2 additions & 2 deletions sklearn/preprocessing/_encoders.py
@@ -7,7 +7,7 @@
from scipy import sparse
import numbers

from ..base import BaseEstimator, TransformerMixin
from ..base import BaseEstimator, TransformerMixin, _OneToOneFeatureMixin
from ..utils import check_array, is_scalar_nan
from ..utils.deprecation import deprecated
from ..utils.validation import check_is_fitted
@@ -731,7 +731,7 @@ def get_feature_names_out(self, input_features=None):
return np.asarray(feature_names, dtype=object)


class OrdinalEncoder(_BaseEncoder):
class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder):
"""
Encode categorical features as an integer array.

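A short sketch (not part of the diff) of what inheriting from `_OneToOneFeatureMixin` buys `OrdinalEncoder`, assuming a pandas DataFrame input and scikit-learn >= 1.1; the column names are illustrative:

# Sketch: OrdinalEncoder now reports output names identical to its input columns.
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

X = pd.DataFrame({"city": ["Paris", "Tokyo"], "size": ["S", "L"]})
enc = OrdinalEncoder().fit(X)
print(enc.get_feature_names_out())  # ['city' 'size']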
16 changes: 16 additions & 0 deletions sklearn/preprocessing/tests/test_data.py
@@ -45,6 +45,7 @@
from sklearn.preprocessing import power_transform
from sklearn.preprocessing._data import _handle_zeros_in_scale
from sklearn.preprocessing._data import BOUNDS_THRESHOLD
from sklearn.metrics.pairwise import linear_kernel

from sklearn.exceptions import NotFittedError

@@ -2672,6 +2673,8 @@ def test_one_to_one_features(Transformer):
StandardScaler,
QuantileTransformer,
PowerTransformer,
Normalizer,
Binarizer,
],
)
def test_one_to_one_features_pandas(Transformer):
@@ -2691,3 +2694,16 @@ def test_one_to_one_features_pandas(Transformer):
with pytest.raises(ValueError, match=msg):
invalid_names = list("abcd")
tr.get_feature_names_out(invalid_names)


def test_kernel_centerer_feature_names_out():
"""Test that kernel centerer `feature_names_out`."""

rng = np.random.RandomState(0)
X = rng.random_sample((6, 4))
X_pairwise = linear_kernel(X)
centerer = KernelCenterer().fit(X_pairwise)

names_out = centerer.get_feature_names_out()
samples_out2 = X_pairwise.shape[1]
assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])
12 changes: 12 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
@@ -1387,3 +1387,15 @@ def test_ordinal_encoder_python_integer():
assert_array_equal(encoder.categories_, np.sort(X, axis=0).T)
X_trans = encoder.transform(X)
assert_array_equal(X_trans, [[0], [3], [2], [1]])


def test_ordinal_encoder_features_names_out_pandas():
"""Check feature names out is same as the input."""
pd = pytest.importorskip("pandas")

names = ["b", "c", "a"]
X = pd.DataFrame([[1, 2, 3]], columns=names)
enc = OrdinalEncoder().fit(X)

feature_names_out = enc.get_feature_names_out()
assert_array_equal(names, feature_names_out)
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
@@ -382,7 +382,6 @@ def test_pandas_column_name_consistency(estimator):
GET_FEATURES_OUT_MODULES_TO_IGNORE = [
"ensemble",
"kernel_approximation",
"preprocessing",
]


10 changes: 7 additions & 3 deletions sklearn/utils/validation.py
@@ -1828,7 +1828,9 @@ def _get_feature_names(X):


def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
"""Get output feature names for transformation.
"""Check `input_features` and generate names if needed.

Commonly used in :term:`get_feature_names_out`.

Parameters
----------
Expand All @@ -1842,8 +1844,10 @@ def _check_feature_names_in(estimator, input_features=None, *, generate_names=Tr
match `feature_names_in_` if `feature_names_in_` is defined.

generate_names : bool, default=True
Wether to generate names when `input_features` is `None` and
`estimator.feature_names_in_` is not defined.
Whether to generate names when `input_features` is `None` and
`estimator.feature_names_in_` is not defined. This is useful for transformers
that validate `input_features` but do not require them in
:term:`get_feature_names_out`, e.g. `PCA`.

Returns
-------
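To show how the updated helper is meant to be used (a hedged sketch: `MyTransformer` is a hypothetical identity transformer, and `_check_feature_names_in` is private API whose signature is taken from the diff above):

# Hypothetical transformer showing the typical call into _check_feature_names_in.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import _check_feature_names_in


class MyTransformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        # _validate_data records n_features_in_ and, for DataFrame inputs,
        # feature_names_in_.
        self._validate_data(X)
        return self

    def transform(self, X):
        # Identity transform, kept trivial for the sake of the sketch.
        return self._validate_data(X, reset=False)

    def get_feature_names_out(self, input_features=None):
        # Validates input_features against feature_names_in_ when available,
        # and falls back to generated names ("x0", "x1", ...) otherwise.
        return _check_feature_names_in(self, input_features)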