ENH Adds feature_names_out to preprocessing module (#21079) · glemaitre/scikit-learn@2f1b9bb · GitHub

Commit 2f1b9bb

thomasjpfan, ogrisel, zhaofeng-shu33, nikJ13, and lesteve authored and committed.

ENH Adds feature_names_out to preprocessing module (scikit-learn#21079)

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: 赵丰 (Zhao Feng) <616545598@qq.com>
Co-authored-by: Niket Jain <51831161+nikJ13@users.noreply.github.com>
Co-authored-by: Loïc Estève <loic.esteve@ymail.com>

1 parent 60a4274 · commit 2f1b9bb

File tree

7 files changed: +61, -10 lines

- doc/whats_new/v1.1.rst
- sklearn/preprocessing/_data.py
- sklearn/preprocessing/_encoders.py
- sklearn/preprocessing/tests/test_data.py
- sklearn/preprocessing/tests/test_encoders.py
- sklearn/tests/test_common.py
- sklearn/utils/validation.py

doc/whats_new/v1.1.rst

Lines changed: 6 additions & 0 deletions

@@ -559,6 +559,12 @@ Changelog
   `fit` instead of `__init__`.
   :pr:`21434` by :user:`Krum Arnaudov <krumeto>`.
 
+- |API| Adds :meth:`get_feature_names_out` to
+  :class:`preprocessing.Normalizer`,
+  :class:`preprocessing.KernelCenterer`,
+  :class:`preprocessing.OrdinalEncoder`, and
+  :class:`preprocessing.Binarizer`. :pr:`21079` by `Thomas Fan`_.
+
 :mod:`sklearn.random_projection`
 ................................
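
For context, a minimal sketch of what this changelog entry enables; illustrative only, assuming a scikit-learn build that includes this commit:

    # Illustrative sketch (not part of the commit): Normalizer and
    # Binarizer now expose get_feature_names_out.
    import numpy as np
    from sklearn.preprocessing import Normalizer

    X = np.array([[4.0, 1.0, 2.0], [1.0, 3.0, 9.0]])
    normalizer = Normalizer().fit(X)

    # Without input feature names, generic names x0, x1, ... are generated.
    print(normalizer.get_feature_names_out())  # ['x0' 'x1' 'x2']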

sklearn/preprocessing/_data.py

Lines changed: 18 additions & 4 deletions

@@ -16,7 +16,12 @@
 from scipy import optimize
 from scipy.special import boxcox
 
-from ..base import BaseEstimator, TransformerMixin, _OneToOneFeatureMixin
+from ..base import (
+    BaseEstimator,
+    TransformerMixin,
+    _OneToOneFeatureMixin,
+    _ClassNamePrefixFeaturesOutMixin,
+)
 from ..utils import check_array
 from ..utils.deprecation import deprecated
 from ..utils.extmath import _incremental_mean_and_var, row_norms
@@ -1825,7 +1830,7 @@ def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False):
     return X
 
 
-class Normalizer(TransformerMixin, BaseEstimator):
+class Normalizer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
     """Normalize samples individually to unit norm.
 
     Each sample (i.e. each row of the data matrix) with at least one
@@ -1996,7 +2001,7 @@ def binarize(X, *, threshold=0.0, copy=True):
     return X
 
 
-class Binarizer(TransformerMixin, BaseEstimator):
+class Binarizer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
     """Binarize data (set feature values to 0 or 1) according to a threshold.
 
     Values greater than the threshold map to 1, while values less than
@@ -2119,7 +2124,7 @@ def _more_tags(self):
         return {"stateless": True}
 
 
-class KernelCenterer(TransformerMixin, BaseEstimator):
+class KernelCenterer(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
     r"""Center an arbitrary kernel matrix :math:`K`.
 
     Let define a kernel :math:`K` such that:
@@ -2258,6 +2263,15 @@ def transform(self, K, copy=True):
 
         return K
 
+    @property
+    def _n_features_out(self):
+        """Number of transformed output features."""
+        # Used by _ClassNamePrefixFeaturesOutMixin. This model preserves the
+        # number of input features but this is not a one-to-one mapping in the
+        # usual sense. Hence the choice not to use _OneToOneFeatureMixin to
+        # implement get_feature_names_out for this class.
+        return self.n_features_in_
+
     def _more_tags(self):
         return {"pairwise": True}
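
Review note: the choice of _ClassNamePrefixFeaturesOutMixin over _OneToOneFeatureMixin for KernelCenterer is visible in the generated names, which index kernel columns rather than echoing input feature names. A small sketch, assuming this commit is applied:

    # Sketch of the class-name-prefixed output names for KernelCenterer.
    import numpy as np
    from sklearn.metrics.pairwise import linear_kernel
    from sklearn.preprocessing import KernelCenterer

    rng = np.random.RandomState(0)
    K = linear_kernel(rng.random_sample((5, 3)))  # 5 x 5 kernel matrix

    centerer = KernelCenterer().fit(K)
    # One name per kernel column: 'kernelcenterer0', ..., 'kernelcenterer4'.
    print(centerer.get_feature_names_out())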

sklearn/preprocessing/_encoders.py

Lines changed: 2 additions & 2 deletions

@@ -7,7 +7,7 @@
 from scipy import sparse
 import numbers
 
-from ..base import BaseEstimator, TransformerMixin
+from ..base import BaseEstimator, TransformerMixin, _OneToOneFeatureMixin
 from ..utils import check_array, is_scalar_nan
 from ..utils.deprecation import deprecated
 from ..utils.validation import check_is_fitted
@@ -731,7 +731,7 @@ def get_feature_names_out(self, input_features=None):
         return np.asarray(feature_names, dtype=object)
 
 
-class OrdinalEncoder(_BaseEncoder):
+class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder):
     """
     Encode categorical features as an integer array.
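
With _OneToOneFeatureMixin in its MRO, OrdinalEncoder now passes input column names straight through to get_feature_names_out. A quick sketch, assuming pandas is installed:

    # Sketch: OrdinalEncoder output names mirror the input columns one-to-one.
    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder

    X = pd.DataFrame({"city": ["London", "Paris"], "size": ["S", "L"]})
    enc = OrdinalEncoder().fit(X)

    print(enc.get_feature_names_out())  # ['city' 'size']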

sklearn/preprocessing/tests/test_data.py

Lines changed: 16 additions & 0 deletions

@@ -45,6 +45,7 @@
 from sklearn.preprocessing import power_transform
 from sklearn.preprocessing._data import _handle_zeros_in_scale
 from sklearn.preprocessing._data import BOUNDS_THRESHOLD
+from sklearn.metrics.pairwise import linear_kernel
 
 from sklearn.exceptions import NotFittedError
 
@@ -2672,6 +2673,8 @@ def test_one_to_one_features(Transformer):
         StandardScaler,
         QuantileTransformer,
         PowerTransformer,
+        Normalizer,
+        Binarizer,
     ],
 )
 def test_one_to_one_features_pandas(Transformer):
@@ -2691,3 +2694,16 @@ def test_one_to_one_features_pandas(Transformer):
     with pytest.raises(ValueError, match=msg):
         invalid_names = list("abcd")
         tr.get_feature_names_out(invalid_names)
+
+
+def test_kernel_centerer_feature_names_out():
+    """Check `get_feature_names_out` for `KernelCenterer`."""
+
+    rng = np.random.RandomState(0)
+    X = rng.random_sample((6, 4))
+    X_pairwise = linear_kernel(X)
+    centerer = KernelCenterer().fit(X_pairwise)
+
+    names_out = centerer.get_feature_names_out()
+    samples_out2 = X_pairwise.shape[1]
+    assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])

sklearn/preprocessing/tests/test_encoders.py

Lines changed: 12 additions & 0 deletions

@@ -1387,3 +1387,15 @@ def test_ordinal_encoder_python_integer():
     assert_array_equal(encoder.categories_, np.sort(X, axis=0).T)
     X_trans = encoder.transform(X)
     assert_array_equal(X_trans, [[0], [3], [2], [1]])
+
+
+def test_ordinal_encoder_features_names_out_pandas():
+    """Check that feature names out match the input column names."""
+    pd = pytest.importorskip("pandas")
+
+    names = ["b", "c", "a"]
+    X = pd.DataFrame([[1, 2, 3]], columns=names)
+    enc = OrdinalEncoder().fit(X)
+
+    feature_names_out = enc.get_feature_names_out()
+    assert_array_equal(names, feature_names_out)

sklearn/tests/test_common.py

Lines changed: 0 additions & 1 deletion

@@ -382,7 +382,6 @@ def test_pandas_column_name_consistency(estimator):
 GET_FEATURES_OUT_MODULES_TO_IGNORE = [
     "ensemble",
     "kernel_approximation",
-    "preprocessing",
 ]
sklearn/utils/validation.py

Lines changed: 7 additions & 3 deletions

@@ -1828,7 +1828,9 @@ def _get_feature_names(X):
 
 
 def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
-    """Get output feature names for transformation.
+    """Check `input_features` and generate names if needed.
+
+    Commonly used in :term:`get_feature_names_out`.
 
     Parameters
     ----------
@@ -1842,8 +1844,10 @@ def _check_feature_names_in(estimator, input_features=None, *, generate_names=Tr
         match `feature_names_in_` if `feature_names_in_` is defined.
 
     generate_names : bool, default=True
-        Wether to generate names when `input_features` is `None` and
-        `estimator.feature_names_in_` is not defined.
+        Whether to generate names when `input_features` is `None` and
+        `estimator.feature_names_in_` is not defined. This is useful for
+        transformers that validate `input_features` but do not require them in
+        :term:`get_feature_names_out`, e.g. `PCA`.
 
     Returns
     -------
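
The validation that _check_feature_names_in performs is observable through the public get_feature_names_out, as the test_data.py changes above exercise. A hedged sketch of both paths, assuming this commit and pandas:

    # Sketch of the input_features validation backing get_feature_names_out.
    import pandas as pd
    from sklearn.preprocessing import Binarizer

    X = pd.DataFrame([[0.2, 0.8]], columns=["a", "b"])
    tr = Binarizer().fit(X)

    print(tr.get_feature_names_out())  # ['a' 'b'], taken from feature_names_in_

    # Names that do not match feature_names_in_ are rejected.
    try:
        tr.get_feature_names_out(["x", "y"])
    except ValueError as exc:
        print(exc)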
