ENH Adds encoded_missing_value to OrdinalEncoder by thomasjpfan · Pull Request #21988 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content
30 changes: 28 additions & 2 deletions doc/modules/preprocessing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -537,8 +537,8 @@ scikit-learn estimators, as these expect continuous input, and would interpret
the categories as being ordered, which is often not desired (i.e. the set of
browsers was ordered arbitrarily).

:class:`OrdinalEncoder` will also passthrough missing values that are
indicated by `np.nan`.
By default, :class:`OrdinalEncoder` will also passthrough missing values that
are indicated by `np.nan`.

>>> enc = preprocessing.OrdinalEncoder()
>>> X = [['male'], ['female'], [np.nan], ['female']]
Expand All @@ -548,6 +548,32 @@ indicated by `np.nan`.
[nan],
[ 0.]])

:class:`OrdinalEncoder` provides a parameter `encoded_missing_value` to encode
the missing values without the need to create a pipeline that uses
:class:`~sklearn.impute.SimpleImputer`.

>>> enc = preprocessing.OrdinalEncoder(encoded_missing_value=-1)
>>> X = [['male'], ['female'], [np.nan], ['female']]
>>> enc.fit_transform(X)
array([[ 1.],
[ 0.],
[-1.],
[ 0.]])

The above processing is equivalent to the following pipeline::

>>> from sklearn.pipeline import Pipeline
>>> from sklearn.impute import SimpleImputer
>>> enc = Pipeline(steps=[
... ("encoder", preprocessing.OrdinalEncoder()),
... ("imputer", SimpleImputer(strategy="constant", fill_value=-1)),
... ])
>>> enc.fit_transform(X)
array([[ 1.],
[ 0.],
[-1.],
[ 0.]])

Another possibility to convert categorical features to features that can be used
with scikit-learn estimators is to use a one-of-K, also known as one-hot or
dummy encoding.
Expand Down
3 changes: 3 additions & 0 deletions doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,9 @@ Changelog
the model. The option is only available when `strategy` is set to `quantile`.
:pr:`21445` by :user:`Felipe Bidu <fbidu>` and :user:`Amanda Dsouza <amy12xx>`.

- |Enhancement| Adds `encoded_missing_value` to :class:`preprocessing.OrdinalEncoder`
to configure the encoded value for missing data. :pr:`21988` by `Thomas Fan`_.

- |Enhancement| Added the `get_feature_names_out` method and a new parameter
`feature_names_out` to :class:`preprocessing.FunctionTransformer`. You can set
`feature_names_out` to 'one-to-one' to use the input features names as the
Expand Down
68 changes: 59 additions & 9 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,12 @@ class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder):

.. versionadded:: 0.24

encoded_missing_value : int or np.nan, default=np.nan
Encoded value of missing categories. If set to `np.nan`, then the `dtype`
parameter must be a float dtype.

.. versionadded:: 1.1

Attributes
----------
categories_ : list of arrays
Expand Down Expand Up @@ -1203,6 +1209,23 @@ class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder):
>>> enc.inverse_transform([[1, 0], [0, 1]])
array([['Male', 1],
['Female', 2]], dtype=object)

By default, :class:`OrdinalEncoder` is lenient towards missing values by
propagating them.

>>> import numpy as np
>>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]]
>>> enc.fit_transform(X)
array([[ 1., 0.],
[ 0., 1.],
[ 0., nan]])

You can use the parameter `encoded_missing_value` to encode missing values.

>>> enc.set_params(encoded_missing_value=-1).fit_transform(X)
array([[ 1., 0.],
[ 0., 1.],
[ 0., -1.]])
"""

def __init__(
Expand All @@ -1212,11 +1235,13 @@ def __init__(
dtype=np.float64,
handle_unknown="error",
unknown_value=None,
encoded_missing_value=np.nan,
):
self.categories = categories
self.dtype = dtype
self.handle_unknown = handle_unknown
self.unknown_value = unknown_value
self.encoded_missing_value = encoded_missing_value

def fit(self, X, y=None):
"""
Expand Down Expand Up @@ -1286,13 +1311,38 @@ def fit(self, X, y=None):
self._missing_indices[cat_idx] = i
continue

if np.dtype(self.dtype).kind != "f" and self._missing_indices:
raise ValueError(
"There are missing values in features "
f"{list(self._missing_indices)}. For OrdinalEncoder to "
"passthrough missing values, the dtype parameter must be a "
"float"
)
if self._missing_indices:
if np.dtype(self.dtype).kind != "f" and is_scalar_nan(
self.encoded_missing_value
):
raise ValueError(
"There are missing values in features "
f"{list(self._missing_indices)}. For OrdinalEncoder to "
f"encode missing values with dtype: {self.dtype}, set "
"encoded_missing_value to a non-nan value, or "
"set dtype to a float"
)

if not is_scalar_nan(self.encoded_missing_value):
# Features are invalid when they contain a missing category
# and encoded_missing_value was already used to encode a
# known category
invalid_features = [
cat_idx
for cat_idx, categories_for_idx in enumerate(self.categories_)
if cat_idx in self._missing_indices
and 0 <= self.encoded_missing_value < len(categories_for_idx)
]

if invalid_features:
# Use feature names if they are available
if hasattr(self, "feature_names_in_"):
invalid_features = self.feature_names_in_[invalid_features]
raise ValueError(
f"encoded_missing_value ({self.encoded_missing_value}) "
"is already used to encode a known category in features: "
f"{invalid_features}"
)

return self

Expand All @@ -1317,7 +1367,7 @@ def transform(self, X):

for cat_idx, missing_idx in self._missing_indices.items():
X_missing_mask = X_int[:, cat_idx] == missing_idx
X_trans[X_missing_mask, cat_idx] = np.nan
X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value

# create separate category for unknown values
if self.handle_unknown == "use_encoded_value":
Expand Down Expand Up @@ -1362,7 +1412,7 @@ def inverse_transform(self, X):

# replace values of X[:, i] that were nan with actual indices
if i in self._missing_indices:
X_i_mask = _get_mask(X[:, i], np.nan)
X_i_mask = _get_mask(X[:, i], self.encoded_missing_value)
labels[X_i_mask] = self._missing_indices[i]

if self.handle_unknown == "use_encoded_value":
Expand Down
67 changes: 59 additions & 8 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -1664,31 +1664,35 @@ def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():

msg = (
r"There are missing values in features \[0\]. For OrdinalEncoder "
"to passthrough missing values, the dtype parameter must be a "
"float"
f"to encode missing values with dtype: {np.int32}"
)
with pytest.raises(ValueError, match=msg):
oe.fit(X)


def test_ordinal_encoder_passthrough_missing_values_float():
@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value):
"""Test ordinal encoder with nan on float dtypes."""

X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
oe = OrdinalEncoder().fit(X)
oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(X)

assert len(oe.categories_) == 1

assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan])

X_trans = oe.transform(X)
assert_allclose(X_trans, [[np.nan], [1.0], [0.0], [1.0]])
assert_allclose(X_trans, [[encoded_missing_value], [1.0], [0.0], [1.0]])

X_inverse = oe.inverse_transform(X_trans)
assert_allclose(X_inverse, X)


@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
def test_ordinal_encoder_missing_value_support_pandas_categorical(
pd_nan_type, encoded_missing_value
):
"""Check ordinal encoder is compatible with pandas."""
# checks pandas dataframe with categorical features
pd = pytest.importorskip("pandas")
Expand All @@ -1701,14 +1705,14 @@ def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
}
)

oe = OrdinalEncoder().fit(df)
oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(df)
assert len(oe.categories_) == 1
assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"])
assert np.isnan(oe.categories_[0][-1])

df_trans = oe.transform(df)

assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]])
assert_allclose(df_trans, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]])

X_inverse = oe.inverse_transform(df_trans)
assert X_inverse.shape == (5, 1)
Expand Down Expand Up @@ -1902,3 +1906,50 @@ def test_ordinal_encoder_features_names_out_pandas():

feature_names_out = enc.get_feature_names_out()
assert_array_equal(names, feature_names_out)


def test_ordinal_encoder_unknown_missing_interaction():
"""Check interactions between handle_unknown and missing value encoding.

The encoder is configured so that unknown categories are encoded as
``np.nan`` while missing values are encoded as ``-3``; the two encodings
must not be confused with each other.
"""

# Training data: two known categories and one missing value (np.nan).
X = np.array([["a"], ["b"], [np.nan]], dtype=object)

oe = OrdinalEncoder(
handle_unknown="use_encoded_value",
unknown_value=np.nan,
encoded_missing_value=-3,
).fit(X)

# Known categories get their ordinal codes; the missing value becomes -3.
X_trans = oe.transform(X)
assert_allclose(X_trans, [[0], [1], [-3]])

# "c" is unknown and is mapped to np.nan
# np.nan is a missing value and is set to -3
X_test = np.array([["c"], [np.nan]], dtype=object)
X_test_trans = oe.transform(X_test)
assert_allclose(X_test_trans, [[np.nan], [-3]])


@pytest.mark.parametrize("with_pandas", [True, False])
def test_ordinal_encoder_encoded_missing_value_error(with_pandas):
"""Check OrdinalEncoder errors when encoded_missing_value is used by
a known category.

Parametrized over ``with_pandas``: with a DataFrame input the error message
is expected to name the offending column ('pet'), otherwise its integer
index (1).
"""
X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object)

# The 0-th feature has no missing values so it is not included in the list of
# features
error_msg = (
r"encoded_missing_value \(1\) is already used to encode a known category "
r"in features: "
)

if with_pandas:
# Skip this parametrization when pandas is not installed.
pd = pytest.importorskip("pandas")
X = pd.DataFrame(X, columns=["letter", "pet"])
error_msg = error_msg + r"\['pet'\]"
else:
error_msg = error_msg + r"\[1\]"

# encoded_missing_value=1 is the value the expected error message reports
# as already being used to encode a known category.
oe = OrdinalEncoder(encoded_missing_value=1)

with pytest.raises(ValueError, match=error_msg):
oe.fit(X)
0