ENH Adds encoded_missing_value to OrdinalEncoder by thomasjpfan · Pull Request #21988 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH Adds encoded_missing_value to OrdinalEncoder #21988

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 23, 2022
30 changes: 28 additions & 2 deletions doc/modules/preprocessing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -537,8 +537,8 @@ scikit-learn estimators, as these expect continuous input, and would interpret
the categories as being ordered, which is often not desired (i.e. the set of
browsers was ordered arbitrarily).

:class:`OrdinalEncoder` will also passthrough missing values that are
indicated by `np.nan`.
By default, :class:`OrdinalEncoder` will also pass through missing values that
are indicated by `np.nan`.

>>> enc = preprocessing.OrdinalEncoder()
>>> X = [['male'], ['female'], [np.nan], ['female']]
Expand All @@ -548,6 +548,32 @@ indicated by `np.nan`.
[nan],
[ 0.]])

:class:`OrdinalEncoder` provides a parameter `encoded_missing_value` to encode
the missing values without the need to create a pipeline that uses
:class:`~sklearn.impute.SimpleImputer`.

>>> enc = preprocessing.OrdinalEncoder(encoded_missing_value=-1)
>>> X = [['male'], ['female'], [np.nan], ['female']]
>>> enc.fit_transform(X)
array([[ 1.],
[ 0.],
[-1.],
[ 0.]])

The above processing is equivalent to the following pipeline::

>>> from sklearn.pipeline import Pipeline
>>> from sklearn.impute import SimpleImputer
>>> enc = Pipeline(steps=[
... ("encoder", preprocessing.OrdinalEncoder()),
... ("imputer", SimpleImputer(strategy="constant", fill_value=-1)),
... ])
>>> enc.fit_transform(X)
array([[ 1.],
[ 0.],
[-1.],
[ 0.]])

Another possibility to convert categorical features to features that can be used
with scikit-learn estimators is to use a one-of-K, also known as one-hot or
dummy encoding.
Expand Down
3 changes: 3 additions & 0 deletions doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,9 @@ Changelog
the model. The option is only available when `strategy` is set to `quantile`.
:pr:`21445` by :user:`Felipe Bidu <fbidu>` and :user:`Amanda Dsouza <amy12xx>`.

- |Enhancement| Adds `encoded_missing_value` to :class:`preprocessing.OrdinalEncoder`
to configure the encoded value for missing data. :pr:`21988` by `Thomas Fan`_.

- |Enhancement| Added the `get_feature_names_out` method and a new parameter
`feature_names_out` to :class:`preprocessing.FunctionTransformer`. You can set
`feature_names_out` to 'one-to-one' to use the input features names as the
Expand Down
68 changes: 59 additions & 9 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,12 @@ class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder):

.. versionadded:: 0.24

encoded_missing_value : int or np.nan, default=np.nan
Encoded value of missing categories. If set to `np.nan`, then the `dtype`
parameter must be a float dtype.

.. versionadded:: 1.1

Attributes
----------
categories_ : list of arrays
Expand Down Expand Up @@ -1203,6 +1209,23 @@ class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder):
>>> enc.inverse_transform([[1, 0], [0, 1]])
array([['Male', 1],
['Female', 2]], dtype=object)

By default, :class:`OrdinalEncoder` is lenient towards missing values by
propagating them.

>>> import numpy as np
>>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]]
>>> enc.fit_transform(X)
array([[ 1., 0.],
[ 0., 1.],
[ 0., nan]])

You can use the parameter `encoded_missing_value` to encode missing values.

>>> enc.set_params(encoded_missing_value=-1).fit_transform(X)
array([[ 1., 0.],
[ 0., 1.],
[ 0., -1.]])
"""

def __init__(
Expand All @@ -1212,11 +1235,13 @@ def __init__(
dtype=np.float64,
handle_unknown="error",
unknown_value=None,
encoded_missing_value=np.nan,
):
self.categories = categories
self.dtype = dtype
self.handle_unknown = handle_unknown
self.unknown_value = unknown_value
self.encoded_missing_value = encoded_missing_value

def fit(self, X, y=None):
"""
Expand Down Expand Up @@ -1286,13 +1311,38 @@ def fit(self, X, y=None):
self._missing_indices[cat_idx] = i
continue

if np.dtype(self.dtype).kind != "f" and self._missing_indices:
raise ValueError(
"There are missing values in features "
f"{list(self._missing_indices)}. For OrdinalEncoder to "
"passthrough missing values, the dtype parameter must be a "
"float"
)
if self._missing_indices:
if np.dtype(self.dtype).kind != "f" and is_scalar_nan(
self.encoded_missing_value
):
raise ValueError(
"There are missing values in features "
f"{list(self._missing_indices)}. For OrdinalEncoder to "
f"encode missing values with dtype: {self.dtype}, set "
"encoded_missing_value to a non-nan value, or "
"set dtype to a float"
)

if not is_scalar_nan(self.encoded_missing_value):
# Features are invalid when they contain a missing category
# and encoded_missing_value was already used to encode a
# known category
invalid_features = [
cat_idx
for cat_idx, categories_for_idx in enumerate(self.categories_)
if cat_idx in self._missing_indices
and 0 <= self.encoded_missing_value < len(categories_for_idx)
]

if invalid_features:
# Use feature names if they are available
if hasattr(self, "feature_names_in_"):
invalid_features = self.feature_names_in_[invalid_features]
raise ValueError(
f"encoded_missing_value ({self.encoded_missing_value}) "
"is already used to encode a known category in features: "
f"{invalid_features}"
)

return self

Expand All @@ -1317,7 +1367,7 @@ def transform(self, X):

for cat_idx, missing_idx in self._missing_indices.items():
X_missing_mask = X_int[:, cat_idx] == missing_idx
X_trans[X_missing_mask, cat_idx] = np.nan
X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value

# create separate category for unknown values
if self.handle_unknown == "use_encoded_value":
Expand Down Expand Up @@ -1362,7 +1412,7 @@ def inverse_transform(self, X):

# replace values of X[:, i] that were encoded as missing with actual indices
if i in self._missing_indices:
X_i_mask = _get_mask(X[:, i], np.nan)
X_i_mask = _get_mask(X[:, i], self.encoded_missing_value)
labels[X_i_mask] = self._missing_indices[i]

if self.handle_unknown == "use_encoded_value":
Expand Down
67 changes: 59 additions & 8 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -1664,31 +1664,35 @@ def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():

msg = (
r"There are missing values in features \[0\]. For OrdinalEncoder "
"to passthrough missing values, the dtype parameter must be a "
"float"
f"to encode missing values with dtype: {np.int32}"
)
with pytest.raises(ValueError, match=msg):
oe.fit(X)


def test_ordinal_encoder_passthrough_missing_values_float():
@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value):
"""Test ordinal encoder with nan on float dtypes."""

X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
oe = OrdinalEncoder().fit(X)
oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(X)

assert len(oe.categories_) == 1

assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan])

X_trans = oe.transform(X)
assert_allclose(X_trans, [[np.nan], [1.0], [0.0], [1.0]])
assert_allclose(X_trans, [[encoded_missing_value], [1.0], [0.0], [1.0]])

X_inverse = oe.inverse_transform(X_trans)
assert_allclose(X_inverse, X)


@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
def test_ordinal_encoder_missing_value_support_pandas_categorical(
pd_nan_type, encoded_missing_value
):
"""Check ordinal encoder is compatible with pandas."""
# checks pandas dataframe with categorical features
pd = pytest.importorskip("pandas")
Expand All @@ -1701,14 +1705,14 @@ def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
}
)

oe = OrdinalEncoder().fit(df)
oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(df)
assert len(oe.categories_) == 1
assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"])
assert np.isnan(oe.categories_[0][-1])

df_trans = oe.transform(df)

assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]])
assert_allclose(df_trans, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]])

X_inverse = oe.inverse_transform(df_trans)
assert X_inverse.shape == (5, 1)
Expand Down Expand Up @@ -1902,3 +1906,50 @@ def test_ordinal_encoder_features_names_out_pandas():

feature_names_out = enc.get_feature_names_out()
assert_array_equal(names, feature_names_out)


def test_ordinal_encoder_unknown_missing_interaction():
    """Check that unknown and missing categories are encoded independently.

    The encoder is configured with ``handle_unknown="use_encoded_value"``,
    ``unknown_value=np.nan`` and ``encoded_missing_value=-3``.
    """
    X_train = np.array([["a"], ["b"], [np.nan]], dtype=object)

    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=np.nan,
        encoded_missing_value=-3,
    )
    encoder.fit(X_train)

    # On the training data the np.nan entry is expected to be encoded as -3.
    assert_allclose(encoder.transform(X_train), [[0], [1], [-3]])

    # "c" was not seen during fit, so it is expected to map to unknown_value
    # (np.nan); np.nan is a missing value and is expected to map to -3.
    X_new = np.array([["c"], [np.nan]], dtype=object)
    assert_allclose(encoder.transform(X_new), [[np.nan], [-3]])


@pytest.mark.parametrize("with_pandas", [True, False])
def test_ordinal_encoder_encoded_missing_value_error(with_pandas):
    """Check OrdinalEncoder errors when encoded_missing_value is already used
    by a known category."""
    data = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object)

    # Only the second feature contains a missing value, so it is the only one
    # expected in the error message: by column name for a DataFrame, by
    # positional index for a plain array.
    if with_pandas:
        pd = pytest.importorskip("pandas")
        data = pd.DataFrame(data, columns=["letter", "pet"])
        reported_features = r"\['pet'\]"
    else:
        reported_features = r"\[1\]"

    expected_pattern = (
        r"encoded_missing_value \(1\) is already used to encode a known category "
        r"in features: "
    ) + reported_features

    encoder = OrdinalEncoder(encoded_missing_value=1)

    with pytest.raises(ValueError, match=expected_pattern):
        encoder.fit(data)
0