scikit-learn · jeremiedbb · Mar 23, 2022 · Dec 15, 2021 · Dec 15, 2021 · Dec 15, 2021
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -537,8 +537,8 @@ scikit-learn estimators, as these expect continuous input, and would interpret
 the categories as being ordered, which is often not desired (i.e. the set of
 browsers was ordered arbitrarily).
 
-:class:`OrdinalEncoder` will also passthrough missing values that are
-indicated by `np.nan`.
+By default, :class:`OrdinalEncoder` will also pass through missing values that
+are indicated by `np.nan`.
 
     >>> enc = preprocessing.OrdinalEncoder()
     >>> X = [['male'], ['female'], [np.nan], ['female']]
@@ -548,6 +548,32 @@ indicated by `np.nan`.
            [nan],
            [ 0.]])
 
+:class:`OrdinalEncoder` provides a parameter `encoded_missing_value` to encode
+the missing values without the need to create a pipeline that uses
+:class:`~sklearn.impute.SimpleImputer`.
+
+    >>> enc = preprocessing.OrdinalEncoder(encoded_missing_value=-1)
+    >>> X = [['male'], ['female'], [np.nan], ['female']]
+    >>> enc.fit_transform(X)
+    array([[ 1.],
+           [ 0.],
+           [-1.],
+           [ 0.]])
+
+The above processing is equivalent to the following pipeline::
+
+    >>> from sklearn.pipeline import Pipeline
+    >>> from sklearn.impute import SimpleImputer
+    >>> enc = Pipeline(steps=[
+    ...     ("encoder", preprocessing.OrdinalEncoder()),
+    ...     ("imputer", SimpleImputer(strategy="constant", fill_value=-1)),
+    ... ])
+    >>> enc.fit_transform(X)
+    array([[ 1.],
+           [ 0.],
+           [-1.],
+           [ 0.]])
+
 Another possibility to convert categorical features to features that can be used
 with scikit-learn estimators is to use a one-of-K, also known as one-hot or
 dummy encoding.

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
@@ -803,6 +803,9 @@ Changelog
   the model. The option is only available when `strategy` is set to `quantile`.
   :pr:`21445` by :user:`Felipe Bidu <fbidu>` and :user:`Amanda Dsouza <amy12xx>`.
 
+- |Enhancement| Adds `encoded_missing_value` to :class:`preprocessing.OrdinalEncoder`
+  to configure the encoded value for missing data. :pr:`21988` by `Thomas Fan`_.
+
 - |Enhancement| Added the `get_feature_names_out` method and a new parameter
   `feature_names_out` to :class:`preprocessing.FunctionTransformer`. You can set
   `feature_names_out` to 'one-to-one' to use the input features names as the

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
@@ -1160,6 +1160,12 @@ class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder):
 
         .. versionadded:: 0.24
 
+    encoded_missing_value : int or np.nan, default=np.nan
+        Encoded value of missing categories. If set to `np.nan`, then the `dtype`
+        parameter must be a float dtype.
+
+        .. versionadded:: 1.1
+
     Attributes
     ----------
     categories_ : list of arrays
@@ -1203,6 +1209,23 @@ class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder):
     >>> enc.inverse_transform([[1, 0], [0, 1]])
     array([['Male', 1],
            ['Female', 2]], dtype=object)
+
+    By default, :class:`OrdinalEncoder` is lenient towards missing values by
+    propagating them.
+
+    >>> import numpy as np
+    >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]]
+    >>> enc.fit_transform(X)
+    array([[ 1.,  0.],
+           [ 0.,  1.],
+           [ 0., nan]])
+
+    You can use the parameter `encoded_missing_value` to encode missing values.
+
+    >>> enc.set_params(encoded_missing_value=-1).fit_transform(X)
+    array([[ 1.,  0.],
+           [ 0.,  1.],
+           [ 0., -1.]])
     """
 
     def __init__(
@@ -1212,11 +1235,13 @@ def __init__(
         dtype=np.float64,
         handle_unknown="error",
         unknown_value=None,
+        encoded_missing_value=np.nan,
     ):
         self.categories = categories
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.unknown_value = unknown_value
+        self.encoded_missing_value = encoded_missing_value
 
     def fit(self, X, y=None):
         """
@@ -1286,13 +1311,38 @@ def fit(self, X, y=None):
                     self._missing_indices[cat_idx] = i
                     continue
 
-        if np.dtype(self.dtype).kind != "f" and self._missing_indices:
-            raise ValueError(
-                "There are missing values in features "
-                f"{list(self._missing_indices)}. For OrdinalEncoder to "
-                "passthrough missing values, the dtype parameter must be a "
-                "float"
-            )
+        if self._missing_indices:
+            if np.dtype(self.dtype).kind != "f" and is_scalar_nan(
+                self.encoded_missing_value
+            ):
+                raise ValueError(
+                    "There are missing values in features "
+                    f"{list(self._missing_indices)}. For OrdinalEncoder to "
+                    f"encode missing values with dtype: {self.dtype}, set "
+                    "encoded_missing_value to a non-nan value, or "
+                    "set dtype to a float"
+                )
+
+            if not is_scalar_nan(self.encoded_missing_value):
+                # Features are invalid when they contain a missing category
+                # and encoded_missing_value was already used to encode a
+                # known category
+                invalid_features = [
+                    cat_idx
+                    for cat_idx, categories_for_idx in enumerate(self.categories_)
+                    if cat_idx in self._missing_indices
+                    and 0 <= self.encoded_missing_value < len(categories_for_idx)
+                ]
+
+                if invalid_features:
+                    # Use feature names if they are available
+                    if hasattr(self, "feature_names_in_"):
+                        invalid_features = self.feature_names_in_[invalid_features]
+                    raise ValueError(
+                        f"encoded_missing_value ({self.encoded_missing_value}) "
+                        "is already used to encode a known category in features: "
+                        f"{invalid_features}"
+                    )
 
         return self
 
@@ -1317,7 +1367,7 @@ def transform(self, X):
 
         for cat_idx, missing_idx in self._missing_indices.items():
             X_missing_mask = X_int[:, cat_idx] == missing_idx
-            X_trans[X_missing_mask, cat_idx] = np.nan
+            X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value
 
         # create separate category for unknown values
         if self.handle_unknown == "use_encoded_value":
@@ -1362,7 +1412,7 @@ def inverse_transform(self, X):
 
             # replace values of X[:, i] that were nan with actual indices
             if i in self._missing_indices:
-                X_i_mask = _get_mask(X[:, i], np.nan)
+                X_i_mask = _get_mask(X[:, i], self.encoded_missing_value)
                 labels[X_i_mask] = self._missing_indices[i]
 
             if self.handle_unknown == "use_encoded_value":

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
@@ -1664,31 +1664,35 @@ def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
 
     msg = (
         r"There are missing values in features \[0\]. For OrdinalEncoder "
-        "to passthrough missing values, the dtype parameter must be a "
-        "float"
+        f"to encode missing values with dtype: {np.int32}"
     )
     with pytest.raises(ValueError, match=msg):
         oe.fit(X)
 
 
-def test_ordinal_encoder_passthrough_missing_values_float():
+@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
+def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value):
     """Test ordinal encoder with nan on float dtypes."""
 
     X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
-    oe = OrdinalEncoder().fit(X)
+    oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(X)
 
     assert len(oe.categories_) == 1
+
     assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan])
 
     X_trans = oe.transform(X)
-    assert_allclose(X_trans, [[np.nan], [1.0], [0.0], [1.0]])
+    assert_allclose(X_trans, [[encoded_missing_value], [1.0], [0.0], [1.0]])
 
     X_inverse = oe.inverse_transform(X_trans)
     assert_allclose(X_inverse, X)
 
 
 @pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
-def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
+@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
+def test_ordinal_encoder_missing_value_support_pandas_categorical(
+    pd_nan_type, encoded_missing_value
+):
     """Check ordinal encoder is compatible with pandas."""
     # checks pandas dataframe with categorical features
     pd = pytest.importorskip("pandas")
@@ -1701,14 +1705,14 @@ def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
         }
     )
 
-    oe = OrdinalEncoder().fit(df)
+    oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(df)
     assert len(oe.categories_) == 1
     assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"])
     assert np.isnan(oe.categories_[0][-1])
 
     df_trans = oe.transform(df)
 
-    assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]])
+    assert_allclose(df_trans, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]])
 
     X_inverse = oe.inverse_transform(df_trans)
     assert X_inverse.shape == (5, 1)
@@ -1902,3 +1906,50 @@ def test_ordinal_encoder_features_names_out_pandas():
 
     feature_names_out = enc.get_feature_names_out()
     assert_array_equal(names, feature_names_out)
+
+
+def test_ordinal_encoder_unknown_missing_interaction():
+    """Check interactions between handle_unknown and missing value encoding."""
+
+    X = np.array([["a"], ["b"], [np.nan]], dtype=object)
+
+    oe = OrdinalEncoder(
+        handle_unknown="use_encoded_value",
+        unknown_value=np.nan,
+        encoded_missing_value=-3,
+    ).fit(X)
+
+    X_trans = oe.transform(X)
+    assert_allclose(X_trans, [[0], [1], [-3]])
+
+    # "c" is unknown and is mapped to np.nan
+    # np.nan is a missing value and is encoded as -3
+    X_test = np.array([["c"], [np.nan]], dtype=object)
+    X_test_trans = oe.transform(X_test)
+    assert_allclose(X_test_trans, [[np.nan], [-3]])
+
+
+@pytest.mark.parametrize("with_pandas", [True, False])
+def test_ordinal_encoder_encoded_missing_value_error(with_pandas):
+    """Check OrdinalEncoder errors when encoded_missing_value is used by
+    a known category."""
+    X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object)
+
+    # The 0-th feature has no missing values so it is not included in the list of
+    # features
+    error_msg = (
+        r"encoded_missing_value \(1\) is already used to encode a known category "
+        r"in features: "
+    )
+
+    if with_pandas:
+        pd = pytest.importorskip("pandas")
+        X = pd.DataFrame(X, columns=["letter", "pet"])
+        error_msg = error_msg + r"\['pet'\]"
+    else:
+        error_msg = error_msg + r"\[1\]"
+
+    oe = OrdinalEncoder(encoded_missing_value=1)
+
+    with pytest.raises(ValueError, match=error_msg):
+        oe.fit(X)