scikit-learn
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
Lines changed: 11 additions & 0 deletions
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
Lines changed: 6 additions & 0 deletions
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
Lines changed: 31 additions & 4 deletions
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
Lines changed: 119 additions & 18 deletions
@@ -482,6 +482,17 @@ scikit-learn estimators, as these expect continuous input, and would interpret
 the categories as being ordered, which is often not desired (i.e. the set of
 browsers was ordered arbitrarily).
 
+:class:`OrdinalEncoder` will also pass through missing values that are
+indicated by `np.nan`.
+
+    >>> enc = preprocessing.OrdinalEncoder()
+    >>> X = [['male'], ['female'], [np.nan], ['female']]
+    >>> enc.fit_transform(X)
+    array([[ 1.],
+           [ 0.],
+           [nan],
+           [ 0.]])
+
 Another possibility to convert categorical features to features that can be used
 with scikit-learn estimators is to use a one-of-K, also known as one-hot or
 dummy encoding.
 
@@ -123,6 +123,12 @@ Changelog
   not corresponding to their objective. :pr:`19172` by
   :user:`Mathurin Massias <mathurinm>`
 
+:mod:`sklearn.preprocessing`
+............................
+
+- |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through
+  missing values by default. :pr:`19069` by `Thomas Fan`_.
+
 - |API|: The parameter ``normalize`` of :class:`linear_model.LinearRegression`
   is deprecated and will be removed in 1.2.
   Motivation for this deprecation: ``normalize`` parameter did not take any
 
@@ -10,6 +10,7 @@
 from ..utils import check_array, is_scalar_nan
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _deprecate_positional_args
+from ..utils._mask import _get_mask
 
 from ..utils._encode import _encode, _check_unknown, _unique
 
@@ -752,7 +753,7 @@ def fit(self, X, y=None):
                 if np.dtype(self.dtype).kind != 'f':
                     raise ValueError(
                         f"When unknown_value is np.nan, the dtype "
-                        "parameter should be "
+                        f"parameter should be "
                         f"a float dtype. Got {self.dtype}."
                     )
             elif not isinstance(self.unknown_value, numbers.Integral):
@@ -765,7 +766,7 @@ def fit(self, X, y=None):
                             f"handle_unknown is 'use_encoded_value', "
                             f"got {self.unknown_value}.")
 
-        self._fit(X)
+        self._fit(X, force_all_finite='allow-nan')
 
         if self.handle_unknown == 'use_encoded_value':
             for feature_cats in self.categories_:
@@ -775,6 +776,21 @@ def fit(self, X, y=None):
                                      f"values already used for encoding the "
                                      f"seen categories.")
 
+        # stores the missing indices per category
+        self._missing_indices = {}
+        for cat_idx, categories_for_idx in enumerate(self.categories_):
+            for i, cat in enumerate(categories_for_idx):
+                if is_scalar_nan(cat):
+                    self._missing_indices[cat_idx] = i
+                    continue
+
+        if np.dtype(self.dtype).kind != 'f' and self._missing_indices:
+            raise ValueError(
+                "There are missing values in features "
+                f"{list(self._missing_indices)}. For OrdinalEncoder to "
+                "passthrough missing values, the dtype parameter must be a "
+                "float")
+
         return self
 
     def transform(self, X):
@@ -791,9 +807,14 @@ def transform(self, X):
         X_out : sparse matrix or a 2-d array
             Transformed input.
         """
-        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
+        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown,
+                                        force_all_finite='allow-nan')
         X_trans = X_int.astype(self.dtype, copy=False)
 
+        for cat_idx, missing_idx in self._missing_indices.items():
+            X_missing_mask = X_int[:, cat_idx] == missing_idx
+            X_trans[X_missing_mask, cat_idx] = np.nan
+
         # create separate category for unknown values
         if self.handle_unknown == 'use_encoded_value':
             X_trans[~X_mask] = self.unknown_value
@@ -814,7 +835,7 @@ def inverse_transform(self, X):
             Inverse transformed array.
         """
         check_is_fitted(self)
-        X = check_array(X, accept_sparse='csr')
+        X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan')
 
         n_samples, _ = X.shape
         n_features = len(self.categories_)
@@ -833,6 +854,12 @@ def inverse_transform(self, X):
 
         for i in range(n_features):
             labels = X[:, i].astype('int64', copy=False)
+
+            # replace values of X[:, i] that were nan with actual indices
+            if i in self._missing_indices:
+                X_i_mask = _get_mask(X[:, i], np.nan)
+                labels[X_i_mask] = self._missing_indices[i]
+
             if self.handle_unknown == 'use_encoded_value':
                 unknown_labels = labels == self.unknown_value
                 X_tr[:, i] = self.categories_[i][np.where(
 
@@ -574,24 +574,6 @@ def test_ordinal_encoder_inverse():
         enc.inverse_transform(X_tr)
 
 
-@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
-                               np.array([['a', np.nan]], dtype=object).T],
-                         ids=['numeric', 'object'])
-def test_ordinal_encoder_raise_missing(X):
-    ohe = OrdinalEncoder()
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit(X)
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit_transform(X)
-
-    ohe.fit(X[:1, :])
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.transform(X)
-
-
 def test_ordinal_encoder_handle_unknowns_string():
     enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2)
     X_fit = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object)
@@ -930,3 +912,122 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type):
     assert len(ohe.categories_) == 1
     assert_array_equal(ohe.categories_[0][:-1], ['a', 'b', 'c'])
     assert np.isnan(ohe.categories_[0][-1])
+
+
+def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
+    """Test ordinal encoder with nan passthrough fails when dtype=np.int32."""
+
+    X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T  # one feature holding a nan
+    oe = OrdinalEncoder(dtype=np.int32)
+
+    msg = (r"There are missing values in features \[0\]. For OrdinalEncoder "
+           "to passthrough missing values, the dtype parameter must be a "
+           "float")
+    with pytest.raises(ValueError, match=msg):
+        oe.fit(X)
+
+
+def test_ordinal_encoder_passthrough_missing_values_float():
+    """Test nan passthrough and round trip for a float feature."""
+
+    X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
+    oe = OrdinalEncoder().fit(X)
+
+    assert len(oe.categories_) == 1
+    assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan])  # nan comes last
+
+    X_trans = oe.transform(X)
+    assert_allclose(X_trans, [[np.nan], [1.0], [0.0], [1.0]])
+
+    X_inverse = oe.inverse_transform(X_trans)
+    assert_allclose(X_inverse, X)
+
+
+@pytest.mark.parametrize('pd_nan_type', ['pd.NA', 'np.nan'])
+def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
+    """Check nan passthrough for pandas categoricals with missing values."""
+    # checks pandas dataframe with categorical features
+    if pd_nan_type == 'pd.NA':
+        # pd.NA was added in pandas 1.0
+        pd = pytest.importorskip('pandas', minversion="1.0")
+        pd_missing_value = pd.NA
+    else:  # np.nan
+        pd = pytest.importorskip('pandas')
+        pd_missing_value = np.nan
+
+    df = pd.DataFrame({
+        'col1': pd.Series(['c', 'a', pd_missing_value, 'b', 'a'],
+                          dtype='category'),
+    })
+
+    oe = OrdinalEncoder().fit(df)
+    assert len(oe.categories_) == 1
+    assert_array_equal(oe.categories_[0][:3], ['a', 'b', 'c'])
+    assert np.isnan(oe.categories_[0][-1])
+
+    df_trans = oe.transform(df)
+
+    assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]])
+
+    X_inverse = oe.inverse_transform(df_trans)
+    assert X_inverse.shape == (5, 1)
+    assert_array_equal(X_inverse[:2, 0], ['c', 'a'])
+    assert_array_equal(X_inverse[3:, 0], ['b', 'a'])
+    assert np.isnan(X_inverse[2, 0])
+
+
+@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
+    ((np.array([['a', np.nan]], dtype=object).T,
+      np.array([['a', 'b']], dtype=object).T,
+     [np.array(['a', np.nan, 'd'], dtype=object)], np.object_)),
+    ((np.array([['a', np.nan]], dtype=object).T,  # NOTE: same as above
+      np.array([['a', 'b']], dtype=object).T,
+     [np.array(['a', np.nan, 'd'], dtype=object)], np.object_)),
+    ((np.array([[2.0, np.nan]], dtype=np.float64).T,
+      np.array([[3.0]], dtype=np.float64).T,
+     [np.array([2.0, 4.0, np.nan])], np.float64)),
+    ], ids=['object-None-missing-value', 'object-nan-missing_value',
+            'numeric-missing-value'])
+def test_ordinal_encoder_specified_categories_missing_passthrough(
+        X, X2, cats, cat_dtype):
+    """Test nan passthrough with manually specified categories."""
+    oe = OrdinalEncoder(categories=cats)
+    exp = np.array([[0.], [np.nan]])
+    assert_array_equal(oe.fit_transform(X), exp)
+    # manually specified categories should have same dtype as
+    # the data when coerced from lists
+    assert oe.categories_[0].dtype == cat_dtype
+
+    # when specifying categories manually, unknown categories should already
+    # raise when fitting
+    oe = OrdinalEncoder(categories=cats)
+    with pytest.raises(ValueError, match="Found unknown categories"):
+        oe.fit(X2)
+
+
+@pytest.mark.parametrize("X, expected_X_trans, X_test", [
+    (np.array([[1.0, np.nan, 3.0]]).T,
+     np.array([[0.0, np.nan, 1.0]]).T,
+     np.array([[4.0]])),  # value not seen during fit
+    (np.array([[1.0, 4.0, 3.0]]).T,
+     np.array([[0.0, 2.0, 1.0]]).T,
+     np.array([[np.nan]])),  # nan not seen during fit
+    (np.array([['c', np.nan, 'b']], dtype=object).T,
+     np.array([[1.0, np.nan, 0.0]]).T,
+     np.array([['d']], dtype=object)),  # value not seen during fit
+    (np.array([['c', 'a', 'b']], dtype=object).T,
+     np.array([[2.0, 0.0, 1.0]]).T,
+     np.array([[np.nan]], dtype=object)),  # nan not seen during fit
+])
+def test_ordinal_encoder_handle_missing_and_unknown(
+        X, expected_X_trans, X_test
+):
+    """Test the interaction between missing values and handle_unknown."""
+
+    oe = OrdinalEncoder(handle_unknown="use_encoded_value",
+                        unknown_value=-1)
+
+    X_trans = oe.fit_transform(X)
+    assert_allclose(X_trans, expected_X_trans)
+
+    assert_allclose(oe.transform(X_test), [[-1.0]])