scikit-learn
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
Lines changed: 27 additions & 19 deletions
diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py
Lines changed: 6 additions & 4 deletions
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
Lines changed: 22 additions & 21 deletions
@@ -2,18 +2,15 @@
 #          Joris Van den Bossche <jorisvandenbossche@gmail.com>
 # License: BSD 3 clause
 
-import warnings
-
 import numpy as np
 from scipy import sparse
 
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array
-from ..utils.validation import check_is_fitted
 from ..utils.fixes import _object_dtype_isnan
+from ..utils.validation import check_is_fitted
 from ._label import _encode, _encode_check_unknown
 
-
 __all__ = [
     'OneHotEncoder',
     'OrdinalEncoder'
@@ -93,19 +90,20 @@ def _fit(self, X, handle_unknown='error'):
                                          "supported for numerical categories")
                 if handle_unknown == 'error':
                     # NaNs don't count as categoreis during fit
-                    diff = _encode_check_unknown(Xi[~_object_dtype_isnan(Xi)], cats)
+                    diff = _encode_check_unknown(
+                        Xi[~_object_dtype_isnan(Xi)], cats)
                     if diff:
                         msg = ("Found unknown categories {0} in column {1}"
                                " during fit".format(diff, i))
                         raise ValueError(msg)
             self.categories_.append(cats)
 
     def _transform(self, X, handle_unknown='error', handle_missing=None):
-        X_list, n_samples, n_features = self._check_X(
-            X)
-        # from now on, either X is w.o. NaNs or w. NaNs yet handle_missing != None. 
-        # in the later case, since we'll handle NaNs separately, 
-        # NaNs don't count as unknown categories
+        X_list, n_samples, n_features = self._check_X(X)
+        # From here on, X either contains no NaNs, or it contains NaNs
+        # and handle_missing is not None.
+        # NaNs are handled separately so that they do not interfere with
+        # handle_unknown; they are not counted as unknown categories.
         X_int = np.zeros((n_samples, n_features), dtype=np.int)
         X_mask = np.ones((n_samples, n_features), dtype=np.bool)
 
@@ -137,7 +135,7 @@ def _transform(self, X, handle_unknown='error', handle_missing=None):
                         Xi = Xi.astype(self.categories_[i].dtype)
                     else:
                         Xi = Xi.copy()
-                    
+
                     if handle_missing == 'indicator':
                         valid_mask = na_valid_mask
                     Xi[~valid_mask] = self.categories_[i][0]
@@ -151,6 +149,11 @@ def _transform(self, X, handle_unknown='error', handle_missing=None):
                                  check_unknown=False)
             X_int[:, i] = encoded
 
+            if (self.handle_missing == 'indicator' and
+                    _object_dtype_isnan(Xi).sum() > 0):
+                self.categories_[i] = np.append(
+                    np.array(self.categories_[i], dtype=object), None)
+
         return X_int, X_mask
 
     def _more_tags(self):
@@ -230,7 +233,8 @@ class OneHotEncoder(_BaseEncoder):
         will be denoted as None.
 
     handle_missing : {'indicator', 'all-zero'}, default=None
-        Specify how to handle missing categorical features (NaN) in the training data 
+        Specify how to handle missing categorical features (NaN) in the
+        training data
 
         - None : Raise an error in the presence of NaN (the default).
         - 'indicator': Represent with a separate one-hot column.
@@ -310,7 +314,8 @@ class OneHotEncoder(_BaseEncoder):
     """
 
     def __init__(self, categories='auto', drop=None, sparse=True,
-                 dtype=np.float64, handle_unknown='error', handle_missing=None):
+                 dtype=np.float64, handle_unknown='error',
+                 handle_missing=None):
         self.categories = categories
         self.sparse = sparse
         self.dtype = dtype
@@ -340,8 +345,9 @@ def _compute_drop_idx(self):
             if self.drop == 'first':
                 return np.zeros(len(self.categories_), dtype=np.object)
             elif self.drop == 'if_binary':
-                return np.array([0 if len(cats) == 2 else None
-                                 for cats in self.categories_], dtype=np.object)
+                return np.array(
+                    [0 if len(cats) == 2 else None
+                     for cats in self.categories_], dtype=np.object)
             else:
                 msg = (
                     "Wrong input for parameter `drop`. Expected "
@@ -441,7 +447,8 @@ def transform(self, X):
         check_is_fitted(self)
         # validation of X happens in _check_X called by _transform
         X_int, X_mask = self._transform(
-            X, handle_unknown=self.handle_unknown, handle_missing=self.handle_missing)
+            X, handle_unknown=self.handle_unknown,
+            handle_missing=self.handle_missing)
 
         n_samples, n_features = X_int.shape
 
@@ -486,7 +493,6 @@ def transform(self, X):
         else:
             return out
 
-
     def inverse_transform(self, X):
         """
         Convert the data back to the original representation.
@@ -549,7 +555,8 @@ def inverse_transform(self, X):
             # for sparse X argmax returns 2D matrix, ensure 1D array
             labels = np.asarray(sub.argmax(axis=1)).flatten()
             X_tr[:, i] = cats[labels]
-            if self.handle_unknown == 'ignore' or self.handle_missing == 'all-zero':
+            if (self.handle_unknown == 'ignore' or
+                    self.handle_missing == 'all-zero'):
                 unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                 # ignored unknown categories: we have a row of all zero
                 if unknown.any():
@@ -674,7 +681,8 @@ class OrdinalEncoder(_BaseEncoder):
            ['Female', 2]], dtype=object)
     """
 
-    def __init__(self, categories='auto', dtype=np.float64, handle_missing=None):
+    def __init__(self, categories='auto', dtype=np.float64,
+                 handle_missing=None):
         self.categories = categories
         self.dtype = dtype
         self.handle_missing = handle_missing
 
@@ -67,10 +67,11 @@ def _encode_python(values, uniques=None, encode=False, check_unknown=True):
                 encoded = np.array([table[v] for v in values])
             except KeyError as e:
                 raise ValueError("y contains previously unseen labels: %s"
-                                % str(e))
+                                 % str(e))
         else:
-            encoded = np.array([table[v] if v in table else n_uniques for v in values ])
-        
+            encoded = np.array(
+                [table[v] if v in table else n_uniques for v in values])
+
         return uniques, encoded
     else:
         return uniques
@@ -114,7 +115,8 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
     """
     if values.dtype == object:
         try:
-            res = _encode_python(values, uniques, encode, check_unknown=check_unknown)
+            res = _encode_python(values, uniques, encode,
+                                 check_unknown=check_unknown)
         except TypeError:
             types = sorted(t.__qualname__
                            for t in set(type(v) for v in values))
 
@@ -462,8 +462,8 @@ def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
 
     ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)
 
-    # with pytest.raises(ValueError, match="Input contains NaN"):
-    #     ohe.fit(X)
+    with pytest.raises(ValueError, match="Input contains NaN"):
+        ohe.fit(X)
 
     with pytest.raises(ValueError, match="Input contains NaN"):
         ohe.fit_transform(X)
@@ -480,7 +480,8 @@ def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
 
 
 @pytest.mark.parametrize("X", [np.array([[1, 2, np.nan, 2]]).T,
-                               np.array([['a', 'b', np.nan, 'b']], dtype=object).T],
+                               np.array([['a', 'b', np.nan, 'b']],
+                                        dtype=object).T],
                          ids=['numeric', 'object'])
 @pytest.mark.parametrize("as_data_frame", [False, True],
                          ids=['array', 'dataframe'])
@@ -490,14 +491,17 @@ def test_one_hot_encoder_handle_missing(X, as_data_frame, handle_unknown):
         pd = pytest.importorskip('pandas')
         X = pd.DataFrame(X)
 
-    # enc_ind = OneHotEncoder(
-    #     categories='auto', sparse=False, handle_missing='indicator')
-    # exp_ind = np.array([[1,   0,  0],
-    #                     [0,   1,  0],
-    #                     [0,   0,  1],
-    #                     [0,   1,  0]], dtype='int64')
-    # print(enc_ind.fit_transform(X))
-    # assert_array_equal(enc_ind.fit_transform(X), exp_ind.astype('float64'))
+    X_inv = np.array(X, dtype=object)
+    X_inv[2, 0] = None
+
+    enc_ind = OneHotEncoder(
+        categories='auto', sparse=False, handle_missing='indicator')
+    exp_ind = np.array([[1,   0,  0],
+                        [0,   1,  0],
+                        [0,   0,  1],
+                        [0,   1,  0]], dtype='int64')
+    assert_array_equal(enc_ind.fit_transform(X), exp_ind.astype('float64'))
+    assert_array_equal(enc_ind.inverse_transform(exp_ind), X_inv)
 
     enc_zero = OneHotEncoder(
         categories='auto', sparse=False, handle_missing='all-zero')
@@ -506,9 +510,6 @@ def test_one_hot_encoder_handle_missing(X, as_data_frame, handle_unknown):
                          [0,   0],
                          [0,   1]], dtype='int64')
     assert_array_equal(enc_zero.fit_transform(X), exp_zero.astype('float64'))
-
-    X_inv = np.array(X, dtype=object)
-    X_inv[2, 0] = None
     assert_array_equal(enc_zero.inverse_transform(exp_zero), X_inv)
 
 
@@ -574,8 +575,8 @@ def test_ordinal_encoder_inverse():
 def test_ordinal_encoder_raise_missing(X):
     ohe = OrdinalEncoder()
 
-    # with pytest.raises(ValueError, match="Input contains NaN"):
-    #     ohe.fit(X)
+    with pytest.raises(ValueError, match="Input contains NaN"):
+        ohe.fit(X)
 
     with pytest.raises(ValueError, match="Input contains NaN"):
         ohe.fit_transform(X)
@@ -670,15 +671,15 @@ def test_one_hot_encoder_drop_manual():
 @pytest.mark.parametrize(
     "X_fit, params, err_msg",
     [([["Male"], ["Female"]], {'drop': 'second'},
-     "Wrong input for parameter `drop`"),
+      "Wrong input for parameter `drop`"),
      ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
-     "`handle_unknown` must be 'error'"),
+      "`handle_unknown` must be 'error'"),
      ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
       {'drop': np.asarray('b', dtype=object)},
-     "Wrong input for parameter `drop`"),
+      "Wrong input for parameter `drop`"),
      ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
       {'drop': ['ghi', 3, 59]},
-     "The following categories were supposed")]
+      "The following categories were supposed")]
 )
 def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
     enc = OneHotEncoder(**params)
@@ -728,4 +729,4 @@ def test_encoders_does_not_support_none_values(Encoder):
     values = [["a"], [None]]
     with pytest.raises(TypeError, match="Encoders require their input to be "
                                         "uniformly strings or numbers."):
-        Encoder().fit_transform(values)
+        Encoder().fit(values)