thomasjpfan
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
Lines changed: 29 additions & 1 deletion
diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
Lines changed: 3 additions & 0 deletions
diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py
Lines changed: 1 addition & 3 deletions
diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py
Lines changed: 3 additions & 7 deletions
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
Lines changed: 47 additions & 19 deletions
@@ -590,6 +590,34 @@ In the transformed `X`, the first column is the encoding of the feature with
 categories "male"/"female", while the remaining 6 columns is the encoding of
 the 2 features with respectively 3 categories each.
 
+:class:`OneHotEncoder` supports categorical features with missing values by
+considering the missing values as an additional category::
+
+    >>> X = [['male', 'Safari'],
+    ...      ['female', None],
+    ...      [np.nan, 'Firefox']]
+    >>> enc = preprocessing.OneHotEncoder(handle_unknown='error').fit(X)
+    >>> enc.categories_
+    [array(['female', 'male', nan], dtype=object),
+     array(['Firefox', 'Safari', None], dtype=object)]
+    >>> enc.transform(X).toarray()
+    array([[0., 1., 0., 0., 1., 0.],
+           [1., 0., 0., 0., 0., 1.],
+           [0., 0., 1., 1., 0., 0.]])
+
+If a feature contains both `np.nan` and `None`, they will be considered
+separate categories::
+
+    >>> X = [['Safari'], [None], [np.nan], ['Firefox']]
+    >>> enc = preprocessing.OneHotEncoder(handle_unknown='error').fit(X)
+    >>> enc.categories_
+    [array(['Firefox', 'Safari', None, nan], dtype=object)]
+    >>> enc.transform(X).toarray()
+    array([[0., 1., 0., 0.],
+           [0., 0., 1., 0.],
+           [0., 0., 0., 1.],
+           [1., 0., 0., 0.]])
+
 See :ref:`dict_feature_extraction` for categorical features that are
 represented as a dict, not as scalars.
 
@@ -791,5 +819,5 @@ error with a ``filterwarnings``::
   ...                         category=UserWarning, append=False)
 
 For a full code example that demonstrates using a :class:`FunctionTransformer`
-to extract features from text data see 
+to extract features from text data see
 :ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py`
@@ -552,6 +552,9 @@ Changelog
 :mod:`sklearn.preprocessing`
 ............................
 
+- |Feature| :class:`preprocessing.OneHotEncoder` now supports missing
+  values by treating them as a category. :pr:`17317` by `Thomas Fan`_.
+
 - |Feature| Add a new ``handle_unknown`` parameter with a
   ``use_encoded_value`` option, along with a new ``unknown_value`` parameter,
   to :class:`preprocessing.OrdinalEncoder` to allow unknown categories during
 
@@ -72,9 +72,7 @@
     ('scaler', StandardScaler())])
 
 categorical_features = ['embarked', 'sex', 'pclass']
-categorical_transformer = Pipeline(steps=[
-    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
-    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
+categorical_transformer = OneHotEncoder(handle_unknown='ignore')
 
 preprocessor = ColumnTransformer(
     transformers=[
 
@@ -63,16 +63,13 @@
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, stratify=y, random_state=42)
 
-categorical_pipe = Pipeline([
-    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
-    ('onehot', OneHotEncoder(handle_unknown='ignore'))
-])
+categorical_encoder = OneHotEncoder(handle_unknown='ignore')
 numerical_pipe = Pipeline([
     ('imputer', SimpleImputer(strategy='mean'))
 ])
 
 preprocessing = ColumnTransformer(
-    [('cat', categorical_pipe, categorical_columns),
+    [('cat', categorical_encoder, categorical_columns),
      ('num', numerical_pipe, numerical_columns)])
 
 rf = Pipeline([
@@ -122,8 +119,7 @@
 #   predictions that generalize to the test set (when the model has enough
 #   capacity).
 ohe = (rf.named_steps['preprocess']
-         .named_transformers_['cat']
-         .named_steps['onehot'])
+         .named_transformers_['cat'])
 feature_names = ohe.get_feature_names(input_features=categorical_columns)
 feature_names = np.r_[feature_names, numerical_columns]
 
 
@@ -27,7 +27,7 @@ class _BaseEncoder(TransformerMixin, BaseEstimator):
 
     """
 
-    def _check_X(self, X):
+    def _check_X(self, X, force_all_finite=True):
         """
         Perform custom check_array:
         - convert list of strings to object dtype
@@ -41,17 +41,19 @@ def _check_X(self, X):
         """
         if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
             # if not a dataframe, do normal check_array validation
-            X_temp = check_array(X, dtype=None)
+            X_temp = check_array(X, dtype=None,
+                                 force_all_finite=force_all_finite)
             if (not hasattr(X, 'dtype')
                     and np.issubdtype(X_temp.dtype, np.str_)):
-                X = check_array(X, dtype=object)
+                X = check_array(X, dtype=object,
+                                force_all_finite=force_all_finite)
             else:
                 X = X_temp
             needs_validation = False
         else:
             # pandas dataframe, do validation later column by column, in order
             # to keep the dtype information to be used in the encoder.
-            needs_validation = True
+            needs_validation = force_all_finite
 
         n_samples, n_features = X.shape
         X_columns = []
@@ -71,8 +73,9 @@ def _get_feature(self, X, feature_idx):
         # numpy arrays, sparse arrays
         return X[:, feature_idx]
 
-    def _fit(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
+    def _fit(self, X, handle_unknown='error', force_all_finite=True):
+        X_list, n_samples, n_features = self._check_X(
+            X, force_all_finite=force_all_finite)
 
         if self.categories != 'auto':
             if len(self.categories) != n_features:
@@ -88,9 +91,16 @@ def _fit(self, X, handle_unknown='error'):
             else:
                 cats = np.array(self.categories[i], dtype=Xi.dtype)
                 if Xi.dtype != object:
-                    if not np.all(np.sort(cats) == cats):
-                        raise ValueError("Unsorted categories are not "
-                                         "supported for numerical categories")
+                    sorted_cats = np.sort(cats)
+                    error_msg = ("Unsorted categories are not "
+                                 "supported for numerical categories")
+                    # np.sort puts nan last; given categories must do the same
+                    stop_idx = -1 if is_scalar_nan(sorted_cats[-1]) else None
+                    if (np.any(sorted_cats[:stop_idx] != cats[:stop_idx]) or
+                        (is_scalar_nan(sorted_cats[-1]) and
+                         not is_scalar_nan(cats[-1]))):
+                        raise ValueError(error_msg)
+
                 if handle_unknown == 'error':
                     diff = _check_unknown(Xi, cats)
                     if diff:
@@ -99,8 +109,9 @@ def _fit(self, X, handle_unknown='error'):
                         raise ValueError(msg)
             self.categories_.append(cats)
 
-    def _transform(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
+    def _transform(self, X, handle_unknown='error', force_all_finite=True):
+        X_list, n_samples, n_features = self._check_X(
+            X, force_all_finite=force_all_finite)
 
         X_int = np.zeros((n_samples, n_features), dtype=int)
         X_mask = np.ones((n_samples, n_features), dtype=bool)
@@ -355,8 +366,26 @@ def _compute_drop_idx(self):
                        "of features ({}), got {}")
                 raise ValueError(msg.format(len(self.categories_),
                                             len(self.drop)))
-            missing_drops = [(i, val) for i, val in enumerate(self.drop)
-                             if val not in self.categories_[i]]
+            missing_drops = []
+            drop_indices = []
+            for col_idx, (val, cat_list) in enumerate(zip(self.drop,
+                                                          self.categories_)):
+                if not is_scalar_nan(val):
+                    drop_idx = np.where(cat_list == val)[0]
+                    if drop_idx.size:  # found drop idx
+                        drop_indices.append(drop_idx[0])
+                    else:
+                        missing_drops.append((col_idx, val))
+                    continue
+
+                # val is nan, find nan in categories manually
+                for cat_idx, cat in enumerate(cat_list):
+                    if is_scalar_nan(cat):
+                        drop_indices.append(cat_idx)
+                        break
+                else:  # loop did not break thus drop is missing
+                    missing_drops.append((col_idx, val))
+
             if any(missing_drops):
                 msg = ("The following categories were supposed to be "
                        "dropped, but were not found in the training "
@@ -365,10 +394,7 @@ def _compute_drop_idx(self):
                                 ["Category: {}, Feature: {}".format(c, v)
                                     for c, v in missing_drops])))
                 raise ValueError(msg)
-            return np.array([np.where(cat_list == val)[0][0]
-                             for (val, cat_list) in
-                             zip(self.drop, self.categories_)],
-                            dtype=object)
+            return np.array(drop_indices, dtype=object)
 
     def fit(self, X, y=None):
         """
@@ -388,7 +414,8 @@ def fit(self, X, y=None):
         self
         """
         self._validate_keywords()
-        self._fit(X, handle_unknown=self.handle_unknown)
+        self._fit(X, handle_unknown=self.handle_unknown,
+                  force_all_finite='allow-nan')
         self.drop_idx_ = self._compute_drop_idx()
         return self
 
@@ -431,7 +458,8 @@ def transform(self, X):
         """
         check_is_fitted(self)
         # validation of X happens in _check_X called by _transform
-        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
+        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown,
+                                        force_all_finite='allow-nan')
 
         n_samples, n_features = X_int.shape