scikit-learn · Olamyy · Sep 6, 2018 · Sep 6, 2018 · Sep 8, 2018 · Sep 8, 2018
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -540,6 +540,18 @@ columns for this feature will be all zeros
     array([[1., 0., 0., 0., 0., 0.]])
 
 
+Missing categorical features in the training data can be handled by specifying what happens to them with the ``handle_missing`` parameter. Its value must be one of:
+
+- ``all-missing``: a missing value is encoded as a row of NaNs.
+- ``all-zero``: a missing value is encoded as a row of zeros.
+- ``category``: a missing value is represented by a separate one-hot column.
+
+Note that which entries count as missing is controlled by the
+``missing_values`` parameter of ``OneHotEncoder``: it is the placeholder that
+marks a value as missing, and can be either ``NaN`` (the default) or a custom
+value of your choice.
+
+
 See :ref:`dict_feature_extraction` for categorical features that are represented
 as a dict, not as scalars.
 

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
@@ -20,10 +20,8 @@
 from .base import _transform_selected
 from .label import _encode, _encode_check_unknown
 
-
 range = six.moves.range
 
-
 __all__ = [
     'OneHotEncoder',
     'OrdinalEncoder'
@@ -218,6 +216,21 @@ class OneHotEncoder(_BaseEncoder):
             The ``n_values_`` attribute was deprecated in version
             0.20 and will be removed in 0.22.
 
+    handle_missing : 'all-missing' (default), 'all-zero' or 'category'
+        What should be done to missing values. Should be one of:
+
+        'all-missing':
+            Replace with a row of NaNs
+
+        'all-zero':
+            Replace with a row of zeros
+
+        'category':
+            Represent with a separate one-hot column
+
+    missing_values : "NaN" (default) or a custom value
+        Placeholder marking a value as missing; "NaN" matches NaN entries.
+
     Examples
     --------
     Given a dataset with two features, we let the encoder find the unique
@@ -260,13 +273,15 @@ class OneHotEncoder(_BaseEncoder):
 
     def __init__(self, n_values=None, categorical_features=None,
                  categories=None, sparse=True, dtype=np.float64,
-                 handle_unknown='error'):
+                 handle_unknown='error', missing_values="NaN", handle_missing="all-missing"):
         self.categories = categories
         self.sparse = sparse
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.n_values = n_values
         self.categorical_features = categorical_features
+        self.missing_values = missing_values
+        self.handle_missing = handle_missing
 
     # Deprecated attributes
 
@@ -567,12 +582,30 @@ def transform(self, X):
         X_out : sparse matrix if sparse=True else a 2-d array
             Transformed input.
         """
-        if self._legacy_mode:
-            return _transform_selected(X, self._legacy_transform, self.dtype,
-                                       self._categorical_features,
-                                       copy=True)
+
+        if not self.handle_missing or self.handle_missing not in ["all-missing",
+                                                                  "all-zero", "category"]:
+            raise ValueError("Wrong 'handle_missing' value specified. "
+                             "'handle_missing' should be one of either "
+                             "['all-missing', 'all-zero', 'category']. "
+                             "Getting {0}".format(self.handle_missing))
+        missing_indices = np.argwhere(np.isnan(X)) if self.missing_values == "NaN" else \
+            np.argwhere(X == self.missing_values)
+        if self.handle_missing == "all-missing":
+            for i in missing_indices:
+                X[i] = np.nan
+        if self.handle_missing == "all-zero":
+            for i in missing_indices:
+                X[i] = 0
         else:
-            return self._transform_new(X)
+            # TODO: replace with a separate one-hot column (not implemented)
+            pass
+
+        if self._legacy_mode:
+            return _transform_selected(X, self._legacy_transform,
+                                       self.dtype,
+                                       self._categorical_features, copy=True)
+        return self._transform_new(X)
 
     def inverse_transform(self, X):
         """Convert the back data to the original representation.
@@ -659,7 +692,7 @@ def get_feature_names(self, input_features=None):
         cats = self.categories_
         if input_features is None:
             input_features = ['x%d' % i for i in range(len(cats))]
-        elif(len(input_features) != len(self.categories_)):
+        elif (len(input_features) != len(self.categories_)):
             raise ValueError(
                 "input_features should have length equal to number of "
                 "features ({}), got {}".format(len(self.categories_),

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
@@ -576,3 +576,35 @@ def test_one_hot_encoder_warning():
 def test_categorical_encoder_stub():
     from sklearn.preprocessing import CategoricalEncoder
     assert_raises(RuntimeError, CategoricalEncoder, encoding='ordinal')
+
+
+def test_one_hot_encoder_invalid_handle_missing():
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
+    y = np.array([[4, 1, 1]])
+    # Test that one hot encoder raises an error at transform time when
+    # handle_missing is not one of the supported options.
+    oh = OneHotEncoder(handle_unknown='error', handle_missing='abcde')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+
+def test_one_hot_encoder_missing_values_none_handle_missing_passed():
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
+    y = np.array([[4, 1, 1]])
+    # Test that an invalid handle_missing value is still rejected at
+    # transform time when missing_values is None.
+    oh = OneHotEncoder(handle_unknown='error', missing_values=None,handle_missing='abcde')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+
+def test_one_hot_encoder_handle_missing_all_zeros():
+    pass
+
+
+def test_one_hot_encoder_handle_missing_all_missing():
+    pass
+
+
+def test_one_hot_encoder_handle_missing_category():
+    pass