Merge pull request #3800 from MechCoder/unknown_transform

larsmans · larsmans · commit 9c1c811a4623 · 2014-10-26T16:00:42.000+01:00
Handle_unknown option to OneHotEncoder
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -343,7 +343,7 @@ Continuing the example above::
   >>> enc = preprocessing.OneHotEncoder()
   >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  # doctest: +ELLIPSIS
   OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
-         n_values='auto', sparse=True)
+         handle_unknown='error', n_values='auto', sparse=True)
   >>> enc.transform([[0, 1, 3]]).toarray()
   array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
 
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -87,6 +87,10 @@ Enhancements
    - Add a quantile prediction strategy to the :class:`dummy.DummyRegressor`.
      By `Aaron Staple`_.
 
+   - Add ``handle_unknown`` option to :class:`preprocessing.OneHotEncoder` to
+     handle unknown categorical features more gracefully during transform.
+     By `Manoj Kumar`_.
+
 Documentation improvements
 ..........................
 
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
@@ -925,6 +925,10 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.
 
+    handle_unknown : str, 'error' or 'ignore', default='error'
+        Whether to raise an error or ignore if an unknown categorical feature
+        is present during transform.
+
     Attributes
     ----------
     active_features_ : array
@@ -951,7 +955,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
 [1, 0, 2]])  # doctest: +ELLIPSIS
     OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
-           n_values='auto', sparse=True)
+           handle_unknown='error', n_values='auto', sparse=True)
     >>> enc.n_values_
     array([2, 3, 4])
     >>> enc.feature_indices_
@@ -967,11 +971,12 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
       encoding of dictionary items or strings.
     """
     def __init__(self, n_values="auto", categorical_features="all",
-                 dtype=np.float, sparse=True):
+                 dtype=np.float, sparse=True, handle_unknown='error'):
         self.n_values = n_values
         self.categorical_features = categorical_features
         self.dtype = dtype
         self.sparse = sparse
+        self.handle_unknown = handle_unknown
 
     def fit(self, X, y=None):
         """Fit OneHotEncoder to X.
@@ -1056,13 +1061,23 @@ def _transform(self, X):
                              " Expected %d, got %d."
                              % (indices.shape[0] - 1, n_features))
 
-        if (np.max(X, axis=0) >= self.n_values_).any():
-            raise ValueError("Feature out of bounds. Try setting n_values.")
-
-        column_indices = (X + indices[:-1]).ravel()
+        # We use only those categorical features of X that are known from fit,
+        # i.e. less than n_values_, selected using mask. This means that if
+        # self.handle_unknown is "ignore", the row_indices and column_indices
+        # corresponding to unknown categorical features are dropped.
+        mask = (X < self.n_values_).ravel()
+        if np.any(~mask):
+            if self.handle_unknown not in ['error', 'ignore']:
+                raise ValueError("handle_unknown should be either error or "
+                                 "unknown got %s" % self.handle_unknown)
+            if self.handle_unknown == 'error':
+                raise ValueError("unknown categorical feature present %s "
+                                 "during transform." % X[~mask])
+
+        column_indices = (X + indices[:-1]).ravel()[mask]
         row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
-                                n_features)
-        data = np.ones(n_samples * n_features)
+                                n_features)[mask]
+        data = np.ones(np.sum(mask))
         out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                 shape=(n_samples, indices[-1]),
                                 dtype=self.dtype).tocsr()
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
@@ -802,3 +802,27 @@ def test_one_hot_encoder_categorical_features():
     # Edge case: all categorical
     cat = [True, True, True]
     _check_one_hot(X, X2, cat, 5)
+
+
+def test_one_hot_encoder_unknown_transform():
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
+    y = np.array([[4, 1, 1]])
+
+    # Test that one hot encoder raises error for unknown features
+    # present during transform.
+    oh = OneHotEncoder(handle_unknown='error')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+    # Test that the ignore option ignores unknown features.
+    oh = OneHotEncoder(handle_unknown='ignore')
+    oh.fit(X)
+    assert_array_equal(
+        oh.transform(y).toarray(),
+        np.array([[ 0.,  0.,  0.,  0.,  1.,  0.,  0.]])
+        )
+
+    # Raise error if handle_unknown is neither ignore nor error.
+    oh = OneHotEncoder(handle_unknown='42')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)