scikit-learn · ogrisel · Jul 3, 2018 · Nov 27, 2017 · Jun 6, 2018 · Jun 6, 2018
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
@@ -16,7 +16,7 @@
 from ..utils import deprecated
 from ..utils.fixes import _argmax
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
-from .label import LabelEncoder
+from .label import _encode, _encode_check_unknown
 
 
 range = six.moves.range
@@ -104,32 +104,30 @@ def _fit(self, X, handle_unknown='error'):
         n_samples, n_features = X.shape
 
         if self._categories != 'auto':
-            for cats in self._categories:
-                if not np.all(np.sort(cats) == np.array(cats)):
-                    raise ValueError("Unsorted categories are not yet "
-                                     "supported")
+            if X.dtype != object:
+                for cats in self._categories:
+                    if not np.all(np.sort(cats) == np.array(cats)):
+                        raise ValueError("Unsorted categories are not "
+                                         "supported for numerical categories")
             if len(self._categories) != n_features:
                 raise ValueError("Shape mismatch: if n_values is an array,"
                                  " it has to be of shape (n_features,).")
 
-        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
+        self.categories_ = []
 
         for i in range(n_features):
-            le = self._label_encoders_[i]
             Xi = X[:, i]
             if self._categories == 'auto':
-                le.fit(Xi)
+                cats = _encode(Xi)
             else:
-                if handle_unknown == 'error':
-                    valid_mask = np.in1d(Xi, self._categories[i])
-                    if not np.all(valid_mask):
-                        diff = np.unique(Xi[~valid_mask])
+                cats = np.array(self._categories[i], dtype=X.dtype)
+                if handle_unknown == 'error':  # honour the argument
+                    diff = _encode_check_unknown(Xi, cats)
+                    if diff:
                         msg = ("Found unknown categories {0} in column {1}"
                                " during fit".format(diff, i))
                         raise ValueError(msg)
-                le.classes_ = np.array(self._categories[i], dtype=X.dtype)
-
-        self.categories_ = [le.classes_ for le in self._label_encoders_]
+            self.categories_.append(cats)
 
     def _transform(self, X, handle_unknown='error'):
 
@@ -145,11 +143,11 @@ def _transform(self, X, handle_unknown='error'):
 
         for i in range(n_features):
             Xi = X[:, i]
-            valid_mask = np.in1d(Xi, self.categories_[i])
+            diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
+                                                     return_mask=True)
 
             if not np.all(valid_mask):
                 if handle_unknown == 'error':
-                    diff = np.unique(X[~valid_mask, i])
                     msg = ("Found unknown categories {0} in column {1}"
                            " during transform".format(diff, i))
                     raise ValueError(msg)
@@ -160,7 +158,8 @@ def _transform(self, X, handle_unknown='error'):
                     X_mask[:, i] = valid_mask
                     Xi = Xi.copy()
                     Xi[~valid_mask] = self.categories_[i][0]
-            X_int[:, i] = self._label_encoders_[i].transform(Xi)
+            _, encoded = _encode(Xi, self.categories_[i], encode=True)
+            X_int[:, i] = encoded
 
         return X_int, X_mask
 
@@ -195,8 +194,9 @@ class OneHotEncoder(_BaseEncoder):
 
         - 'auto' : Determine categories automatically from the training data.
         - list : ``categories[i]`` holds the categories expected in the ith
-          column. The passed categories must be sorted and should not mix
-          strings and numeric values.
+          column. The passed categories should not mix strings and numeric
+          values within a single feature, and should be sorted in case of
+          numeric values.
 
         The used categories can be found in the ``categories_`` attribute.
 
@@ -713,8 +713,8 @@ class OrdinalEncoder(_BaseEncoder):
 
         - 'auto' : Determine categories automatically from the training data.
         - list : ``categories[i]`` holds the categories expected in the ith
-          column. The passed categories must be sorted and should not mix
-          strings and numeric values.
+          column. The passed categories should not mix strings and numeric
+          values within a single feature, and should be sorted if numeric.
 
         The used categories can be found in the ``categories_`` attribute.
 

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
@@ -37,6 +37,129 @@
 ]
 
 
+def _encode_numpy(values, uniques=None, encode=False):
+    # numpy backend of _encode (see its docstring); uniques must be sorted
+    if uniques is None:
+        if encode:
+            uniques, encoded = np.unique(values, return_inverse=True)
+            return uniques, encoded
+        else:
+            # np.unique returns the unique values in sorted order
+            return np.unique(values)
+    if encode:
+        diff = _encode_check_unknown(values, uniques)
+        if diff:
+            raise ValueError("y contains previously unseen labels: %s"
+                             % str(diff))
+        encoded = np.searchsorted(uniques, values)
+        return uniques, encoded
+    else:
+        return uniques
+
+
+def _encode_python(values, uniques=None, encode=False):
+    # pure python backend of _encode (object dtype), see docstring there
+    if uniques is None:
+        uniques = sorted(set(values))
+        uniques = np.array(uniques, dtype=values.dtype)
+    if encode:
+        table = {val: i for i, val in enumerate(uniques)}  # value -> code
+        try:
+            encoded = np.array([table[v] for v in values])
+        except KeyError as e:
+            raise ValueError("y contains previously unseen labels: %s"
+                             % str(e))
+        return uniques, encoded
+    else:
+        return uniques
+
+
+def _encode(values, uniques=None, encode=False):
+    """Helper function to factorize (find uniques) and encode values.
+
+    Uses a pure python method for object dtype, and a numpy method for
+    all other dtypes.
+    The numpy method requires `uniques` to be sorted. Importantly, this
+    is not checked but assumed to already be the case: the caller needs
+    to ensure it for all non-object values. The pure python method has
+    no such requirement.
+
+    Parameters
+    ----------
+    values : array
+        Values to factorize or encode.
+    uniques : array, optional
+        If passed, uniques are not determined from passed values (this
+        can be because the user specified categories, or because they
+        already have been determined in fit).
+    encode : bool, default False
+        If True, also encode the values into integer codes based on `uniques`.
+
+    Returns
+    -------
+    uniques
+        If ``encode=False``. The unique values are sorted if the `uniques`
+        parameter was None (and thus inferred from the data).
+    (uniques, encoded)
+        If ``encode=True``. Raises ValueError for values not in `uniques`.
+
+    """
+    if values.dtype == object:
+        return _encode_python(values, uniques, encode)
+    else:
+        return _encode_numpy(values, uniques, encode)
+
+
+def _encode_check_unknown(values, uniques, return_mask=False):
+    """
+    Helper function to check for unknowns in values to be encoded.
+
+    Uses pure python method for object dtype, and numpy method for
+    all other dtypes.
+
+    Parameters
+    ----------
+    values : array
+        Values to check for unknowns.
+    uniques : array
+        Allowed unique values.
+    return_mask : bool, default False
+        If True, return a mask of the same shape as `values` indicating
+        the valid values.
+
+    Returns
+    -------
+    diff : list
+        The unique values present in `values` and not in `uniques` (the
+        unknown values).
+    valid_mask : boolean array
+        Additionally returned if ``return_mask=True``.
+
+    """
+    if values.dtype == object:
+        uniques_set = set(uniques)
+        diff = list(set(values) - uniques_set)
+        if return_mask:
+            if diff:
+                valid_mask = np.array([val in uniques_set for val in values])
+            else:
+                valid_mask = np.ones(len(values), dtype=bool)  # all valid
+            return diff, valid_mask
+        else:
+            return diff
+    else:
+        unique_values = np.unique(values)
+        diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
+        if return_mask:
+            if diff:
+                valid_mask = np.in1d(values, uniques)
+            else:
+                valid_mask = np.ones(len(values), dtype=bool)  # all valid
+            return diff, valid_mask
+        else:
+            return diff
+
+
 class LabelEncoder(BaseEstimator, TransformerMixin):
     """Encode labels with value between 0 and n_classes-1.
 
@@ -94,7 +217,7 @@ def fit(self, y):
         self : returns an instance of self.
         """
         y = column_or_1d(y, warn=True)
-        self.classes_ = np.unique(y)
+        self.classes_ = _encode(y)
         return self
 
     def fit_transform(self, y):
@@ -110,7 +233,7 @@ def fit_transform(self, y):
         y : array-like of shape [n_samples]
         """
         y = column_or_1d(y, warn=True)
-        self.classes_, y = np.unique(y, return_inverse=True)
+        self.classes_, y = _encode(y, encode=True)
         return y
 
     def transform(self, y):
@@ -131,12 +254,8 @@ def transform(self, y):
         if _num_samples(y) == 0:
             return np.array([])
 
-        classes = np.unique(y)
-        if len(np.intersect1d(classes, self.classes_)) < len(classes):
-            diff = np.setdiff1d(classes, self.classes_)
-            raise ValueError(
-                    "y contains previously unseen labels: %s" % str(diff))
-        return np.searchsorted(self.classes_, y)
+        _, y = _encode(y, uniques=self.classes_, encode=True)
+        return y
 
     def inverse_transform(self, y):
         """Transform labels back to original encoding.

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
@@ -339,10 +339,10 @@ def test_one_hot_encoder_set_params():
 
 
 def check_categorical_onehot(X):
-    enc = OneHotEncoder()
+    enc = OneHotEncoder(categories='auto')
     Xtr1 = enc.fit_transform(X)
 
-    enc = OneHotEncoder(sparse=False)
+    enc = OneHotEncoder(categories='auto', sparse=False)
     Xtr2 = enc.fit_transform(X)
 
     assert_allclose(Xtr1.toarray(), Xtr2)
@@ -351,17 +351,20 @@ def check_categorical_onehot(X):
     return Xtr1.toarray()
 
 
-def test_one_hot_encoder():
-    X = [['abc', 1, 55], ['def', 2, 55]]
-
+@pytest.mark.parametrize("X", [
+    [['def', 1, 55], ['abc', 2, 55]],
+    np.array([[10, 1, 55], [5, 2, 55]]),
+    np.array([['b', 'A', 'cat'], ['a', 'B', 'cat']], dtype=object)
+    ], ids=['mixed', 'numeric', 'object'])
+def test_one_hot_encoder(X):
     Xtr = check_categorical_onehot(np.array(X)[:, [0]])
-    assert_allclose(Xtr, [[1, 0], [0, 1]])
+    assert_allclose(Xtr, [[0, 1], [1, 0]])
 
     Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
-    assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
+    assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])
 
-    Xtr = OneHotEncoder().fit_transform(X)
-    assert_allclose(Xtr.toarray(), [[1, 0, 1, 0,  1], [0, 1, 0, 1, 1]])
+    Xtr = OneHotEncoder(categories='auto').fit_transform(X)
+    assert_allclose(Xtr.toarray(), [[0, 1, 1, 0,  1], [1, 0, 0, 1, 1]])
 
 
 def test_one_hot_encoder_inverse():
@@ -449,7 +452,8 @@ def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
     # when specifying categories manually, unknown categories should already
     # raise when fitting
     enc = OneHotEncoder(categories=cats)
-    assert_raises(ValueError, enc.fit, X2)
+    with pytest.raises(ValueError, match="Found unknown categories"):
+        enc.fit(X2)
     enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
     exp = np.array([[1., 0., 0.], [0., 0., 0.]])
     assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)
@@ -458,10 +462,20 @@ def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
 def test_one_hot_encoder_unsorted_categories():
     X = np.array([['a', 'b']], dtype=object).T
 
-    # unsorted passed categories raises for now
-    enc = OneHotEncoder(categories=[['c', 'b', 'a']])
-    msg = re.escape('Unsorted categories are not yet supported')
-    assert_raises_regex(ValueError, msg, enc.fit_transform, X)
+    enc = OneHotEncoder(categories=[['b', 'a', 'c']])
+    exp = np.array([[0., 1., 0.],
+                    [1., 0., 0.]])
+    assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
+    assert_array_equal(enc.fit_transform(X).toarray(), exp)
+    assert enc.categories_[0].tolist() == ['b', 'a', 'c']
+    assert np.issubdtype(enc.categories_[0].dtype, np.object_)
+
+    # unsorted passed categories still raise for numerical values
+    X = np.array([[1, 2]]).T
+    enc = OneHotEncoder(categories=[[2, 1, 3]])
+    msg = 'Unsorted categories are not supported'
+    with pytest.raises(ValueError, match=msg):
+        enc.fit_transform(X)
 
 
 def test_one_hot_encoder_specified_categories_mixed_columns():
@@ -487,9 +501,12 @@ def test_one_hot_encoder_pandas():
     assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
 
 
-def test_ordinal_encoder():
-    X = [['abc', 2, 55], ['def', 1, 55]]
-
+@pytest.mark.parametrize("X", [
+    [['abc', 2, 55], ['def', 1, 55]],
+    np.array([[10, 2, 55], [20, 1, 55]]),
+    np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object)
+    ], ids=['mixed', 'numeric', 'object'])
+def test_ordinal_encoder(X):
     enc = OrdinalEncoder()
     exp = np.array([[0, 1, 0],
                     [1, 0, 0]], dtype='int64')