scikit-learn
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
Lines changed: 15 additions & 3 deletions
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
Lines changed: 30 additions & 0 deletions
@@ -2462,11 +2462,12 @@ class CategoricalEncoder(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    categories : 'auto', 2D array of ints or strings or both.
+    categories : 'auto' or a list of lists/arrays of values.
         Values per feature.
 
         - 'auto' : Determine classes automatically from the training data.
-        - array: ``classes[i]`` holds the classes expected in the ith column.
+        - list : ``categories[i]`` holds the categories expected in the ith
+          column.
 
     dtype : number type, default=np.float
         Desired dtype of output.
@@ -2544,7 +2545,18 @@ def _fit(self, X):
             if self.categories == 'auto':
                 le.fit(X[:, i])
             else:
-                le.classes_ = np.array(self.categories[i])
+                if not np.all(np.in1d(X[:, i], self.categories[i])):
+                    if self.handle_unknown == 'error':
+                        diff = np.setdiff1d(X[:, i], self.categories[i])
+                        msg = 'Unknown feature(s) %s in column %d' % (diff, i)
+                        raise ValueError(msg)
+                le.classes_ = np.array(np.sort(self.categories[i]))
+
+    @staticmethod
+    def _check_unknown_categories(values, categories):
+        """Return True if every entry of ``values`` is in ``categories``."""
+        valid_mask = np.in1d(values, categories)
+        return np.all(valid_mask)
 
     def transform(self, X, y=None):
         """Encode the selected categorical features using the one-hot scheme.
 
@@ -2003,6 +2003,36 @@ def test_categorical_encoder_errors():
     assert_allclose(Xtr.toarray(), [[0, 0, 1, 0, 1, 0], [0, 1, 0, 1, 0, 1]])
 
 
+def test_categorical_encoder_specified_categories():
+    X = np.array([['a', 'b']], dtype=object).T
+
+    enc = CategoricalEncoder(categories=[['a', 'b', 'c']])
+    exp = np.array([[1., 0., 0.],
+                    [0., 1., 0.]])
+    assert_array_equal(enc.fit_transform(X).toarray(), exp)
+
+    # the passed order is not kept: expected output is the same as above
+    enc = CategoricalEncoder(categories=[['c', 'b', 'a']])
+    exp = np.array([[1., 0., 0.],
+                    [0., 1., 0.]])
+    assert_array_equal(enc.fit_transform(X).toarray(), exp)
+
+    # multiple columns: one list of categories per column of X
+    X = np.array([['a', 'b'], ['A', 'C']], dtype=object).T
+    enc = CategoricalEncoder(categories=[['a', 'b', 'c'], ['A', 'B', 'C']])
+    exp = np.array([[1., 0., 0., 1., 0., 0.],
+                    [0., 1., 0., 0., 0., 1.]])
+    assert_array_equal(enc.fit_transform(X).toarray(), exp)
+
+    # with manually specified categories, an unknown value ('c') must raise
+    # already at fit time, unless handle_unknown='ignore' is requested
+    X = np.array([['a', 'b', 'c']]).T
+    enc = CategoricalEncoder(categories=[['a', 'b']])
+    assert_raises(ValueError, enc.fit, X)
+    enc = CategoricalEncoder(categories=[['a', 'b']], handle_unknown='ignore')
+    enc.fit(X)
+
+
 def test_fit_cold_start():
     X = iris.data
     X_2d = X[:, :2]