diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index dd1f798ccb3aa..04f9228750ece 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -540,6 +540,24 @@ columns for this feature will be all zeros
   array([[1., 0., 0., 0., 0., 0.]])
 
+Missing categorical features in the training data can be handled by
+specifying what happens to them using the ``handle_missing`` parameter. Its
+value can be one of:
+
+``all-missing`` : replace each row that contains a missing value with a row
+of NaNs.
+
+``all-zero`` : replace each row that contains a missing value with a row of
+zeros.
+
+``category`` : treat missing values as a separate category, represented by
+an additional one-hot column.
+
+Note that for :class:`OneHotEncoder` to handle missing values, you have to
+pass a placeholder for what should be treated as missing. This is the
+``missing_values`` parameter, which accepts either ``None`` (the default, in
+which case no missing-value handling is performed) or the string ``'NaN'``.
+
 See :ref:`dict_feature_extraction` for categorical features that are
 represented as a dict, not as scalars.
 
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index bd6e10fb62810..43cf8a1a97513 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -108,20 +108,16 @@ def _transform(self, X, handle_unknown='error'):
 
         return X_int, X_mask
 
 
-class OneHotEncoder(_BaseEncoder):
-    """Encode categorical integer features as a one-hot numeric array.
+class OneHotEncoder(BaseEstimator, TransformerMixin):
+    """Encode categorical integer features using a one-hot aka one-of-K scheme.
 
-    The input to this transformer should be an array-like of integers or
-    strings, denoting the values taken on by categorical (discrete) features.
-    The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
-    encoding scheme. This creates a binary column for each category and
-    returns a sparse matrix or dense array.
-
-    By default, the encoder derives the categories based on the unique values
-    in each feature. Alternatively, you can also specify the `categories`
-    manually.
-    The OneHotEncoder previously assumed that the input features take on
-    values in the range [0, max(values)). This behaviour is deprecated.
+    The input to this transformer should be a matrix of integers, denoting
+    the values taken on by categorical (discrete) features. The output will be
+    a sparse matrix where each column corresponds to one possible value of one
+    feature. It is assumed that input features take on values in the range
+    [0, n_values). For an encoder based on the unique values of the input
+    features of any type, see the
+    :class:`~sklearn.preprocessing.CategoricalEncoder`.
 
     This encoding is needed for feeding categorical data to many scikit-learn
     estimators, notably linear models and SVMs with the standard kernels.
 
@@ -133,31 +129,6 @@ class OneHotEncoder(_BaseEncoder):
 
     Parameters
     ----------
-    categories : 'auto' or a list of lists/arrays of values.
-        Categories (unique values) per feature:
-
-        - 'auto' : Determine categories automatically from the training data.
-        - list : ``categories[i]`` holds the categories expected in the ith
-          column. The passed categories should not mix strings and numeric
-          values within a single feature, and should be sorted in case of
-          numeric values.
-
-        The used categories can be found in the ``categories_`` attribute.
-
-    sparse : boolean, default=True
-        Will return sparse matrix if set True else will return an array.
-
-    dtype : number type, default=np.float
-        Desired dtype of output.
-
-    handle_unknown : 'error' (default) or 'ignore'
-        Whether to raise an error or ignore if an unknown categorical feature
-        is present during transform (default is to raise). When this parameter
-        is set to 'ignore' and an unknown category is encountered during
-        transform, the resulting one-hot encoded columns for this feature
-        will be all zeros. In the inverse transform, an unknown category
-        will be denoted as None.
-
     n_values : 'auto', int or array of ints
         Number of values per feature.
 
         - 'auto' : determine value range from training data.
         - int : number of categorical values per feature.
                 Each feature value should be in ``range(n_values)``
         - array : ``n_values[i]`` is the number of categorical values in
                   ``X[:, i]``. Each feature value should be
                   in ``range(n_values[i])``
 
-        .. deprecated:: 0.20
-            The `n_values` keyword was deprecated in version 0.20 and will
-            be removed in 0.22. Use `categories` instead.
-
     categorical_features : "all" or array of indices or mask
         Specify what features are treated as categorical.
 
         - 'all' (default): All features are treated as categorical.
         - array of indices: Array of categorical feature indices.
         - mask: Array of length n_features and with dtype=bool.
 
         Non-categorical features are always stacked to the right of
         the matrix.
 
-        .. deprecated:: 0.20
-            The `categorical_features` keyword was deprecated in version
-            0.20 and will be removed in 0.22.
-            You can use the ``ColumnTransformer`` instead.
+    dtype : number type, default=np.float
+        Desired dtype of output.
+
+    sparse : boolean, default=True
+        Will return sparse matrix if set True else will return an array.
+
+    handle_unknown : str, 'error' or 'ignore'
+        Whether to raise an error or ignore if an unknown categorical feature
+        is present during transform.
+
+    missing_values : 'NaN' or None
+        The placeholder that should be treated as a missing value. ``None``
+        (the default) disables missing-value handling.
+
+    handle_missing : str, 'all-missing', 'all-zero' or 'category'
+        What should be done with missing values. Should be one of:
+
+        all-missing: Replace each row containing a missing value with a row
+        of NaNs.
+
+        all-zero: Replace each row containing a missing value with a row of
+        zeros.
+
+        category: Represent missing values with a separate one-hot column.
 
     Attributes
     ----------
-    categories_ : list of arrays
-        The categories of each feature determined during fitting
-        (in order of the features in X and corresponding with the output
-        of ``transform``).
-
     active_features_ : array
         Indices for active features, meaning values that actually occur
         in the training set. Only available when n_values is ``'auto'``.
 
-        .. deprecated:: 0.20
-            The ``active_features_`` attribute was deprecated in version
-            0.20 and will be removed in 0.22.
-
     feature_indices_ : array of shape (n_features,)
         Indices to feature ranges.
         Feature ``i`` in the original data is mapped to features
         from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
-        (and then potentially masked by ``active_features_`` afterwards)
-
-        .. deprecated:: 0.20
-            The ``feature_indices_`` attribute was deprecated in version
-            0.20 and will be removed in 0.22.
+        (and then potentially masked by `active_features_` afterwards)
 
     n_values_ : array of shape (n_features,)
         Maximum number of values per feature.
 
-        .. deprecated:: 0.20
-            The ``n_values_`` attribute was deprecated in version
-            0.20 and will be removed in 0.22.
-
     Examples
     --------
-    Given a dataset with two features, we let the encoder find the unique
-    values per feature and transform the data to a binary one-hot encoding.
+    Given a dataset with three features and four samples, we let the encoder
+    find the maximum value per feature and transform the data to a binary
+    one-hot encoding.
 
     >>> from sklearn.preprocessing import OneHotEncoder
-    >>> enc = OneHotEncoder(handle_unknown='ignore')
-    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
-    >>> enc.fit(X)
-    ... 
# doctest: +ELLIPSIS
-    OneHotEncoder(categorical_features=None, categories=None,
-           dtype=<... 'numpy.float64'>, handle_unknown='ignore',
-           n_values=None, sparse=True)
-
-    >>> enc.categories_
-    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
-    >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
-    array([[1., 0., 1., 0., 0.],
-           [0., 1., 0., 0., 0.]])
-    >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
-    array([['Male', 1],
-           [None, 2]], dtype=object)
-    >>> enc.get_feature_names()
-    array(['x0_Female', 'x0_Male', 'x1_1', 'x1_2', 'x1_3'], dtype=object)
+    >>> enc = OneHotEncoder()
+    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
+[1, 0, 2]])  # doctest: +ELLIPSIS
+    OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
+           handle_missing=None, handle_unknown='error', missing_values=None,
+           n_values='auto', sparse=True)
+    >>> enc.n_values_
+    array([2, 3, 4])
+    >>> enc.feature_indices_
+    array([0, 2, 5, 9])
+    >>> enc.transform([[0, 1, 1]]).toarray()
+    array([[1., 0., 0., 1., 0., 0., 1., 0., 0.]])
 
     See also
     --------
-    sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer)
-      encoding of the categorical features.
+    sklearn.preprocessing.CategoricalEncoder : performs a one-hot or ordinal
+      encoding of all features (also handles string-valued features). This
+      encoder derives the categories based on the unique values in each
+      feature.
     sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding
       of dictionary items (also handles string-valued features).
     sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
       encoding of dictionary items or strings.
     sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all
       fashion.
     sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of
       iterables and a multilabel format, e.g. a (samples x classes) binary
       matrix indicating the presence of a class label.
+    sklearn.preprocessing.LabelEncoder : encodes labels with values between 0
+      and n_classes-1.
""" - def __init__(self, n_values=None, categorical_features=None, - categories=None, sparse=True, dtype=np.float64, - handle_unknown='error'): - self.categories = categories - self.sparse = sparse - self.dtype = dtype - self.handle_unknown = handle_unknown + def __init__(self, n_values="auto", categorical_features="all", + dtype=np.float64, sparse=True, handle_unknown='error', missing_values=None, handle_missing=None): self.n_values = n_values self.categorical_features = categorical_features - - # Deprecated attributes - - @property - @deprecated("The ``active_features_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - def active_features_(self): - check_is_fitted(self, 'categories_') - return self._active_features_ - - @property - @deprecated("The ``feature_indices_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - def feature_indices_(self): - check_is_fitted(self, 'categories_') - return self._feature_indices_ - - @property - @deprecated("The ``n_values_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - def n_values_(self): - check_is_fitted(self, 'categories_') - return self._n_values_ - - def _handle_deprecations(self, X): - - # internal version of the attributes to handle deprecations - self._categories = getattr(self, '_categories', None) - self._categorical_features = getattr(self, '_categorical_features', - None) - - # user manually set the categories or second fit -> never legacy mode - if self.categories is not None or self._categories is not None: - self._legacy_mode = False - if self.categories is not None: - self._categories = self.categories - - # categories not set -> infer if we need legacy mode or not - elif self.n_values is not None and self.n_values != 'auto': - msg = ( - "Passing 'n_values' is deprecated in version 0.20 and will be " - "removed in 0.22. You can use the 'categories' keyword " - "instead. 'n_values=n' corresponds to 'categories=[range(n)]'." - ) - warnings.warn(msg, DeprecationWarning) - self._legacy_mode = True - - else: # n_values = 'auto' - if self.handle_unknown == 'ignore': - # no change in behaviour, no need to raise deprecation warning - self._legacy_mode = False - self._categories = 'auto' - if self.n_values == 'auto': - # user manually specified this - msg = ( - "Passing 'n_values' is deprecated in version 0.20 and " - "will be removed in 0.22. n_values='auto' can be " - "replaced with categories='auto'." - ) - warnings.warn(msg, DeprecationWarning) - else: - - # check if we have integer or categorical input - try: - X = check_array(X, dtype=np.int) - except ValueError: - self._legacy_mode = False - self._categories = 'auto' - else: - msg = ( - "The handling of integer data will change in version " - "0.22. Currently, the categories are determined " - "based on the range [0, max(values)], while in the " - "future they will be determined based on the unique " - "values.\nIf you want the future behaviour and " - "silence this warning, you can specify " - "\"categories='auto'\".\n" - "In case you used a LabelEncoder before this " - "OneHotEncoder to convert the categories to integers, " - "then you can now use the OneHotEncoder directly." 
- ) - warnings.warn(msg, FutureWarning) - self._legacy_mode = True - self.n_values = 'auto' - - # if user specified categorical_features -> always use legacy mode - if self.categorical_features is not None: - if (isinstance(self.categorical_features, six.string_types) - and self.categorical_features == 'all'): - warnings.warn( - "The 'categorical_features' keyword is deprecated in " - "version 0.20 and will be removed in 0.22. The passed " - "value of 'all' is the default and can simply be removed.", - DeprecationWarning) - else: - if self.categories is not None: - raise ValueError( - "The 'categorical_features' keyword is deprecated, " - "and cannot be used together with specifying " - "'categories'.") - warnings.warn( - "The 'categorical_features' keyword is deprecated in " - "version 0.20 and will be removed in 0.22. You can " - "use the ColumnTransformer instead.", DeprecationWarning) - self._legacy_mode = True - self._categorical_features = self.categorical_features - else: - self._categorical_features = 'all' + self.dtype = dtype + self.sparse = sparse + self.handle_unknown = handle_unknown + self.handle_missing = handle_missing + self.missing_values = missing_values def fit(self, X, y=None): """Fit OneHotEncoder to X. @@ -382,31 +239,17 @@ def fit(self, X, y=None): Parameters ---------- X : array-like, shape [n_samples, n_feature] - The data to determine the categories of each feature. + Input array of type int. Returns ------- self """ - if self.handle_unknown not in ('error', 'ignore'): - msg = ("handle_unknown should be either 'error' or 'ignore', " - "got {0}.".format(self.handle_unknown)) - raise ValueError(msg) - - self._handle_deprecations(X) - - if self._legacy_mode: - _transform_selected(X, self._legacy_fit_transform, self.dtype, - self._categorical_features, - copy=True) - return self - else: - self._fit(X, handle_unknown=self.handle_unknown) - return self + self.fit_transform(X) + return self - def _legacy_fit_transform(self, X): + def _fit_transform(self, X): """Assumes X contains only categorical features.""" - dtype = getattr(X, 'dtype', None) X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") @@ -431,12 +274,10 @@ def _legacy_fit_transform(self, X): raise ValueError("Shape mismatch: if n_values is an array," " it has to be of shape (n_features,).") - self._n_values_ = n_values - self.categories_ = [np.arange(n_val - 1, dtype=dtype) - for n_val in n_values] + self.n_values_ = n_values n_values = np.hstack([[0], n_values]) indices = np.cumsum(n_values) - self._feature_indices_ = indices + self.feature_indices_ = indices column_indices = (X + indices[:-1]).ravel() row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), @@ -451,11 +292,7 @@ def _legacy_fit_transform(self, X): mask = np.array(out.sum(axis=0)).ravel() != 0 active_features = np.where(mask)[0] out = out[:, active_features] - self._active_features_ = active_features - - self.categories_ = [ - np.unique(X[:, i]).astype(dtype) if dtype - else np.unique(X[:, i]) for i in range(n_features)] + self.active_features_ = active_features return out if self.sparse else out.toarray() @@ -470,28 +307,36 @@ def fit_transform(self, X, y=None): X : array-like, shape [n_samples, n_feature] Input array of type int. 
""" - if self.handle_unknown not in ('error', 'ignore'): - msg = ("handle_unknown should be either 'error' or 'ignore', " - "got {0}.".format(self.handle_unknown)) - raise ValueError(msg) - - self._handle_deprecations(X) - - if self._legacy_mode: - return _transform_selected( - X, self._legacy_fit_transform, self.dtype, - self._categorical_features, copy=True) - else: - return self.fit(X).transform(X) + if not self.missing_values: + return _transform_selected(X, self._fit_transform, + self.categorical_features, copy=True) + if self.missing_values and self.missing_values != "NaN": + raise ValueError("Wrong 'missing_missing' value specified. " + "'missing_values' should be one of either 'None' or 'NaN'") + if self.missing_values == "NaN": + if not self.handle_missing: + raise ValueError("'handle_missing' cannot be None when 'missing_values' is passed.") + if self.handle_missing not in ["all-missing", "all-zero", "category"]: + raise ValueError("Wrong 'handle_missing' value specified. " + "'handle_missing' should be one of either ['all-missing', 'all-zero', 'category']") + if self.handle_missing == "all-missing": + # Replace entire row with NaN + pass + if self.handle_missing == "all-zero": + # Replace with a row of zeros + pass + else: + # Replace with a seperate one-hot column + pass - def _legacy_transform(self, X): + def _transform(self, X): """Assumes X contains only categorical features.""" X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape - indices = self._feature_indices_ + indices = self.feature_indices_ if n_features != indices.shape[0] - 1: raise ValueError("X has different shape than during fitting." " Expected %d, got %d." @@ -502,7 +347,7 @@ def _legacy_transform(self, X): # This means, if self.handle_unknown is "ignore", the row_indices and # col_indices corresponding to the unknown categorical feature are # ignored. - mask = (X < self._n_values_).ravel() + mask = (X < self.n_values_).ravel() if np.any(~mask): if self.handle_unknown not in ['error', 'ignore']: raise ValueError("handle_unknown should be either error or " @@ -520,158 +365,25 @@ def _legacy_transform(self, X): dtype=self.dtype).tocsr() if (isinstance(self.n_values, six.string_types) and self.n_values == 'auto'): - out = out[:, self._active_features_] + out = out[:, self.active_features_] return out if self.sparse else out.toarray() - def _transform_new(self, X): - """New implementation assuming categorical input""" - X_temp = check_array(X, dtype=None) - if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_): - X = check_array(X, dtype=np.object) - else: - X = X_temp - - n_samples, n_features = X.shape - - X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) - - mask = X_mask.ravel() - n_values = [cats.shape[0] for cats in self.categories_] - n_values = np.array([0] + n_values) - feature_indices = np.cumsum(n_values) - - indices = (X_int + feature_indices[:-1]).ravel()[mask] - indptr = X_mask.sum(axis=1).cumsum() - indptr = np.insert(indptr, 0, 0) - data = np.ones(n_samples * n_features)[mask] - - out = sparse.csr_matrix((data, indices, indptr), - shape=(n_samples, feature_indices[-1]), - dtype=self.dtype) - if not self.sparse: - return out.toarray() - else: - return out - def transform(self, X): """Transform X using one-hot encoding. Parameters ---------- X : array-like, shape [n_samples, n_features] - The data to encode. + Input array of type int. 
Returns ------- - X_out : sparse matrix if sparse=True else a 2-d array + X_out : sparse matrix if sparse=True else a 2-d array, dtype=int Transformed input. """ - if self._legacy_mode: - return _transform_selected(X, self._legacy_transform, self.dtype, - self._categorical_features, - copy=True) - else: - return self._transform_new(X) - - def inverse_transform(self, X): - """Convert the back data to the original representation. - - In case unknown categories are encountered (all zero's in the - one-hot encoding), ``None`` is used to represent this category. - - Parameters - ---------- - X : array-like or sparse matrix, shape [n_samples, n_encoded_features] - The transformed data. - - Returns - ------- - X_tr : array-like, shape [n_samples, n_features] - Inverse transformed array. - - """ - # if self._legacy_mode: - # raise ValueError("only supported for categorical features") - - check_is_fitted(self, 'categories_') - X = check_array(X, accept_sparse='csr') - - n_samples, _ = X.shape - n_features = len(self.categories_) - n_transformed_features = sum([len(cats) for cats in self.categories_]) - - # validate shape of passed X - msg = ("Shape of the passed X data is not correct. Expected {0} " - "columns, got {1}.") - if X.shape[1] != n_transformed_features: - raise ValueError(msg.format(n_transformed_features, X.shape[1])) - - # create resulting array of appropriate dtype - dt = np.find_common_type([cat.dtype for cat in self.categories_], []) - X_tr = np.empty((n_samples, n_features), dtype=dt) - - j = 0 - found_unknown = {} - - for i in range(n_features): - n_categories = len(self.categories_[i]) - sub = X[:, j:j + n_categories] - - # for sparse X argmax returns 2D matrix, ensure 1D array - labels = np.asarray(_argmax(sub, axis=1)).flatten() - X_tr[:, i] = self.categories_[i][labels] - - if self.handle_unknown == 'ignore': - # ignored unknown categories: we have a row of all zero's - unknown = np.asarray(sub.sum(axis=1) == 0).flatten() - if unknown.any(): - found_unknown[i] = unknown - - j += n_categories - - # if ignored are found: potentially need to upcast result to - # insert None values - if found_unknown: - if X_tr.dtype != object: - X_tr = X_tr.astype(object) - - for idx, mask in found_unknown.items(): - X_tr[mask, idx] = None - - return X_tr - - def get_feature_names(self, input_features=None): - """Return feature names for output features. - - Parameters - ---------- - input_features : list of string, length n_features, optional - String names for input features if available. By default, - "x0", "x1", ... "xn_features" is used. 
-
-        Returns
-        -------
-        output_feature_names : array of string, length n_output_features
-
-        """
-        check_is_fitted(self, 'categories_')
-        cats = self.categories_
-        if input_features is None:
-            input_features = ['x%d' % i for i in range(len(cats))]
-        elif(len(input_features) != len(self.categories_)):
-            raise ValueError(
-                "input_features should have length equal to number of "
-                "features ({}), got {}".format(len(self.categories_),
-                                               len(input_features)))
-
-        feature_names = []
-        for i in range(len(cats)):
-            names = [
-                input_features[i] + '_' + six.text_type(t) for t in cats[i]]
-            feature_names.extend(names)
-
-        return np.array(feature_names, dtype=object)
+        return _transform_selected(X, self._transform,
+                                   self.categorical_features, copy=True)
 
 
 class OrdinalEncoder(_BaseEncoder):
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 0a33f9140f902..656cdb879f94e 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -2916,4 +2916,4 @@ def __init__(*args, **kwargs):
         raise RuntimeError(
             "CategoricalEncoder briefly existed in 0.20dev. Its functionality "
             "has been rolled into the OneHotEncoder and OrdinalEncoder. "
-            "This stub will be removed in version 0.21.")
+            "This stub will be removed in version 0.21.")
\ No newline at end of file
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index f4d0b5af9799f..8f603686033ce 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -7,7 +7,6 @@
 
 import warnings
 import re
-import itertools
 
 import numpy as np
 import numpy.linalg as la
@@ -33,15 +32,18 @@
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import assert_allclose
-from sklearn.utils.testing import assert_allclose_dense_sparse
 from sklearn.utils.testing import skip_if_32bit
+from sklearn.utils.testing import SkipTest
 
 from sklearn.utils.sparsefuncs import mean_variance_axis
+from sklearn.preprocessing.data import _transform_selected
 from sklearn.preprocessing.data import _handle_zeros_in_scale
 from sklearn.preprocessing.data import Binarizer
 from sklearn.preprocessing.data import KernelCenterer
 from sklearn.preprocessing.data import Normalizer
 from sklearn.preprocessing.data import normalize
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing.data import CategoricalEncoder
 from sklearn.preprocessing.data import StandardScaler
 from sklearn.preprocessing.data import scale
 from sklearn.preprocessing.data import MinMaxScaler
@@ -58,11 +60,9 @@
 from sklearn.preprocessing.data import power_transform
 from sklearn.exceptions import DataConversionWarning, NotFittedError
 
-from sklearn.base import clone
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import cross_val_predict
 from sklearn.svm import SVR
-from sklearn.utils import shuffle
 
 from sklearn import datasets
 
@@ -210,7 +210,7 @@ def test_standard_scaler_1d():
     assert_array_almost_equal(X_scaled_back, X)
 
     # Constant feature
-    X = np.ones((5, 1))
+    X = np.ones(5).reshape(5, 1)
     scaler = StandardScaler()
     X_scaled = scaler.fit(X).transform(X, copy=True)
     assert_almost_equal(scaler.mean_, 1.)
@@ -238,7 +238,7 @@ def test_standard_scaler_numerical_stability():
     # np.log(1e-5) is taken because of its floating point representation
     # was empirically found to cause numerical problems with np.mean & np.std.
- x = np.full(8, np.log(1e-5), dtype=np.float64) + x = np.zeros(8, dtype=np.float64) + np.log(1e-5, dtype=np.float64) if LooseVersion(np.__version__) >= LooseVersion('1.9'): # This does not raise a warning as the number of samples is too low # to trigger the problem in recent numpy @@ -250,17 +250,17 @@ def test_standard_scaler_numerical_stability(): assert_array_almost_equal(x_scaled, np.zeros(8)) # with 2 more samples, the std computation run into numerical issues: - x = np.full(10, np.log(1e-5), dtype=np.float64) + x = np.zeros(10, dtype=np.float64) + np.log(1e-5, dtype=np.float64) w = "standard deviation of the data is probably very close to 0" x_scaled = assert_warns_message(UserWarning, w, scale, x) assert_array_almost_equal(x_scaled, np.zeros(10)) - x = np.full(10, 1e-100, dtype=np.float64) + x = np.ones(10, dtype=np.float64) * 1e-100 x_small_scaled = assert_no_warnings(scale, x) assert_array_almost_equal(x_small_scaled, np.zeros(10)) # Large values can cause (often recoverable) numerical stability issues: - x_big = np.full(10, 1e100, dtype=np.float64) + x_big = np.ones(10, dtype=np.float64) * 1e100 w = "Dataset may contain too large values" x_big_scaled = assert_warns_message(UserWarning, w, scale, x_big) assert_array_almost_equal(x_big_scaled, np.zeros(10)) @@ -511,7 +511,7 @@ def test_standard_scaler_trasform_with_partial_fit(): assert_array_almost_equal(X_sofar, right_input) zero = np.zeros(X.shape[1]) - epsilon = np.finfo(float).eps + epsilon = np.nextafter(0, 1) assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal assert_array_less(zero, scaler_incr.scale_ + epsilon) # (i+1) because the Scaler has been already fitted @@ -622,7 +622,7 @@ def test_min_max_scaler_1d(): assert_array_almost_equal(X_scaled_back, X) # Constant feature - X = np.ones((5, 1)) + X = np.ones(5).reshape(5, 1) scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_greater_equal(X_scaled.min(), 0.) 
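A minimal sketch of how the `pass` placeholders in `fit_transform` above
could be filled in, assuming missing entries arrive as `np.nan` in a float
array. The helper name `_apply_handle_missing` and its signature are
hypothetical and not part of this patch:

    import numpy as np


    def _apply_handle_missing(X, handle_missing):
        # Return (X_filled, missing_rows): a NaN-free copy of X plus a
        # boolean mask of the rows that contained a missing value. The
        # caller one-hot encodes X_filled, then overwrites the rows flagged
        # by missing_rows with NaNs ('all-missing') or zeros ('all-zero').
        X = np.asarray(X, dtype=np.float64)
        missing_rows = np.isnan(X).any(axis=1)
        if handle_missing in ("all-missing", "all-zero"):
            # Impute with 0 so the integer-based encoder accepts the rows;
            # the encoded rows are overwritten afterwards.
            return np.where(np.isnan(X), 0, X), missing_rows
        elif handle_missing == "category":
            # Recode NaN as one extra integer category per feature, so the
            # encoder allocates an additional one-hot column for it.
            X_filled = X.copy()
            for j in range(X.shape[1]):
                col = X_filled[:, j]
                nan_mask = np.isnan(col)
                max_val = -1 if nan_mask.all() else np.nanmax(col)
                col[nan_mask] = max_val + 1
            return X_filled, missing_rows
        raise ValueError("Wrong 'handle_missing' value: %r" % handle_missing)

For example, `_apply_handle_missing([[0, np.nan], [1, 2]], "all-zero")`
returns the imputed array together with the mask `[True, False]`, and the
caller would set `out[mask] = 0` after encoding.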
@@ -701,85 +701,6 @@ def test_scaler_without_centering(): assert_array_almost_equal(X_csc_scaled_back.toarray(), X) -@pytest.mark.parametrize("with_mean", [True, False]) -@pytest.mark.parametrize("with_std", [True, False]) -@pytest.mark.parametrize("array_constructor", - [np.asarray, sparse.csc_matrix, sparse.csr_matrix]) -def test_scaler_n_samples_seen_with_nan(with_mean, with_std, - array_constructor): - X = np.array([[0, 1, 3], - [np.nan, 6, 10], - [5, 4, np.nan], - [8, 0, np.nan]], - dtype=np.float64) - X = array_constructor(X) - - if sparse.issparse(X) and with_mean: - pytest.skip("'with_mean=True' cannot be used with sparse matrix.") - - transformer = StandardScaler(with_mean=with_mean, with_std=with_std) - transformer.fit(X) - - assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2])) - - -def _check_identity_scalers_attributes(scaler_1, scaler_2): - assert scaler_1.mean_ is scaler_2.mean_ is None - assert scaler_1.var_ is scaler_2.var_ is None - assert scaler_1.scale_ is scaler_2.scale_ is None - assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_ - - -def test_scaler_return_identity(): - # test that the scaler return identity when with_mean and with_std are - # False - X_dense = np.array([[0, 1, 3], - [5, 6, 0], - [8, 0, 10]], - dtype=np.float64) - X_csr = sparse.csr_matrix(X_dense) - X_csc = X_csr.tocsc() - - transformer_dense = StandardScaler(with_mean=False, with_std=False) - X_trans_dense = transformer_dense.fit_transform(X_dense) - - transformer_csr = clone(transformer_dense) - X_trans_csr = transformer_csr.fit_transform(X_csr) - - transformer_csc = clone(transformer_dense) - X_trans_csc = transformer_csc.fit_transform(X_csc) - - assert_allclose_dense_sparse(X_trans_csr, X_csr) - assert_allclose_dense_sparse(X_trans_csc, X_csc) - assert_allclose(X_trans_dense, X_dense) - - for trans_1, trans_2 in itertools.combinations([transformer_dense, - transformer_csr, - transformer_csc], - 2): - _check_identity_scalers_attributes(trans_1, trans_2) - - transformer_dense.partial_fit(X_dense) - transformer_csr.partial_fit(X_csr) - transformer_csc.partial_fit(X_csc) - - for trans_1, trans_2 in itertools.combinations([transformer_dense, - transformer_csr, - transformer_csc], - 2): - _check_identity_scalers_attributes(trans_1, trans_2) - - transformer_dense.fit(X_dense) - transformer_csr.fit(X_csr) - transformer_csc.fit(X_csc) - - for trans_1, trans_2 in itertools.combinations([transformer_dense, - transformer_csr, - transformer_csc], - 2): - _check_identity_scalers_attributes(trans_1, trans_2) - - def test_scaler_int(): # test that scaler converts integer input to floating # for both sparse and dense matrices @@ -901,56 +822,15 @@ def test_scale_sparse_with_mean_raise_exception(): def test_scale_input_finiteness_validation(): # Check if non finite inputs raise ValueError - X = [[np.inf, 5, 6, 7, 8]] + X = [[np.nan, 5, 6, 7, 8]] assert_raises_regex(ValueError, - "Input contains infinity or a value too large", + "Input contains NaN, infinity or a value too large", scale, X) - -def test_robust_scaler_error_sparse(): - X_sparse = sparse.rand(1000, 10) - scaler = RobustScaler(with_centering=True) - err_msg = "Cannot center sparse matrices" - with pytest.raises(ValueError, match=err_msg): - scaler.fit(X_sparse) - - -@pytest.mark.parametrize("with_centering", [True, False]) -@pytest.mark.parametrize("with_scaling", [True, False]) -@pytest.mark.parametrize("X", [np.random.randn(10, 3), - sparse.rand(10, 3, density=0.5)]) -def test_robust_scaler_attributes(X, with_centering, 
with_scaling): - # check consistent type of attributes - if with_centering and sparse.issparse(X): - pytest.skip("RobustScaler cannot center sparse matrix") - - scaler = RobustScaler(with_centering=with_centering, - with_scaling=with_scaling) - scaler.fit(X) - - if with_centering: - assert isinstance(scaler.center_, np.ndarray) - else: - assert scaler.center_ is None - if with_scaling: - assert isinstance(scaler.scale_, np.ndarray) - else: - assert scaler.scale_ is None - - -def test_robust_scaler_col_zero_sparse(): - # check that the scaler is working when there is not data materialized in a - # column of a sparse matrix - X = np.random.randn(10, 5) - X[:, 0] = 0 - X = sparse.csr_matrix(X) - - scaler = RobustScaler(with_centering=False) - scaler.fit(X) - assert scaler.scale_[0] == pytest.approx(1) - - X_trans = scaler.transform(X) - assert_allclose(X[:, 0].toarray(), X_trans[:, 0].toarray()) + X = [[np.inf, 5, 6, 7, 8]] + assert_raises_regex(ValueError, + "Input contains NaN, infinity or a value too large", + scale, X) def test_robust_scaler_2d_arrays(): @@ -966,29 +846,6 @@ def test_robust_scaler_2d_arrays(): assert_array_almost_equal(X_scaled.std(axis=0)[0], 0) -@pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1]) -@pytest.mark.parametrize("strictly_signed", - ['positive', 'negative', 'zeros', None]) -def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed): - # Check the equivalence of the fitting with dense and sparse matrices - X_sparse = sparse.rand(1000, 5, density=density).tocsc() - if strictly_signed == 'positive': - X_sparse.data = np.abs(X_sparse.data) - elif strictly_signed == 'negative': - X_sparse.data = - np.abs(X_sparse.data) - elif strictly_signed == 'zeros': - X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64) - X_dense = X_sparse.toarray() - - scaler_sparse = RobustScaler(with_centering=False) - scaler_dense = RobustScaler(with_centering=False) - - scaler_sparse.fit(X_sparse) - scaler_dense.fit(X_dense) - - assert_allclose(scaler_sparse.scale_, scaler_dense.scale_) - - def test_robust_scaler_transform_one_row_csr(): # Check RobustScaler on transforming csr matrix with one row rng = np.random.RandomState(0) @@ -1578,7 +1435,7 @@ def test_maxabs_scaler_1d(): assert_array_almost_equal(X_scaled_back, X) # Constant feature - X = np.ones((5, 1)) + X = np.ones(5).reshape(5, 1) scaler = MaxAbsScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.) @@ -1979,6 +1836,450 @@ def test_add_dummy_feature_csr(): assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) +def test_one_hot_encoder_sparse(): + # Test OneHotEncoder's fit and transform. 
+ X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder() + # discover max values automatically + X_trans = enc.fit_transform(X).toarray() + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + + # check outcome + assert_array_equal(X_trans, + [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]]) + + # max value given as 3 + enc = OneHotEncoder(n_values=4) + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 4 * 3)) + assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) + + # max value given per feature + enc = OneHotEncoder(n_values=[3, 2, 2]) + X = [[1, 0, 1], [0, 1, 1]] + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 3 + 2 + 2)) + assert_array_equal(enc.n_values_, [3, 2, 2]) + # check that testing with larger feature works: + X = np.array([[2, 0, 1], [0, 1, 1]]) + enc.transform(X) + + # test that an error is raised when out of bounds: + X_too_large = [[0, 2, 1], [0, 1, 1]] + assert_raises(ValueError, enc.transform, X_too_large) + error_msg = r"unknown categorical feature present \[2\] during transform." + assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) + assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X) + + # test that error is raised when wrong number of features + assert_raises(ValueError, enc.transform, X[:, :-1]) + # test that error is raised when wrong number of features in fit + # with prespecified n_values + assert_raises(ValueError, enc.fit, X[:, :-1]) + # test exception on wrong init param + assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) + + enc = OneHotEncoder() + # test negative input to fit + assert_raises(ValueError, enc.fit, [[0], [-1]]) + + # test negative input to transform + enc.fit([[0], [1]]) + assert_raises(ValueError, enc.transform, [[0], [-1]]) + + +def test_one_hot_encoder_dense(): + # check for sparse=False + X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder(sparse=False) + # discover max values automatically + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + + # check outcome + assert_array_equal(X_trans, + np.array([[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]])) + + +def _check_transform_selected(X, X_expected, sel): + for M in (X, sparse.csr_matrix(X)): + Xtr = _transform_selected(M, Binarizer().transform, sel) + assert_array_equal(toarray(Xtr), X_expected) + + +def test_transform_selected(): + X = [[3, 2, 1], [0, 1, 1]] + + X_expected = [[1, 2, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0]) + _check_transform_selected(X, X_expected, [True, False, False]) + + X_expected = [[1, 1, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0, 1, 2]) + _check_transform_selected(X, X_expected, [True, True, True]) + _check_transform_selected(X, X_expected, "all") + + _check_transform_selected(X, X, []) + _check_transform_selected(X, X, [False, False, False]) + + +def test_transform_selected_copy_arg(): + # transformer that alters X + def _mutating_transformer(X): + X[0, 0] = X[0, 0] + 1 + return X + + original_X = np.asarray([[1, 2], [3, 4]]) + expected_Xtr = [[2, 2], [3, 4]] + + X = original_X.copy() + Xtr = _transform_selected(X, _mutating_transformer, copy=True, + selected='all') + + assert_array_equal(toarray(X), toarray(original_X)) + assert_array_equal(toarray(Xtr), 
expected_Xtr)
+
+
+def _run_one_hot(X, X2, cat):
+    enc = OneHotEncoder(categorical_features=cat)
+    Xtr = enc.fit_transform(X)
+    X2tr = enc.transform(X2)
+    return Xtr, X2tr
+
+
+def _check_one_hot(X, X2, cat, n_features):
+    ind = np.where(cat)[0]
+    # With mask
+    A, B = _run_one_hot(X, X2, cat)
+    # With indices
+    C, D = _run_one_hot(X, X2, ind)
+    # Check shape
+    assert_equal(A.shape, (2, n_features))
+    assert_equal(B.shape, (1, n_features))
+    assert_equal(C.shape, (2, n_features))
+    assert_equal(D.shape, (1, n_features))
+    # Check that mask and indices give the same results
+    assert_array_equal(toarray(A), toarray(C))
+    assert_array_equal(toarray(B), toarray(D))
+
+
+def test_one_hot_encoder_categorical_features():
+    X = np.array([[3, 2, 1], [0, 1, 1]])
+    X2 = np.array([[1, 1, 1]])
+
+    cat = [True, False, False]
+    _check_one_hot(X, X2, cat, 4)
+
+    # Edge case: all non-categorical
+    cat = [False, False, False]
+    _check_one_hot(X, X2, cat, 3)
+
+    # Edge case: all categorical
+    cat = [True, True, True]
+    _check_one_hot(X, X2, cat, 5)
+
+
+def test_one_hot_encoder_unknown_transform():
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
+    y = np.array([[4, 1, 1]])
+
+    # Test that one hot encoder raises error for unknown features
+    # present during transform.
+    oh = OneHotEncoder(handle_unknown='error')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+    # Test the ignore option, ignores unknown features.
+    oh = OneHotEncoder(handle_unknown='ignore')
+    oh.fit(X)
+    assert_array_equal(
+        oh.transform(y).toarray(),
+        np.array([[0., 0., 0., 0., 1., 0., 0.]]))
+
+    # Raise error if handle_unknown is neither ignore nor error.
+    oh = OneHotEncoder(handle_unknown='42')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+
+def test_one_hot_encoder_invalid_handle_missing():
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
+
+    # Test that one hot encoder raises an error when an invalid
+    # 'handle_missing' value is specified together with missing-value
+    # handling.
+    oh = OneHotEncoder(handle_unknown='error', missing_values='NaN',
+                       handle_missing='abcde')
+    assert_raises(ValueError, oh.fit, X)
+
+
+def test_one_hot_encoder_missing_values_none_handle_missing_passed():
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
+    y = np.array([[4, 1, 1]])
+
+    # When 'missing_values' is None, 'handle_missing' is ignored and the
+    # encoder otherwise behaves as usual, e.g. unknown features still
+    # raise an error during transform.
+    oh = OneHotEncoder(handle_unknown='error', missing_values=None,
+                       handle_missing='abcde')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+
+def test_one_hot_encoder_handle_missing_all_zeros():
+    pass
+
+
+def test_one_hot_encoder_handle_missing_all_missing():
+    pass
+
+
+def test_one_hot_encoder_handle_missing_category():
+    pass
+
+
+def check_categorical_onehot(X):
+    enc = CategoricalEncoder(encoding='onehot')
+    Xtr1 = enc.fit_transform(X)
+
+    enc = CategoricalEncoder(encoding='onehot-dense')
+    Xtr2 = enc.fit_transform(X)
+
+    assert_allclose(Xtr1.toarray(), Xtr2)
+
+    assert sparse.isspmatrix_csr(Xtr1)
+    return Xtr1.toarray()
+
+
+def test_categorical_encoder_onehot():
+    X = [['abc', 1, 55], ['def', 2, 55]]
+
+    Xtr = check_categorical_onehot(np.array(X)[:, [0]])
+    assert_allclose(Xtr, [[1, 0], [0, 1]])
+
+    Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
+    assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
+
+    Xtr = CategoricalEncoder().fit_transform(X)
+    assert_allclose(Xtr.toarray(), [[1, 0, 1, 0, 1], [0, 1, 0, 1, 1]])
+
+
+def test_categorical_encoder_onehot_inverse():
+    for encoding in ['onehot', 'onehot-dense']:
+        X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
+        enc = CategoricalEncoder(encoding=encoding)
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        X = [[2, 55], [1, 55], [3, 55]]
+        enc = CategoricalEncoder(encoding=encoding)
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X)
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        # with unknown categories
+        X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
+        enc = CategoricalEncoder(encoding=encoding, handle_unknown='ignore',
+                                 categories=[['abc', 'def'], [1, 2],
+                                             [54, 55, 56]])
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        exp[2, 1] = None
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        # with an otherwise numerical output, still object if unknown
+        X = [[2, 55], [1, 55], [3, 55]]
+        enc = CategoricalEncoder(encoding=encoding,
+                                 categories=[[1, 2], [54, 56]],
+                                 handle_unknown='ignore')
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        exp[2, 0] = None
+        exp[:, 1] = None
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        # incorrect shape raises
+        X_tr = np.array([[0, 1, 1], [1, 0, 1]])
+        msg = re.escape('Shape of the passed X data is not correct')
+        assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
+
+
+def test_categorical_encoder_handle_unknown():
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    X2 = np.array([[7, 5, 3]])
+
+    # Test that encoder raises error for unknown features during transform.
+ enc = CategoricalEncoder() + enc.fit(X) + msg = re.escape('unknown categories [7] in column 0') + assert_raises_regex(ValueError, msg, enc.transform, X2) + + # With 'ignore' you get all 0's in result + enc = CategoricalEncoder(handle_unknown='ignore') + enc.fit(X) + X2_passed = X2.copy() + Xtr = enc.transform(X2_passed) + assert_allclose(Xtr.toarray(), [[0, 0, 0, 1, 1, 0]]) + # ensure transformed data was not modified in place + assert_allclose(X2, X2_passed) + + # Invalid option + enc = CategoricalEncoder(handle_unknown='invalid') + assert_raises(ValueError, enc.fit, X) + + +def test_categorical_encoder_categories(): + X = [['abc', 1, 55], ['def', 2, 55]] + + # order of categories should not depend on order of samples + for Xi in [X, X[::-1]]: + enc = CategoricalEncoder() + enc.fit(Xi) + assert enc.categories == 'auto' + assert isinstance(enc.categories_, list) + cat_exp = [['abc', 'def'], [1, 2], [55]] + for res, exp in zip(enc.categories_, cat_exp): + assert res.tolist() == exp + + +def test_categorical_encoder_specified_categories(): + X = np.array([['a', 'b']], dtype=object).T + + enc = CategoricalEncoder(categories=[['a', 'b', 'c']]) + exp = np.array([[1., 0., 0.], + [0., 1., 0.]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories[0] == ['a', 'b', 'c'] + assert enc.categories_[0].tolist() == ['a', 'b', 'c'] + assert np.issubdtype(enc.categories_[0].dtype, np.str_) + + # unsorted passed categories raises for now + enc = CategoricalEncoder(categories=[['c', 'b', 'a']]) + msg = re.escape('Unsorted categories are not yet supported') + assert_raises_regex(ValueError, msg, enc.fit_transform, X) + + # multiple columns + X = np.array([['a', 'b'], [0, 2]], dtype=object).T + enc = CategoricalEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]]) + exp = np.array([[1., 0., 0., 1., 0., 0.], + [0., 1., 0., 0., 0., 1.]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories_[0].tolist() == ['a', 'b', 'c'] + assert np.issubdtype(enc.categories_[0].dtype, np.str_) + assert enc.categories_[1].tolist() == [0, 1, 2] + assert np.issubdtype(enc.categories_[1].dtype, np.integer) + + # when specifying categories manually, unknown categories should already + # raise when fitting + X = np.array([['a', 'b', 'c']]).T + enc = CategoricalEncoder(categories=[['a', 'b']]) + assert_raises(ValueError, enc.fit, X) + enc = CategoricalEncoder(categories=[['a', 'b']], handle_unknown='ignore') + exp = np.array([[1., 0.], [0., 1.], [0., 0.]]) + assert_array_equal(enc.fit(X).transform(X).toarray(), exp) + + +def test_categorical_encoder_pandas(): + try: + import pandas as pd + except ImportError: + raise SkipTest("pandas is not installed") + + X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]}) + + Xtr = check_categorical_onehot(X_df) + assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]]) + + +def test_categorical_encoder_ordinal(): + X = [['abc', 2, 55], ['def', 1, 55]] + + enc = CategoricalEncoder(encoding='other') + assert_raises(ValueError, enc.fit, X) + + enc = CategoricalEncoder(encoding='ordinal', handle_unknown='ignore') + assert_raises(ValueError, enc.fit, X) + + enc = CategoricalEncoder(encoding='ordinal') + exp = np.array([[0, 1, 0], + [1, 0, 0]], dtype='int64') + assert_array_equal(enc.fit_transform(X), exp.astype('float64')) + enc = CategoricalEncoder(encoding='ordinal', dtype='int64') + assert_array_equal(enc.fit_transform(X), exp) + + +def test_categorical_encoder_ordinal_inverse(): + X = [['abc', 2, 55], ['def', 1, 55]] + enc = 
CategoricalEncoder(encoding='ordinal') + X_tr = enc.fit_transform(X) + exp = np.array(X, dtype=object) + assert_array_equal(enc.inverse_transform(X_tr), exp) + + # incorrect shape raises + X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]]) + msg = re.escape('Shape of the passed X data is not correct') + assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) + + +def test_categorical_encoder_dtypes(): + # check that dtypes are preserved when determining categories + enc = CategoricalEncoder() + exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64') + + for X in [np.array([[1, 2], [3, 4]], dtype='int64'), + np.array([[1, 2], [3, 4]], dtype='float64'), + np.array([['a', 'b'], ['c', 'd']]), # string dtype + np.array([[1, 'a'], [3, 'b']], dtype='object')]: + enc.fit(X) + assert all([enc.categories_[i].dtype == X.dtype for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, 2], [3, 4]] + enc.fit(X) + assert all([np.issubdtype(enc.categories_[i].dtype, np.integer) + for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, 'a'], [3, 'b']] + enc.fit(X) + assert all([enc.categories_[i].dtype == 'object' for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_categorical_encoder_dtypes_pandas(): + # check dtype (similar to test_categorical_encoder_dtypes for dataframes) + try: + import pandas as pd + except ImportError: + raise SkipTest("pandas is not installed") + + enc = CategoricalEncoder() + exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64') + + X = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, dtype='int64') + enc.fit(X) + assert all([enc.categories_[i].dtype == 'int64' for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b']}) + enc.fit(X) + assert all([enc.categories_[i].dtype == 'object' for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_categorical_encoder_warning(): + enc = CategoricalEncoder() + X = [['Male', 1], ['Female', 3]] + np.testing.assert_no_warnings(enc.fit_transform, X) + + def test_fit_cold_start(): X = iris.data X_2d = X[:, :2] @@ -2004,26 +2305,13 @@ def test_quantile_transform_valid_axis(): ". 
Got axis=2", quantile_transform, X.T, axis=2) -@pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) -def test_power_transformer_notfitted(method): - pt = PowerTransformer(method=method) +def test_power_transformer_notfitted(): + pt = PowerTransformer(method='box-cox') X = np.abs(X_1col) assert_raises(NotFittedError, pt.transform, X) assert_raises(NotFittedError, pt.inverse_transform, X) -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -@pytest.mark.parametrize('standardize', [True, False]) -@pytest.mark.parametrize('X', [X_1col, X_2d]) -def test_power_transformer_inverse(method, standardize, X): - # Make sure we get the original input when applying transform and then - # inverse transform - X = np.abs(X) if method == 'box-cox' else X - pt = PowerTransformer(method=method, standardize=standardize) - X_trans = pt.fit_transform(X) - assert_almost_equal(X, pt.inverse_transform(X_trans)) - - def test_power_transformer_1d(): X = np.abs(X_1col) @@ -2075,12 +2363,11 @@ def test_power_transformer_2d(): assert isinstance(pt.lambdas_, np.ndarray) -def test_power_transformer_boxcox_strictly_positive_exception(): - # Exceptions should be raised for negative arrays and zero arrays when - # method is boxcox - +def test_power_transformer_strictly_positive_exception(): pt = PowerTransformer(method='box-cox') pt.fit(np.abs(X_2d)) + + # Exceptions should be raised for negative arrays and zero arrays X_with_negatives = X_2d not_positive_message = 'strictly positive' @@ -2091,7 +2378,7 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit, X_with_negatives) assert_raise_message(ValueError, not_positive_message, - power_transform, X_with_negatives, 'box-cox') + power_transform, X_with_negatives) assert_raise_message(ValueError, not_positive_message, pt.transform, np.zeros(X_2d.shape)) @@ -2100,19 +2387,11 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit, np.zeros(X_2d.shape)) assert_raise_message(ValueError, not_positive_message, - power_transform, np.zeros(X_2d.shape), 'box-cox') + power_transform, np.zeros(X_2d.shape)) -@pytest.mark.parametrize('X', [X_2d, np.abs(X_2d), -np.abs(X_2d), - np.zeros(X_2d.shape)]) -def test_power_transformer_yeojohnson_any_input(X): - # Yeo-Johnson method should support any kind of input - power_transform(X, method='yeo-johnson') - - -@pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) -def test_power_transformer_shape_exception(method): - pt = PowerTransformer(method=method) +def test_power_transformer_shape_exception(): + pt = PowerTransformer(method='box-cox') X = np.abs(X_2d) pt.fit(X) @@ -2145,136 +2424,3 @@ def test_power_transformer_lambda_zero(): pt.lambdas_ = np.array([0]) X_trans = pt.transform(X) assert_array_almost_equal(pt.inverse_transform(X_trans), X) - - -def test_power_transformer_lambda_one(): - # Make sure lambda = 1 corresponds to the identity for yeo-johnson - pt = PowerTransformer(method='yeo-johnson', standardize=False) - X = np.abs(X_2d)[:, 0:1] - - pt.lambdas_ = np.array([1]) - X_trans = pt.transform(X) - assert_array_almost_equal(X_trans, X) - - -@pytest.mark.parametrize("method, lmbda", [('box-cox', .1), - ('box-cox', .5), - ('yeo-johnson', .1), - ('yeo-johnson', .5), - ('yeo-johnson', 1.), - ]) -def test_optimization_power_transformer(method, lmbda): - # Test the optimization procedure: - # - set a predefined value for lambda - # - apply inverse_transform to a normal dist (we get X_inv) - # - apply fit_transform to X_inv (we get X_inv_trans) - # - check that X_inv_trans 
is roughly equal to X - - rng = np.random.RandomState(0) - n_samples = 20000 - X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) - - pt = PowerTransformer(method=method, standardize=False) - pt.lambdas_ = [lmbda] - X_inv = pt.inverse_transform(X) - - pt = PowerTransformer(method=method, standardize=False) - X_inv_trans = pt.fit_transform(X_inv) - - assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, - decimal=2) - assert_almost_equal(0, X_inv_trans.mean(), decimal=1) - assert_almost_equal(1, X_inv_trans.std(), decimal=1) - - -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -def test_power_transformer_nans(method): - # Make sure lambda estimation is not influenced by NaN values - # and that transform() supports NaN silently - - X = np.abs(X_1col) - pt = PowerTransformer(method=method) - pt.fit(X) - lmbda_no_nans = pt.lambdas_[0] - - # concat nans at the end and check lambda stays the same - X = np.concatenate([X, np.full_like(X, np.nan)]) - X = shuffle(X, random_state=0) - - pt.fit(X) - lmbda_nans = pt.lambdas_[0] - - assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5) - - X_trans = pt.transform(X) - assert_array_equal(np.isnan(X_trans), np.isnan(X)) - - -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -@pytest.mark.parametrize('standardize', [True, False]) -def test_power_transformer_fit_transform(method, standardize): - # check that fit_transform() and fit().transform() return the same values - X = X_1col - if method == 'box-cox': - X = np.abs(X) - - pt = PowerTransformer(method, standardize) - assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) - - -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -@pytest.mark.parametrize('standardize', [True, False]) -def test_power_transformer_copy_True(method, standardize): - # Check that neither fit, transform, fit_transform nor inverse_transform - # modify X inplace when copy=True - X = X_1col - if method == 'box-cox': - X = np.abs(X) - - X_original = X.copy() - assert X is not X_original # sanity checks - assert_array_almost_equal(X, X_original) - - pt = PowerTransformer(method, standardize, copy=True) - - pt.fit(X) - assert_array_almost_equal(X, X_original) - X_trans = pt.transform(X) - assert X_trans is not X - - X_trans = pt.fit_transform(X) - assert_array_almost_equal(X, X_original) - assert X_trans is not X - - X_inv_trans = pt.inverse_transform(X_trans) - assert X_trans is not X_inv_trans - - -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -@pytest.mark.parametrize('standardize', [True, False]) -def test_power_transformer_copy_False(method, standardize): - # check that when copy=False fit doesn't change X inplace but transform, - # fit_transform and inverse_transform do. - X = X_1col - if method == 'box-cox': - X = np.abs(X) - - X_original = X.copy() - assert X is not X_original # sanity checks - assert_array_almost_equal(X, X_original) - - pt = PowerTransformer(method, standardize, copy=False) - - pt.fit(X) - assert_array_almost_equal(X, X_original) # fit didn't change X - - X_trans = pt.transform(X) - assert X_trans is X - - if method == 'box-cox': - X = np.abs(X) - X_trans = pt.fit_transform(X) - assert X_trans is X - - X_inv_trans = pt.inverse_transform(X_trans) - assert X_trans is X_inv_trans
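A sketch of the intended end-to-end usage of the new parameters, assuming
the placeholder branches above are eventually implemented. The expected
output shown for 'category' is derived from the documentation in this patch,
not verified behaviour, which is why the encoding call is left commented
out:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # 'missing_values' and 'handle_missing' are the parameters added by
    # this patch; their handling is still a `pass` placeholder.
    enc = OneHotEncoder(sparse=False, missing_values='NaN',
                        handle_missing='category')
    X = np.array([[0.], [1.], [np.nan]])
    # out = enc.fit_transform(X)
    # With 'category', NaN is expected to become a third one-hot column:
    #   [[1., 0., 0.],
    #    [0., 1., 0.],
    #    [0., 0., 1.]]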