From ea98484484fe76d55a642e58aaa8ec8013ff06b7 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Tue, 29 Mar 2016 22:30:15 -0400
Subject: [PATCH 01/36] Refactored OneHotEncoder to work with strings

---
 doc/modules/preprocessing.rst            |  34 ++-
 sklearn/preprocessing/data.py            | 346 +++++++++++++----------
 sklearn/preprocessing/tests/test_data.py |  66 +++--
 3 files changed, 264 insertions(+), 182 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 709239687158e..24df41f2966fa 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -397,31 +397,37 @@ only one active.
 Continuing the example above::
 
   >>> enc = preprocessing.OneHotEncoder()
-  >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  # doctest: +ELLIPSIS
+  >>> enc.fit([['female', 'from US', 'uses Chrome'],
+  ... ['male', 'from Asia', 'uses Firefox']])  # doctest: +ELLIPSIS
   OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-         handle_unknown='error', n_values='auto', sparse=True)
-  >>> enc.transform([[0, 1, 3]]).toarray()
-  array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
+         handle_unknown='error', n_values=None, sparse=True, values='auto')
+  >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray()
+  array([[ 1.,  0.,  1.,  0.,  0.,  1.]])
 
 By default, how many values each feature can take is inferred automatically from the dataset.
-It is possible to specify this explicitly using the parameter ``n_values``.
+It is possible to specify this explicitly using the parameter ``values``.
 There are two genders, three possible continents and four web browsers in our
 dataset.
 Then we fit the estimator, and transform a data point.
-In the result, the first two numbers encode the gender, the next set of three
-numbers the continent and the last four the web browser.
+In the result, the first two values are genders, the next set of three
+values are the continents and the last values are web browsers.
 
 Note that, if there is a possibilty that the training data might have missing categorical
 features, one has to explicitly set ``n_values``. For example,
 
-    >>> enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])
-    >>> # Note that there are missing categorical values for the 2nd and 3rd
-    >>> # features
-    >>> enc.fit([[1, 2, 3], [0, 2, 0]])  # doctest: +ELLIPSIS
+    >>> browsers = ['uses Internet Explorer', 'uses Chrome' , 'uses Safari', 'uses Firefox']
+    >>> genders = ['male', 'female']
+    >>> locations = ['from Europe', 'from Asia', 'from US']
+    >>> enc = preprocessing.OneHotEncoder(values=[genders, locations, browsers])
+    >>> # Note that there are missing categorical values for the 2nd and 3rd
+    >>> # features
+    >>> enc.fit([['female', 'from US', 'uses Chrome'],
+    ... ['male', 'from Asia', 'uses Internet Explorer']])  # doctest: +ELLIPSIS
     OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values=[2, 3, 4], sparse=True)
-    >>> enc.transform([[1, 0, 0]]).toarray()
-    array([[ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.]])
+           handle_unknown='error', n_values=None, sparse=True,
+	   values=[...])
+    >>> enc.transform([['male', 'from Europe', 'uses Safari']]).toarray()
+    array([[ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
 
 See :ref:`dict_feature_extraction` for categorical features that are represented
 as a dict, not as integers.
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 093137d078000..1fe0741d9db13 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -26,6 +26,8 @@
                                  mean_variance_axis, incr_mean_variance_axis,
                                  min_max_axis)
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
+from .label import LabelEncoder
+from ..utils.fixes import np_version
 
 
 zip = six.moves.zip
@@ -1618,28 +1620,29 @@ def add_dummy_feature(X, value=1.0):
         return np.hstack((np.ones((n_samples, 1)) * value, X))
 
 
-def _transform_selected(X, transform, selected="all", copy=True):
-    """Apply a transform function to portion of selected features
-
+def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
+                    return_val=True):
+    """Apply a function to portion of selected features
     Parameters
     ----------
-    X : {array-like, sparse matrix}, shape [n_samples, n_features]
+    X : {array, sparse matrix}, shape [n_samples, n_features]
         Dense array or sparse matrix.
-
     transform : callable
         A callable transform(X) -> X_transformed
-
     copy : boolean, optional
         Copy X even if it could be avoided.
-
     selected: "all" or array of indices or mask
         Specify which features to apply the transform to.
-
+    return_val : boolean, optional
+        Whether to return the transformed matrix. If not set `None` is
+        returned.
     Returns
     -------
-    X : array or sparse matrix, shape=(n_samples, n_features_new)
+        X : array or sparse matrix, shape=(n_samples, n_features_new)
     """
-    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)
+
+    if copy:
+        X = X.copy()
 
     if isinstance(selected, six.string_types) and selected == "all":
         return transform(X)
@@ -1662,22 +1665,22 @@ def _transform_selected(X, transform, selected="all", copy=True):
         return transform(X)
     else:
         X_sel = transform(X[:, ind[sel]])
-        X_not_sel = X[:, ind[not_sel]]
+        X_not_sel = X[:, ind[not_sel]].astype(dtype)
 
-        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
-            return sparse.hstack((X_sel, X_not_sel))
-        else:
-            return np.hstack((X_sel, X_not_sel))
+        if return_val:
+            if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
+                return sparse.hstack((X_sel, X_not_sel))
+            else:
+                return np.hstack((X_sel, X_not_sel))
 
 
 class OneHotEncoder(BaseEstimator, TransformerMixin):
     """Encode categorical integer features using a one-hot aka one-of-K scheme.
 
-    The input to this transformer should be a matrix of integers, denoting
-    the values taken on by categorical (discrete) features. The output will be
-    a sparse matrix where each column corresponds to one possible value of one
-    feature. It is assumed that input features take on values in the range
-    [0, n_values).
+    The input to this transformer should be a matrix of integers or strings,
+    denoting the values taken on by categorical (discrete) features. The
+    output will be a sparse matrix where each column corresponds to one
+    possible value of one feature.
 
     This encoding is needed for feeding categorical data to many scikit-learn
     estimators, notably linear models and SVMs with the standard kernels.
@@ -1689,15 +1692,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    n_values : 'auto', int or array of ints
-        Number of values per feature.
-
-        - 'auto' : determine value range from training data.
-        - int : number of categorical values per feature.
-                Each feature value should be in ``range(n_values)``
-        - array : ``n_values[i]`` is the number of categorical values in
-                  ``X[:, i]``. Each feature value should be
-                  in ``range(n_values[i])``
+    values : 'auto', int, list of ints, or list of lists of objects
+        - 'auto' : determine set of values from training data.
+        - int : values are in ``range(values)`` for all features
+        - list of ints : values for feature ``i`` are in ``range(values[i])``
+        - list of lists : values for feature ``i`` are in ``values[i]``
 
     categorical_features : "all" or array of indices or mask
         Specify what features are treated as categorical.
@@ -1720,18 +1719,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    active_features_ : array
-        Indices for active features, meaning values that actually occur
-        in the training set. Only available when n_values is ``'auto'``.
-
-    feature_indices_ : array of shape (n_features,)
-        Indices to feature ranges.
-        Feature ``i`` in the original data is mapped to features
-        from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
-        (and then potentially masked by `active_features_` afterwards)
-
-    n_values_ : array of shape (n_features,)
-        Maximum number of values per feature.
+    label_encoders_ : list of size n_features.
+        The :class:`sklearn.preprocessing.LabelEncoder` objects used to encode
+        the features. ``self.label_encoders_[i]`` is the LabelEncoder object
+        used to encode the ith column. The unique features found on column
+        ``i`` can be accessed using ``self.label_encoders_[i].classes_``.
 
     Examples
     --------
@@ -1741,16 +1733,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     >>> from sklearn.preprocessing import OneHotEncoder
     >>> enc = OneHotEncoder()
-    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
-[1, 0, 2]])  # doctest: +ELLIPSIS
+    >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) # doctest: +ELLIPSIS
     OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values='auto', sparse=True)
-    >>> enc.n_values_
-    array([2, 3, 4])
-    >>> enc.feature_indices_
-    array([0, 2, 5, 9])
-    >>> enc.transform([[0, 1, 1]]).toarray()
-    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])
+           handle_unknown='error', n_values=None, sparse=True, values='auto')
+    >>> list(enc.label_encoders_[0].classes_)
+    ['cat', 'dog', 'mouse']
+    >>> enc.transform([['dog', 4]]).toarray()
+    array([[ 0.,  1.,  0.,  1.,  0.,  0.]])
 
     See also
     --------
@@ -1766,138 +1755,207 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     sklearn.preprocessing.LabelEncoder : encodes labels with values between 0
       and n_classes-1.
     """
-    def __init__(self, n_values="auto", categorical_features="all",
-                 dtype=np.float64, sparse=True, handle_unknown='error'):
-        self.n_values = n_values
+
+    def __init__(self, categorical_features="all", n_values=None,
+                 values='auto', dtype=np.float64, sparse=True,
+                 handle_unknown='error'):
         self.categorical_features = categorical_features
         self.dtype = dtype
         self.sparse = sparse
         self.handle_unknown = handle_unknown
+        self.n_values = n_values
+        self.values = values
 
     def fit(self, X, y=None):
-        """Fit OneHotEncoder to X.
+        """Fit the OneHotEncoder to X.
 
         Parameters
         ----------
         X : array-like, shape [n_samples, n_feature]
-            Input array of type int.
+            Array of ints or strings or both.
 
         Returns
         -------
         self
         """
-        self.fit_transform(X)
+
+        X = check_array(X, dtype=np.object, accept_sparse='csc')
+        n_samples, n_features = X.shape
+
+        _apply_selected(X, self._fit, dtype=self.dtype,
+                        selected=self.categorical_features, copy=True,
+                        return_val=False)
         return self
 
-    def _fit_transform(self, X):
-        """Assumes X contains only categorical features."""
-        X = check_array(X, dtype=np.int)
-        if np.any(X < 0):
-            raise ValueError("X needs to contain only non-negative integers.")
+    def _fit(self, X):
+        "Assumes `X` contains only cetergorical features."
+
+        X = check_array(X, dtype=np.object)
         n_samples, n_features = X.shape
-        if (isinstance(self.n_values, six.string_types) and
-                self.n_values == 'auto'):
-            n_values = np.max(X, axis=0) + 1
-        elif isinstance(self.n_values, numbers.Integral):
-            if (np.max(X, axis=0) >= self.n_values).any():
-                raise ValueError("Feature out of bounds for n_values=%d"
-                                 % self.n_values)
-            n_values = np.empty(n_features, dtype=np.int)
-            n_values.fill(self.n_values)
-        else:
-            try:
-                n_values = np.asarray(self.n_values, dtype=int)
-            except (ValueError, TypeError):
-                raise TypeError("Wrong type for parameter `n_values`. Expected"
-                                " 'auto', int or array of ints, got %r"
-                                % type(X))
-            if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
-                raise ValueError("Shape mismatch: if n_values is an array,"
-                                 " it has to be of shape (n_features,).")
-
-        self.n_values_ = n_values
-        n_values = np.hstack([[0], n_values])
-        indices = np.cumsum(n_values)
-        self.feature_indices_ = indices
 
-        column_indices = (X + indices[:-1]).ravel()
-        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
-                                n_features)
-        data = np.ones(n_samples * n_features)
-        out = sparse.coo_matrix((data, (row_indices, column_indices)),
-                                shape=(n_samples, indices[-1]),
-                                dtype=self.dtype).tocsr()
+        self._n_features = n_features
+        self.label_encoders_ = [LabelEncoder() for i in range(n_features)]
 
-        if (isinstance(self.n_values, six.string_types) and
-                self.n_values == 'auto'):
-            mask = np.array(out.sum(axis=0)).ravel() != 0
-            active_features = np.where(mask)[0]
-            out = out[:, active_features]
-            self.active_features_ = active_features
+        if self.n_values is not None:
+            warnings.warn('The parameter `n_values` is deprecated, use the'
+                          'parameter `classes_` instead and specify the '
+                          'expected values for each feature')
 
-        return out if self.sparse else out.toarray()
+            if isinstance(self.n_values, numbers.Integral):
+                if (np.max(X, axis=0) >= self.n_values).any():
+                    raise ValueError("Feature out of bounds for n_values=%d"
+                                     % self.n_values)
+                self.values = self.n_values
+            else:
+                try:
+                    n_values = np.asarray(self.n_values, dtype=int)
+                except (ValueError, TypeError):
+                    raise TypeError("Wrong type for parameter `n_values`."
+                                    " Expected 'auto', int or array of ints,"
+                                    "got %r" % type(X))
+                if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
+                    raise ValueError("Shape mismatch: if n_values is an array,"
+                                     " it has to be of shape (n_features,).")
+                self.values = list(self.n_values)
+
+        error_msg = ("`values` should be 'auto', an integer, a list of"
+                     " integers or a list of list")
+
+        for i in range(n_features):
+            le = self.label_encoders_[i]
+            if self.values == 'auto':
+                le.fit(X[:, i])
+            elif isinstance(self.values, numbers.Integral):
+                if (np.max(X, axis=0) >= self.values).any():
+                    raise ValueError("Feature out of bounds for n_values=%d"
+                                     % self.values)
+                le.fit(np.arange(self.values, dtype=np.int))
+            elif isinstance(self.values, list):
+                if len(self.values) != X.shape[1]:
+                    raise ValueError("Shape mismatch: if n_values is a list,"
+                                     " it has to be of length (n_features).")
+                if isinstance(self.values[i], list):
+                    le.fit(self.values[i])
+                elif isinstance(self.values[i], numbers.Integral):
+                    le.fit(np.arange(self.values[i], dtype=np.int))
+                else:
+                    raise ValueError(error_msg)
+            else:
+                raise ValueError(error_msg)
+
+    def transform(self, X, y=None):
+        """Encode the selected categorical features using the one-hot scheme.
 
-    def fit_transform(self, X, y=None):
-        """Fit OneHotEncoder to X, then transform X.
+        Parameters
+        ----------
+        X : array-like, shape [n_samples, n_feature]
+            Array of ints or strings or both.
 
-        Equivalent to self.fit(X).transform(X), but more convenient and more
-        efficient. See fit for the parameters, transform for the return value.
+        Returns
+        -------
+        out : array, shape[n_samples, n_features_new]
+            `X` encoded using the one-hot scheme.
         """
-        return _transform_selected(X, self._fit_transform,
-                                   self.categorical_features, copy=True)
+        X = check_array(X, dtype=np.object)
+
+        return _apply_selected(X, self._transform, copy=True,
+                               selected=self.categorical_features)
 
     def _transform(self, X):
-        """Assumes X contains only categorical features."""
-        X = check_array(X, dtype=np.int)
-        if np.any(X < 0):
-            raise ValueError("X needs to contain only non-negative integers.")
+        "Assumes `X` contains only categorical features."
+
+        X = check_array(X, accept_sparse='csc', dtype=np.object)
         n_samples, n_features = X.shape
+        X_int = np.zeros_like(X, dtype=np.int)
+        X_mask = np.ones_like(X, dtype=np.bool)
+
+        for i in range(n_features):
+            if np_version < (1, 8):
+                # in1d is not supported for object datatype in np < 1.8
+                valid_mask = np.ones_like(X[:, i], dtype=np.bool)
+                found_classes = set(np.unique(X[:, i]))
+                valid_classes = set(self.label_encoders_[i].classes_)
+                invalid_classes = found_classes - valid_classes
+
+                for item in invalid_classes:
+                    mask = X[:, i] == item
+                    np.logical_not(mask, mask)
+                    np.logical_and(valid_mask, mask, valid_mask)
+
+            else:
+                valid_mask = np.in1d(X[:, i], self.label_encoders_[i].classes_)
+
+            if not np.all(valid_mask):
+
+                if self.handle_unknown == 'error':
+                    if np_version < (1, 8):
+                        valid_classes = set(self.label_encoders_[i].classes_)
+                        diff = set(X[:, i]) - valid_classes
+                        diff = list(diff)
+                    else:
+                        diff = np.setdiff1d(X[:, i],
+                                            self.label_encoders_[i].classes_)
+                    msg = 'Unknown feature(s) %s in column %d' % (diff, i)
+                    raise ValueError(msg)
+                elif self.handle_unknown == 'ignore':
+                    # Set the problematic rows to an acceptable value and
+                    # continue. The rows are marked in `X_mask` and will be
+                    # removed later.
+                    X_mask[:, i] = valid_mask
+                    X[:, i][~valid_mask] = self.label_encoders_[i].classes_[0]
+                else:
+                    template = ("handle_unknown should be either 'error' or "
+                                "'ignore', got %s")
+                    raise ValueError(template % self.handle_unknown)
+
+            X_int[:, i] = self.label_encoders_[i].transform(X[:, i])
 
-        indices = self.feature_indices_
-        if n_features != indices.shape[0] - 1:
-            raise ValueError("X has different shape than during fitting."
-                             " Expected %d, got %d."
-                             % (indices.shape[0] - 1, n_features))
-
-        # We use only those categorical features of X that are known using fit.
-        # i.e lesser than n_values_ using mask.
-        # This means, if self.handle_unknown is "ignore", the row_indices and
-        # col_indices corresponding to the unknown categorical feature are
-        # ignored.
-        mask = (X < self.n_values_).ravel()
-        if np.any(~mask):
-            if self.handle_unknown not in ['error', 'ignore']:
-                raise ValueError("handle_unknown should be either error or "
-                                 "unknown got %s" % self.handle_unknown)
-            if self.handle_unknown == 'error':
-                raise ValueError("unknown categorical feature present %s "
-                                 "during transform." % X.ravel()[~mask])
-
-        column_indices = (X + indices[:-1]).ravel()[mask]
+        mask = X_mask.ravel()
+        n_values = [le.classes_.shape[0] for le in self.label_encoders_]
+        n_values = np.hstack([[0], n_values])
+        indices = np.cumsum(n_values)
+
+        column_indices = (X_int + indices[:-1]).ravel()[mask]
         row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                 n_features)[mask]
-        data = np.ones(np.sum(mask))
+        data = np.ones(n_samples * n_features)[mask]
+
         out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                 shape=(n_samples, indices[-1]),
                                 dtype=self.dtype).tocsr()
+
         if (isinstance(self.n_values, six.string_types) and
                 self.n_values == 'auto'):
             out = out[:, self.active_features_]
 
         return out if self.sparse else out.toarray()
 
-    def transform(self, X):
-        """Transform X using one-hot encoding.
+    @property
+    def active_features_(self):
+        warnings.warn('The property `active_features_` is deprecated and'
+                      ' will be removed in version 0.20')
+        if self.n_values is None:
+            #TODO: What to do when classes are strings ?
+            classes = [le.classes_ for le in self.label_encoders_]
+            classes_max = [np.max(cls) + 1 for cls in classes]
+            cum_idx = np.cumsum([0] + classes_max)
+            active_idx = [self.label_encoders_[i].classes_.astype(np.int)
+                          + cum_idx[i]
+                          for i in range(self._n_features)]
+
+            return np.concatenate(active_idx, axis=0).astype(np.int)
+        else:
+            raise AttributeError()
 
-        Parameters
-        ----------
-        X : array-like, shape [n_samples, n_features]
-            Input array of type int.
+    @property
+    def feature_indices_(self):
+        warnings.warn('The property `feature_indices_` is deprecated and'
+                      ' will be removed in version 0.20')
+        classes_max = [np.max(le.classes_) + 1 for le in self.label_encoders_]
+        return np.cumsum([0] + classes_max)
 
-        Returns
-        -------
-        X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
-            Transformed input.
-        """
-        return _transform_selected(X, self._transform,
-                                   self.categorical_features, copy=True)
+    @property
+    def n_values_(self):
+        warnings.warn('The property `n_values_` is deprecated and'
+                      ' will be removed in version 0.20')
+        return np.array([le.classes_.shape[0] for le in self.label_encoders_])
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 7a51049b60242..13bcf6c8e04ac 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -6,6 +6,7 @@
 # License: BSD 3 clause
 
 import warnings
+import re
 import numpy as np
 import numpy.linalg as la
 from scipy import sparse
@@ -31,7 +32,7 @@
 from sklearn.utils.testing import skip_if_32bit
 
 from sklearn.utils.sparsefuncs import mean_variance_axis
-from sklearn.preprocessing.data import _transform_selected
+from sklearn.preprocessing.data import _apply_selected
 from sklearn.preprocessing.data import _handle_zeros_in_scale
 from sklearn.preprocessing.data import Binarizer
 from sklearn.preprocessing.data import KernelCenterer
@@ -1488,9 +1489,10 @@ def test_one_hot_encoder_sparse():
     # test that an error is raised when out of bounds:
     X_too_large = [[0, 2, 1], [0, 1, 1]]
     assert_raises(ValueError, enc.transform, X_too_large)
-    error_msg = "unknown categorical feature present \[2\] during transform."
+    error_msg = re.escape("Unknown feature(s) [2] in column 1")
     assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large)
     assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)
+    assert_raises(ValueError, OneHotEncoder(values=2).fit_transform, X)
 
     # test that error is raised when wrong number of features
     assert_raises(ValueError, enc.transform, X[:, :-1])
@@ -1500,14 +1502,6 @@ def test_one_hot_encoder_sparse():
     # test exception on wrong init param
     assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)
 
-    enc = OneHotEncoder()
-    # test negative input to fit
-    assert_raises(ValueError, enc.fit, [[0], [-1]])
-
-    # test negative input to transform
-    enc.fit([[0], [1]])
-    assert_raises(ValueError, enc.transform, [[0], [-1]])
-
 
 def test_one_hot_encoder_dense():
     # check for sparse=False
@@ -1526,26 +1520,26 @@ def test_one_hot_encoder_dense():
                                  [1., 0., 1., 0., 1.]]))
 
 
-def _check_transform_selected(X, X_expected, sel):
+def _check_apply_selected(X, X_expected, sel):
     for M in (X, sparse.csr_matrix(X)):
-        Xtr = _transform_selected(M, Binarizer().transform, sel)
+        Xtr = _apply_selected(M, Binarizer().transform, sel)
         assert_array_equal(toarray(Xtr), X_expected)
 
 
 def test_transform_selected():
-    X = [[3, 2, 1], [0, 1, 1]]
+    X = np.array([[3, 2, 1], [0, 1, 1]])
 
     X_expected = [[1, 2, 1], [0, 1, 1]]
-    _check_transform_selected(X, X_expected, [0])
-    _check_transform_selected(X, X_expected, [True, False, False])
+    _check_apply_selected(X, X_expected, [0])
+    _check_apply_selected(X, X_expected, [True, False, False])
 
     X_expected = [[1, 1, 1], [0, 1, 1]]
-    _check_transform_selected(X, X_expected, [0, 1, 2])
-    _check_transform_selected(X, X_expected, [True, True, True])
-    _check_transform_selected(X, X_expected, "all")
+    _check_apply_selected(X, X_expected, [0, 1, 2])
+    _check_apply_selected(X, X_expected, [True, True, True])
+    _check_apply_selected(X, X_expected, "all")
 
-    _check_transform_selected(X, X, [])
-    _check_transform_selected(X, X, [False, False, False])
+    _check_apply_selected(X, X, [])
+    _check_apply_selected(X, X, [False, False, False])
 
 
 def test_transform_selected_copy_arg():
@@ -1558,8 +1552,8 @@ def _mutating_transformer(X):
     expected_Xtr = [[2, 2], [3, 4]]
 
     X = original_X.copy()
-    Xtr = _transform_selected(X, _mutating_transformer, copy=True,
-                              selected='all')
+    Xtr = _apply_selected(X, _mutating_transformer, copy=True,
+                          selected='all')
 
     assert_array_equal(toarray(X), toarray(original_X))
     assert_array_equal(toarray(Xtr), expected_Xtr)
@@ -1588,9 +1582,17 @@ def _check_one_hot(X, X2, cat, n_features):
     assert_array_equal(toarray(B), toarray(D))
 
 
+def test_one_hot_encoder_string():
+    X = [['cat', 'domestic'], ['wolf', 'wild']]
+    enc = OneHotEncoder()
+    enc.fit(X)
+    Xtr = enc.transform([['cat', 'wild']])
+    assert_array_equal(toarray(Xtr), [[1, 0, 0, 1]])
+
+
 def test_one_hot_encoder_categorical_features():
     X = np.array([[3, 2, 1], [0, 1, 1]])
-    X2 = np.array([[1, 1, 1]])
+    X2 = np.array([[3, 1, 1]])
 
     cat = [True, False, False]
     _check_one_hot(X, X2, cat, 4)
@@ -1621,7 +1623,23 @@ def test_one_hot_encoder_unknown_transform():
         oh.transform(y).toarray(),
         np.array([[0.,  0.,  0.,  0.,  1.,  0.,  0.]]))
 
-    # Raise error if handle_unknown is neither ignore or error.
+    X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]])
+    y = np.array([['ET', 1, 1]])
+
+    # Test that one hot encoder raises error for unknown features
+    # present during transform.
+    oh = OneHotEncoder(handle_unknown='error')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+    # Test the ignore option, ignores unknown features.
+    oh = OneHotEncoder(handle_unknown='ignore')
+    oh.fit(X)
+    assert_array_equal(
+        oh.transform(y).toarray(),
+        np.array([[0.,  0.,  0., 0.,  0.,  1.,  0.,  0.]]))
+
+    # Raise error if handle_unknown is neither ignore nor error.
     oh = OneHotEncoder(handle_unknown='42')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)

From e03b5c7e3e0be4b53c4d9b22deaab3a51557f87b Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Mon, 2 May 2016 11:45:26 -0400
Subject: [PATCH 02/36] ported functions to fixes.py

---
 sklearn/preprocessing/data.py | 32 +++++--------------------
 sklearn/utils/fixes.py        | 44 +++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 1fe0741d9db13..421e4d66073e5 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -27,7 +27,7 @@
                                  min_max_axis)
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
 from .label import LabelEncoder
-from ..utils.fixes import np_version
+from ..utils.fixes import in1d, setdiff1d
 
 
 zip = six.moves.zip
@@ -1651,10 +1651,8 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
         return X
 
     n_features = X.shape[1]
-    ind = np.arange(n_features)
     sel = np.zeros(n_features, dtype=bool)
     sel[np.asarray(selected)] = True
-    not_sel = np.logical_not(sel)
     n_selected = np.sum(sel)
 
     if n_selected == 0:
@@ -1664,8 +1662,8 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
         # All features selected.
         return transform(X)
     else:
-        X_sel = transform(X[:, ind[sel]])
-        X_not_sel = X[:, ind[not_sel]].astype(dtype)
+        X_sel = transform(X[:, sel])
+        X_not_sel = X[:, ~sel].astype(dtype)
 
         if return_val:
             if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
@@ -1788,7 +1786,7 @@ def fit(self, X, y=None):
         return self
 
     def _fit(self, X):
-        "Assumes `X` contains only cetergorical features."
+        "Assumes `X` contains only categorical features."
 
         X = check_array(X, dtype=np.object)
         n_samples, n_features = X.shape
@@ -1870,31 +1868,13 @@ def _transform(self, X):
         X_mask = np.ones_like(X, dtype=np.bool)
 
         for i in range(n_features):
-            if np_version < (1, 8):
-                # in1d is not supported for object datatype in np < 1.8
-                valid_mask = np.ones_like(X[:, i], dtype=np.bool)
-                found_classes = set(np.unique(X[:, i]))
-                valid_classes = set(self.label_encoders_[i].classes_)
-                invalid_classes = found_classes - valid_classes
-
-                for item in invalid_classes:
-                    mask = X[:, i] == item
-                    np.logical_not(mask, mask)
-                    np.logical_and(valid_mask, mask, valid_mask)
 
-            else:
-                valid_mask = np.in1d(X[:, i], self.label_encoders_[i].classes_)
+            valid_mask = in1d(X[:, i], self.label_encoders_[i].classes_)
 
             if not np.all(valid_mask):
 
                 if self.handle_unknown == 'error':
-                    if np_version < (1, 8):
-                        valid_classes = set(self.label_encoders_[i].classes_)
-                        diff = set(X[:, i]) - valid_classes
-                        diff = list(diff)
-                    else:
-                        diff = np.setdiff1d(X[:, i],
-                                            self.label_encoders_[i].classes_)
+                    diff = setdiff1d(X[:, i], self.label_encoders_[i].classes_)
                     msg = 'Unknown feature(s) %s in column %d' % (diff, i)
                     raise ValueError(msg)
                 elif self.handle_unknown == 'ignore':
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index d789d5f525cd4..08c7edc3c28a1 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -217,13 +217,32 @@ def frombuffer_empty(buf, dtype):
     frombuffer_empty = np.frombuffer
 
 
+def _in1d_object(ar1, ar2, invert=False):
+    # np.argsort(kind='mergesort') is only supported for object types after
+    # version 1.8. Hence in1d for object arrays needs to be handled differently
+    values1 = set(ar1)
+    values2 = set(ar2)
+    abset_values = values1 - values2
+
+    present = np.ones_like(ar1, dtype=np.bool)
+
+    for value in abset_values:
+        present[ar1 == value] = False
+
+    return ~present if invert else present
+
+
 if np_version < (1, 8):
     def in1d(ar1, ar2, assume_unique=False, invert=False):
         # Backport of numpy function in1d 1.8.1 to support numpy 1.6.2
         # Ravel both arrays, behavior for the first array could be different
+
         ar1 = np.asarray(ar1).ravel()
         ar2 = np.asarray(ar2).ravel()
 
+        if ar1.dtype == object or ar2.dtype == object:
+            return _in1d_object(ar1, ar2, invert)
+
         # This code is significantly faster when the condition is satisfied.
         if len(ar2) < 10 * len(ar1) ** 0.145:
             if invert:
@@ -408,3 +427,28 @@ def norm(X, ord=None, axis=None):
 
 else:
     norm = np.linalg.norm
+
+
+if np_version < (1, 8):
+    # Backport of setdiff1d function as it relies on in1d
+    def setdiff1d(ar1, ar2, assume_unique=False):
+        # copy-paste from numpy except for the object type if clause
+        if assume_unique:
+            ar1 = np.asarray(ar1).ravel()
+        else:
+            # Unique is not supported for object arrays till np version 1.8
+            # due to mergesort
+            if ar1.dtype == object:
+                ar1 = np.array(set(ar1))
+            else:
+                ar1 = np.unique(ar1)
+
+            if ar2.dtype == object:
+                ar2 = np.array(set(ar2))
+            else:
+                ar1 = np.unique(ar2)
+
+            return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
+
+else:
+    from numpy import setdiff1d

From 06e6d3adb272bc99943d50eff0a17f0aaa50623b Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Mon, 2 May 2016 14:01:45 -0400
Subject: [PATCH 03/36] unique arrays are now sorted

---
 sklearn/utils/fixes.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index 08c7edc3c28a1..9c50b38bee4ef 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -222,11 +222,11 @@ def _in1d_object(ar1, ar2, invert=False):
     # version 1.8. Hence in1d for object arrays needs to be handled differently
     values1 = set(ar1)
     values2 = set(ar2)
-    abset_values = values1 - values2
+    absent_values = values1 - values2
 
     present = np.ones_like(ar1, dtype=np.bool)
 
-    for value in abset_values:
+    for value in absent_values:
         present[ar1 == value] = False
 
     return ~present if invert else present
@@ -439,16 +439,16 @@ def setdiff1d(ar1, ar2, assume_unique=False):
             # Unique is not supported for object arrays till np version 1.8
             # due to mergesort
             if ar1.dtype == object:
-                ar1 = np.array(set(ar1))
+                ar1 = np.array(sorted(set(ar1)))
             else:
                 ar1 = np.unique(ar1)
 
             if ar2.dtype == object:
-                ar2 = np.array(set(ar2))
+                ar2 = np.array(sorted(set(ar2)))
             else:
-                ar1 = np.unique(ar2)
+                ar2 = np.unique(ar2)
 
-            return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
+        return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
 
 else:
     from numpy import setdiff1d

From 074f194e096000b243fb21dcf81cc7da1fc76e5d Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Mon, 2 May 2016 14:59:58 -0400
Subject: [PATCH 04/36] revert selection logic

---
 sklearn/preprocessing/data.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 421e4d66073e5..eee4fe96bf12a 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1651,8 +1651,10 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
         return X
 
     n_features = X.shape[1]
+    ind = np.arange(n_features)
     sel = np.zeros(n_features, dtype=bool)
     sel[np.asarray(selected)] = True
+    not_sel = np.logical_not(sel)
     n_selected = np.sum(sel)
 
     if n_selected == 0:
@@ -1662,8 +1664,8 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
         # All features selected.
         return transform(X)
     else:
-        X_sel = transform(X[:, sel])
-        X_not_sel = X[:, ~sel].astype(dtype)
+        X_sel = transform(X[:, ind[sel]])
+        X_not_sel = X[:, ind[not_sel]].astype(dtype)
 
         if return_val:
             if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):

From 083142ed381bad23b97c395016a5f5d014a891e8 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Mon, 2 May 2016 16:49:41 -0400
Subject: [PATCH 05/36] Added copy argument

---
 sklearn/preprocessing/data.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index eee4fe96bf12a..139f3cac202fd 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1717,6 +1717,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         Whether to raise an error or ignore if a unknown categorical feature is
         present during transform.
 
+    copy : bool, default=True
+        If unset, `X` maybe modified in space.
+
     Attributes
     ----------
     label_encoders_ : list of size n_features.
@@ -1756,15 +1759,16 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
       and n_classes-1.
     """
 
-    def __init__(self, categorical_features="all", n_values=None,
-                 values='auto', dtype=np.float64, sparse=True,
-                 handle_unknown='error'):
+    def __init__(self, values='auto', categorical_features="all",
+                 n_values=None, dtype=np.float64, sparse=True,
+                 handle_unknown='error', copy=True):
+        self.values = values
         self.categorical_features = categorical_features
         self.dtype = dtype
         self.sparse = sparse
         self.handle_unknown = handle_unknown
         self.n_values = n_values
-        self.values = values
+        self.copy = copy
 
     def fit(self, X, y=None):
         """Fit the CategoricalEncoder to X.
@@ -1779,7 +1783,7 @@ def fit(self, X, y=None):
         self
         """
 
-        X = check_array(X, dtype=np.object, accept_sparse='csc')
+        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=self.copy)
         n_samples, n_features = X.shape
 
         _apply_selected(X, self._fit, dtype=self.dtype,

From f768f3bee800d72cff7315800b159cef3e68405c Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Wed, 31 Aug 2016 11:43:13 -0400
Subject: [PATCH 06/36] Inbetween adding the seen option

---
 sklearn/preprocessing/data.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 139f3cac202fd..ddeac726bc8bc 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1692,8 +1692,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    values : 'auto', int, list of ints, or list of lists of objects
-        - 'auto' : determine set of values from training data.
+    values : 'auto', 'seen', int, list of ints, or list of lists of objects
+        - 'auto' : determine set of values from training data. If the input
+                   is an int array, values are determined from range in
+                   training data. For all other inputs, only values observed
+                   during `fit` are considered valid values for each feature.
+        - 'seen': Only values observed during `fit` are considered valid
+                  values for each feature.
         - int : values are in ``range(values)`` for all features
         - list of ints : values for feature ``i`` are in ``range(values[i])``
         - list of lists : values for feature ``i`` are in ``values[i]``
@@ -1783,7 +1788,8 @@ def fit(self, X, y=None):
         self
         """
 
-        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=self.copy)
+        X = check_array(X, dtype=np.object, accept_sparse='csc',
+                        copy=self.copy)
         n_samples, n_features = X.shape
 
         _apply_selected(X, self._fit, dtype=self.dtype,
@@ -1828,6 +1834,8 @@ def _fit(self, X):
         for i in range(n_features):
             le = self.label_encoders_[i]
             if self.values == 'auto':
+                le.fit(np.arange(1 + np.max(X[:, i])))
+            elif self.values == 'seen':
                 le.fit(X[:, i])
             elif isinstance(self.values, numbers.Integral):
                 if (np.max(X, axis=0) >= self.values).any():

From 1e34caef4a7682f8240503de52007b885b054641 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Thu, 1 Sep 2016 13:31:35 -0400
Subject: [PATCH 07/36] remove seen argument and support range case with
 FutureWarning

---
 sklearn/preprocessing/data.py            | 46 ++++++++++++++++--------
 sklearn/preprocessing/tests/test_data.py | 15 ++++++--
 2 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index ddeac726bc8bc..43c24f0a5f6c8 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1693,12 +1693,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     Parameters
     ----------
     values : 'auto', 'seen', int, list of ints, or list of lists of objects
-        - 'auto' : determine set of values from training data. If the input
-                   is an int array, values are determined from range in
-                   training data. For all other inputs, only values observed
-                   during `fit` are considered valid values for each feature.
-        - 'seen': Only values observed during `fit` are considered valid
-                  values for each feature.
+        - 'auto' : determine set of values from training data. See the
+          documentation of `handle_unknown` for which values are considered
+          acceptable.
         - int : values are in ``range(values)`` for all features
         - list of ints : values for feature ``i`` are in ``range(values[i])``
         - list of lists : values for feature ``i`` are in ``values[i]``
@@ -1719,8 +1716,12 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         Will return sparse matrix if set True else will return an array.
 
     handle_unknown : str, 'error' or 'ignore'
-        Whether to raise an error or ignore if a unknown categorical feature is
-        present during transform.
+
+        - 'ignore': Ignore all unknown feature values.
+        - 'error': Raise an error when the value of a feature is unseen during
+          `fit` and out of range of values seen during `fit`.
+        - 'error-strict': Raise an error when the value of a feature is unseen
+          during `fit`.
 
     copy : bool, default=True
         If unset, `X` maybe modified in space.
@@ -1805,6 +1806,8 @@ def _fit(self, X):
 
         self._n_features = n_features
         self.label_encoders_ = [LabelEncoder() for i in range(n_features)]
+        # Maximum value for each featue
+        self._max_values = [None for i in range(n_features)]
 
         if self.n_values is not None:
             warnings.warn('The parameter `n_values` is deprecated, use the'
@@ -1833,9 +1836,9 @@ def _fit(self, X):
 
         for i in range(n_features):
             le = self.label_encoders_[i]
+
+            self._max_values[i] = np.max(X[:, i])
             if self.values == 'auto':
-                le.fit(np.arange(1 + np.max(X[:, i])))
-            elif self.values == 'seen':
                 le.fit(X[:, i])
             elif isinstance(self.values, numbers.Integral):
                 if (np.max(X, axis=0) >= self.values).any():
@@ -1886,14 +1889,27 @@ def _transform(self, X):
             valid_mask = in1d(X[:, i], self.label_encoders_[i].classes_)
 
             if not np.all(valid_mask):
-
-                if self.handle_unknown == 'error':
+                if self.handle_unknown in ['error', 'error-strict']:
                     diff = setdiff1d(X[:, i], self.label_encoders_[i].classes_)
-                    msg = 'Unknown feature(s) %s in column %d' % (diff, i)
-                    raise ValueError(msg)
+                    if self.handle_unknown == 'error-strict':
+                        msg = 'Unknown feature(s) %s in column %d' % (diff, i)
+                        raise ValueError(msg)
+                    else:
+                        if np.all(diff <= self._max_values[i]):
+                            msg = ('Values %s for feature %d are unknown but '
+                                   'in range. This will raise an error in '
+                                   'future versions.' % (str(diff), i))
+                            warnings.warn(FutureWarning(msg))
+                            X_mask[:, i] = valid_mask
+                            le = self.label_encoders_[i]
+                            X[:, i][~valid_mask] = le.classes_[0]
+                        else:
+                            msg = ('Unknown feature(s) %s in column %d' %
+                                   (diff, i))
+                            raise ValueError(msg)
                 elif self.handle_unknown == 'ignore':
                     # Set the problematic rows to an acceptable value and
-                    # continue `The rows are marked in `X_mask` and will be
+                    # continue. The rows are marked in `X_mask` and will be
                     # removed later.
                     X_mask[:, i] = valid_mask
                     X[:, i][~valid_mask] = self.label_encoders_[i].classes_[0]
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 13bcf6c8e04ac..a576a3058a9c9 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1592,7 +1592,7 @@ def test_one_hot_encoder_string():
 
 def test_one_hot_encoder_categorical_features():
     X = np.array([[3, 2, 1], [0, 1, 1]])
-    X2 = np.array([[3, 1, 1]])
+    X2 = np.array([[1, 1, 1]])
 
     cat = [True, False, False]
     _check_one_hot(X, X2, cat, 4)
@@ -1612,7 +1612,7 @@ def test_one_hot_encoder_unknown_transform():
 
     # Test that one hot encoder raises error for unknown features
     # present during transform.
-    oh = OneHotEncoder(handle_unknown='error')
+    oh = OneHotEncoder(handle_unknown='error-strict')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
 
@@ -1628,10 +1628,19 @@ def test_one_hot_encoder_unknown_transform():
 
     # Test that one hot encoder raises error for unknown features
     # present during transform.
-    oh = OneHotEncoder(handle_unknown='error')
+    oh = OneHotEncoder(handle_unknown='error-strict')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
 
+    # Test that one hot encoder raises warning for unknown but in range
+    # features
+    oh = OneHotEncoder(handle_unknown='error')
+    oh.fit(X)
+    msg = ('Values [0] for feature 2 are unknown but in range. '
+           'This will raise an error in future versions.')
+    assert_warns_message(FutureWarning, msg, oh.transform,
+                         np.array([[0, 0, 0]]))
+
     # Test the ignore option, ignores unknown features.
     oh = OneHotEncoder(handle_unknown='ignore')
     oh.fit(X)

From fed795953e93c7e8c7646b5ba51948fcbdaddaae Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Fri, 2 Sep 2016 04:37:10 -0400
Subject: [PATCH 08/36] Made label_encoders_ private

---
 sklearn/preprocessing/data.py | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 43c24f0a5f6c8..9ba3dc4b3e572 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1623,6 +1623,7 @@ def add_dummy_feature(X, value=1.0):
 def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
                     return_val=True):
     """Apply a function to portion of selected features
+
     Parameters
     ----------
     X : {array, sparse matrix}, shape [n_samples, n_features]
@@ -1636,9 +1637,10 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
     return_val : boolean, optional
         Whether to return the transformed matrix. If not set `None` is
         returned.
+
     Returns
     -------
-        X : array or sparse matrix, shape=(n_samples, n_features_new)
+    X : array or sparse matrix, shape=(n_samples, n_features_new)
     """
 
     if copy:
@@ -1728,11 +1730,6 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    label_encoders_ : list of size n_features.
-        The :class:`sklearn.preprocessing.LabelEncoder` objects used to encode
-        the features. ``self.label_encoders[i]_`` is the LabelEncoder object
-        used to encode the ith column. The unique features found on column
-        ``i`` can be accessed using ``self.label_encoders_[i].classes_``.
 
     Examples
     --------
@@ -1805,7 +1802,7 @@ def _fit(self, X):
         n_samples, n_features = X.shape
 
         self._n_features = n_features
-        self.label_encoders_ = [LabelEncoder() for i in range(n_features)]
+        self._label_encoders = [LabelEncoder() for i in range(n_features)]
         # Maximum value for each featue
         self._max_values = [None for i in range(n_features)]
 
@@ -1835,7 +1832,7 @@ def _fit(self, X):
                      " integers or a list of list")
 
         for i in range(n_features):
-            le = self.label_encoders_[i]
+            le = self._label_encoders[i]
 
             self._max_values[i] = np.max(X[:, i])
             if self.values == 'auto':
@@ -1886,11 +1883,12 @@ def _transform(self, X):
 
         for i in range(n_features):
 
-            valid_mask = in1d(X[:, i], self.label_encoders_[i].classes_)
+            valid_mask = in1d(X[:, i], self._label_encoders[i].classes_)
 
             if not np.all(valid_mask):
                 if self.handle_unknown in ['error', 'error-strict']:
-                    diff = setdiff1d(X[:, i], self.label_encoders_[i].classes_)
+                    le = self._label_encoders[i]
+                    diff = setdiff1d(X[:, i], le.classes_)
                     if self.handle_unknown == 'error-strict':
                         msg = 'Unknown feature(s) %s in column %d' % (diff, i)
                         raise ValueError(msg)
@@ -1901,7 +1899,7 @@ def _transform(self, X):
                                    'future versions.' % (str(diff), i))
                             warnings.warn(FutureWarning(msg))
                             X_mask[:, i] = valid_mask
-                            le = self.label_encoders_[i]
+                            le = self._label_encoders[i]
                             X[:, i][~valid_mask] = le.classes_[0]
                         else:
                             msg = ('Unknown feature(s) %s in column %d' %
@@ -1912,16 +1910,16 @@ def _transform(self, X):
                     # continue. The rows are marked in `X_mask` and will be
                     # removed later.
                     X_mask[:, i] = valid_mask
-                    X[:, i][~valid_mask] = self.label_encoders_[i].classes_[0]
+                    X[:, i][~valid_mask] = self._label_encoders[i].classes_[0]
                 else:
                     template = ("handle_unknown should be either 'error' or "
                                 "'ignore', got %s")
                     raise ValueError(template % self.handle_unknown)
 
-            X_int[:, i] = self.label_encoders_[i].transform(X[:, i])
+            X_int[:, i] = self._label_encoders[i].transform(X[:, i])
 
         mask = X_mask.ravel()
-        n_values = [le.classes_.shape[0] for le in self.label_encoders_]
+        n_values = [le.classes_.shape[0] for le in self._label_encoders]
         n_values = np.hstack([[0], n_values])
         indices = np.cumsum(n_values)
 
@@ -1946,10 +1944,10 @@ def active_features_(self):
                       ' will be removed in version 0.20')
         if self.n_values is None:
             #TODO: What to do when classes are strings ?
-            classes = [le.classes_ for le in self.label_encoders_]
+            classes = [le.classes_ for le in self._label_encoders]
             classes_max = [np.max(cls) + 1 for cls in classes]
             cum_idx = np.cumsum([0] + classes_max)
-            active_idx = [self.label_encoders_[i].classes_.astype(np.int)
+            active_idx = [self._label_encoders[i].classes_.astype(np.int)
                           + cum_idx[i]
                           for i in range(self._n_features)]
 
@@ -1961,11 +1959,11 @@ def active_features_(self):
     def feature_indices_(self):
         warnings.warn('The property `feature_indices_` is deprecated and'
                       ' will be removed in version 0.20')
-        classes_max = [np.max(le.classes_) + 1 for le in self.label_encoders_]
+        classes_max = [np.max(le.classes_) + 1 for le in self._label_encoders]
         return np.cumsum([0] + classes_max)
 
     @property
     def n_values_(self):
         warnings.warn('The property `n_values_` is deprecated and'
                       ' will be removed in version 0.20')
-        return np.array([le.classes_.shape[0] for le in self.label_encoders_])
+        return np.array([le.classes_.shape[0] for le in self._label_encoders])

From c62d2badc4ec1ca25138cd23fbd79fb05ced2b72 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Fri, 2 Sep 2016 06:27:30 -0400
Subject: [PATCH 09/36] Added new attributes and tests for OHE

---
 sklearn/preprocessing/data.py            | 46 ++++++++++++++++++++++++
 sklearn/preprocessing/tests/test_data.py | 19 ++++++++++
 2 files changed, 65 insertions(+)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 9ba3dc4b3e572..cad7b61d92a5a 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1730,6 +1730,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
+    feature_index_range_ : array, shape [n_features, 2]
+        `feature_index_range_[i]` specifies the range of column indices
+        occupied by the feature `i` in the one-hot encoded array.
+
+    one_hot_feature_index_ : array, shape [n_features_new]
+        `one_hot_feature_index_[i]` specifies which feature of the input
+        is encoded by column `i` in the one-hot encoded array.
 
     Examples
     --------
@@ -1793,6 +1800,45 @@ def fit(self, X, y=None):
         _apply_selected(X, self._fit, dtype=self.dtype,
                         selected=self.categorical_features, copy=True,
                         return_val=False)
+
+        self.feature_index_range_ = np.zeros((n_features, 2), dtype=np.int)
+
+        if (isinstance(self.categorical_features, six.string_types) and
+            self.categorical_features == "all"):
+            categorical = np.ones(n_features, dtype=bool)
+        else:
+            categorical = np.zeros(n_features, dtype=bool)
+            categorical[np.asarray(self.categorical_features)] = True
+
+        num_cat = np.sum(categorical)
+        start = 0
+        cat_index = 0
+        #print(categorical, self.categorical_features)
+        for i in range(n_features):
+            if categorical[i]:
+                le = self._label_encoders[cat_index]
+                end = start + len(le.classes_)
+                self.feature_index_range_[i] = start, end
+                start += len(le.classes_)
+                cat_index += 1
+
+        indices = np.arange(start, start + n_features - num_cat)
+        self.feature_index_range_[~categorical, 0] = indices
+        indices += 1
+        self.feature_index_range_[~categorical, 1] = indices
+
+        if len(indices) > 0:
+            output_cols = indices[-1]
+        else:
+            output_cols = start
+
+        print(output_cols)
+        self.one_hot_feature_index_ = np.empty(output_cols, dtype=np.int)
+
+        for i in range(n_features):
+            s, e = self.feature_index_range_[i]
+            self.one_hot_feature_index_[s:e] = i
+
         return self
 
     def _fit(self, X):
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index a576a3058a9c9..9bb183528c963 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1503,6 +1503,25 @@ def test_one_hot_encoder_sparse():
     assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)
 
 
+def test_one_hot_encoder_attr():
+    X = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]]
+
+    enc = OneHotEncoder()
+    enc.fit(X)
+    assert_array_equal(enc.feature_index_range_, [[0, 3], [3, 5], [5, 7]])
+    assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 1, 1, 2, 2])
+
+    enc = OneHotEncoder(categorical_features=[True, False, True])
+    enc.fit(X)
+    assert_array_equal(enc.feature_index_range_, [[0, 3], [5, 6], [3, 5]])
+    assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 2, 2, 1])
+
+    enc = OneHotEncoder(categorical_features=[False, False, True])
+    enc.fit(X)
+    assert_array_equal(enc.feature_index_range_, [[2, 3], [3, 4], [0, 2]])
+    assert_array_equal(enc.one_hot_feature_index_, [2, 2, 0, 1])
+
+
 def test_one_hot_encoder_dense():
     # check for sparse=False
     X = [[3, 2, 1], [0, 1, 1]]

From e929f23838651773ee4a6e675be2acb3f8cb748d Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Fri, 2 Sep 2016 06:39:04 -0400
Subject: [PATCH 10/36] Fixed doctests

---
 doc/modules/preprocessing.rst |  2 +-
 sklearn/preprocessing/data.py | 10 ++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 24df41f2966fa..c624cb836dc07 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -405,7 +405,7 @@ Continuing the example above::
   array([[ 1.,  0.,  1.,  0.,  0.,  1.]])
 
 By default, how many values each feature can take is inferred automatically from the dataset.
-It is possible to specify this explicitly using the parameter ``xvalues``.
+It is possible to specify this explicitly using the parameter ``values``.
 There are two genders, three possible continents and four web browsers in our
 dataset.
 Then we fit the estimator, and transform a data point.
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index cad7b61d92a5a..121d4b8b7e9cd 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1747,10 +1747,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     >>> from sklearn.preprocessing import OneHotEncoder
     >>> enc = OneHotEncoder()
     >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) # doctest: +ELLIPSIS
-    OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values=None, sparse=True, values='auto')
-    >>> list(enc.label_encoders_[0].classes_)
-    ['cat', 'dog', 'mouse']
+    OneHotEncoder(categorical_features='all', copy=True,
+           dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
+           sparse=True, values='auto')
     >>> enc.transform([['dog', 4]]).toarray()
     array([[ 0.,  1.,  0.,  1.,  0.,  0.]])
 
@@ -1813,7 +1812,7 @@ def fit(self, X, y=None):
         num_cat = np.sum(categorical)
         start = 0
         cat_index = 0
-        #print(categorical, self.categorical_features)
+
         for i in range(n_features):
             if categorical[i]:
                 le = self._label_encoders[cat_index]
@@ -1832,7 +1831,6 @@ def fit(self, X, y=None):
         else:
             output_cols = start
 
-        print(output_cols)
         self.one_hot_feature_index_ = np.empty(output_cols, dtype=np.int)
 
         for i in range(n_features):

From bc7a26bfa7c30c719c185c4b903c3426c8878801 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Fri, 2 Sep 2016 08:07:00 -0400
Subject: [PATCH 11/36] Fixed rst doc tests

---
 doc/modules/preprocessing.rst | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index c624cb836dc07..9b4068c4710e1 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -399,8 +399,9 @@ Continuing the example above::
   >>> enc = preprocessing.OneHotEncoder()
   >>> enc.fit([['female', 'from US', 'uses Chrome'],
   ... ['male', 'from Asia', 'uses Firefox']])  # doctest: +ELLIPSIS
-  OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-         handle_unknown='error', n_values=None, sparse=True, values='auto')
+  OneHotEncoder(categorical_features='all', copy=True,
+         dtype=<type 'numpy.float64'>, handle_unknown='error', n_values=None,
+         sparse=True, values='auto')
   >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray()
   array([[ 1.,  0.,  1.,  0.,  0.,  1.]])
 
@@ -423,9 +424,11 @@ features, one has to explicitly set ``n_values``. For example,
     >>> # feature
     >>> enc.fit([['female', 'from US', 'uses Chrome'],
     ... ['male', 'from Asia', 'uses Internet Explorer']])  # doctest: +ELLIPSIS
-    OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values=None, sparse=True,
-	   values=[...])
+    OneHotEncoder(categorical_features='all', copy=True,
+           dtype=<type 'numpy.float64'>, handle_unknown='error', n_values=None,
+           sparse=True,
+           values=[...])
+
     >>> enc.transform([['male', 'from Europe', 'uses Safari']]).toarray()
     array([[ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
 

From feaf0148033ac5d9f5c3b34edcc7e1512ae0d6b3 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Fri, 2 Sep 2016 08:09:41 -0400
Subject: [PATCH 12/36] Replaced type in array with ellipsis

---
 doc/modules/preprocessing.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 9b4068c4710e1..68edaf934ac18 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -400,7 +400,7 @@ Continuing the example above::
   >>> enc.fit([['female', 'from US', 'uses Chrome'],
   ... ['male', 'from Asia', 'uses Firefox']])  # doctest: +ELLIPSIS
   OneHotEncoder(categorical_features='all', copy=True,
-         dtype=<type 'numpy.float64'>, handle_unknown='error', n_values=None,
+         dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
          sparse=True, values='auto')
   >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray()
   array([[ 1.,  0.,  1.,  0.,  0.,  1.]])
@@ -425,7 +425,7 @@ features, one has to explicitly set ``n_values``. For example,
     >>> enc.fit([['female', 'from US', 'uses Chrome'],
     ... ['male', 'from Asia', 'uses Internet Explorer']])  # doctest: +ELLIPSIS
     OneHotEncoder(categorical_features='all', copy=True,
-           dtype=<type 'numpy.float64'>, handle_unknown='error', n_values=None,
+           dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True,
            values=[...])
 

From 7b608e12651a83d7a12b0165bc4c4011d96117ba Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Fri, 2 Sep 2016 08:32:06 -0400
Subject: [PATCH 13/36] flake fixes

---
 sklearn/preprocessing/data.py | 10 +++++-----
 sklearn/utils/fixes.py        |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 121d4b8b7e9cd..37478070ae2e8 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1802,8 +1802,9 @@ def fit(self, X, y=None):
 
         self.feature_index_range_ = np.zeros((n_features, 2), dtype=np.int)
 
-        if (isinstance(self.categorical_features, six.string_types) and
-            self.categorical_features == "all"):
+        if isinstance(self.categorical_features, six.string_types) and \
+           self.categorical_features == "all":
+
             categorical = np.ones(n_features, dtype=bool)
         else:
             categorical = np.zeros(n_features, dtype=bool)
@@ -1987,12 +1988,11 @@ def active_features_(self):
         warnings.warn('The property `active_features_` is deprecated and'
                       ' will be removed in version 0.20')
         if self.n_values is None:
-            #TODO: What to do when classes are strings ?
             classes = [le.classes_ for le in self._label_encoders]
             classes_max = [np.max(cls) + 1 for cls in classes]
             cum_idx = np.cumsum([0] + classes_max)
-            active_idx = [self._label_encoders[i].classes_.astype(np.int)
-                          + cum_idx[i]
+            active_idx = [self._label_encoders[i].classes_.astype(np.int) +
+                          cum_idx[i]
                           for i in range(self._n_features)]
 
             return np.concatenate(active_idx, axis=0).astype(np.int)
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index 9c50b38bee4ef..fe37e9469c720 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -451,4 +451,4 @@ def setdiff1d(ar1, ar2, assume_unique=False):
         return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
 
 else:
-    from numpy import setdiff1d
+    from numpy import setdiff1d  # noqa

From 5f305d827a494db76ce7d9ef41e492e8233f4ae6 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Fri, 2 Sep 2016 10:36:36 -0400
Subject: [PATCH 14/36] Add NORMALIZE_WHITESPACE for python3 tests

---
 sklearn/preprocessing/data.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 37478070ae2e8..b0071353b2163 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1746,8 +1746,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     >>> from sklearn.preprocessing import OneHotEncoder
     >>> enc = OneHotEncoder()
-    >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) # doctest: +ELLIPSIS
-    OneHotEncoder(categorical_features='all', copy=True,
+    >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) \
+        # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+        OneHotEncoder(categorical_features='all', copy=True,
            dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True, values='auto')
     >>> enc.transform([['dog', 4]]).toarray()

From 1392292679628ce1a2f4302ed9366ace76e745a4 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Fri, 2 Sep 2016 10:53:09 -0400
Subject: [PATCH 15/36] normalize whitespace for rst docs

---
 doc/modules/preprocessing.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 68edaf934ac18..622489c19ba13 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -423,7 +423,8 @@ features, one has to explicitly set ``n_values``. For example,
     >>> # Note that for there are missing categorical values for the 2nd and 3rd
     >>> # feature
     >>> enc.fit([['female', 'from US', 'uses Chrome'],
-    ... ['male', 'from Asia', 'uses Internet Explorer']])  # doctest: +ELLIPSIS
+    ... ['male', 'from Asia', 'uses Internet Explorer']]) \
+    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features='all', copy=True,
            dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True,

From 50d23607f2f3be389f011456a0280ceaa632d5c0 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Fri, 2 Sep 2016 11:06:42 -0400
Subject: [PATCH 16/36] normalizing whitespace again

---
 doc/modules/preprocessing.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 622489c19ba13..f1bfba00dde01 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -398,7 +398,8 @@ Continuing the example above::
 
   >>> enc = preprocessing.OneHotEncoder()
   >>> enc.fit([['female', 'from US', 'uses Chrome'],
-  ... ['male', 'from Asia', 'uses Firefox']])  # doctest: +ELLIPSIS
+  ... ['male', 'from Asia', 'uses Firefox']])  \
+  ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
   OneHotEncoder(categorical_features='all', copy=True,
          dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
          sparse=True, values='auto')

From 8f2f1d39b7e50c58d2a2454939a26df2fd8c1cd4 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Tue, 6 Sep 2016 14:03:25 -0400
Subject: [PATCH 17/36] docstring changes and minor optimizations

---
 sklearn/preprocessing/data.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index b0071353b2163..9f5d4ea205c28 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1694,7 +1694,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    values : 'auto', 'seen', int, list of ints, or list of lists of objects
+    values : 'auto', int, list of ints, or list of lists of objects
         - 'auto' : determine set of values from training data. See the
           documentation of `handle_unknown` for which values are considered
           acceptable.
@@ -1731,11 +1731,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     Attributes
     ----------
     feature_index_range_ : array, shape [n_feature, 2]
-        `feature_index_range_[i]` specifies the range of column indices
-        occupied by the feature `i` in the one-hot encoded array.
+        ``feature_index_range_[i]`` specifies the range of column indices
+        occupied by the input feature `i` in the one-hot encoded array.
 
     one_hot_feature_index_ : array, shape [n_features_new]
-        `one_hot_feature_index_[i]` specifies which feature of the input
+        ``one_hot_feature_index_[i]`` specifies which feature of the input
         is encoded by column `i` in the one-hot encoded array.
 
     Examples
@@ -1820,7 +1820,7 @@ def fit(self, X, y=None):
                 le = self._label_encoders[cat_index]
                 end = start + len(le.classes_)
                 self.feature_index_range_[i] = start, end
-                start += len(le.classes_)
+                start = end
                 cat_index += 1
 
         indices = np.arange(start, start + n_features - num_cat)
@@ -1844,7 +1844,8 @@ def fit(self, X, y=None):
     def _fit(self, X):
         "Assumes `X` contains only catergorical features."
 
-        X = check_array(X, dtype=np.object)
+        if not np.issubdtype(X.dtype.type, np.integer):
+            X = check_array(X, dtype=np.object)
         n_samples, n_features = X.shape
 
         self._n_features = n_features
@@ -1854,7 +1855,7 @@ def _fit(self, X):
 
         if self.n_values is not None:
             warnings.warn('The parameter `n_values` is deprecated, use the'
-                          'parameter `classes_` instead and specify the '
+                          'parameter `values` instead and specify the '
                           'expected values for each feature')
 
             if isinstance(self.n_values, numbers.Integral):

From 1c8accfa89881b6c3c4bece0f166115758876a95 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Wed, 28 Dec 2016 13:45:28 +0530
Subject: [PATCH 18/36] Made tests pass by creating arrays with object dtype

---
 sklearn/preprocessing/tests/test_data.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 9bb183528c963..512866a5475f0 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1642,8 +1642,9 @@ def test_one_hot_encoder_unknown_transform():
         oh.transform(y).toarray(),
         np.array([[0.,  0.,  0.,  0.,  1.,  0.,  0.]]))
 
-    X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]])
-    y = np.array([['ET', 1, 1]])
+    X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]],
+                 dtype=np.object)
+    y = np.array([['ET', 1, 1]], dtype=np.object)
 
     # Test that one hot encoder raises error for unknown features
     # present during transform.
@@ -1658,7 +1659,7 @@ def test_one_hot_encoder_unknown_transform():
     msg = ('Values [0] for feature 2 are unknown but in range. '
            'This will raise an error in future versions.')
     assert_warns_message(FutureWarning, msg, oh.transform,
-                         np.array([[0, 0, 0]]))
+                         np.array([['mouse', 0, 0]], dtype=np.object))
 
     # Test the ignore option, ignores unknown features.
     oh = OneHotEncoder(handle_unknown='ignore')

From 6edda8b8bea06b1a71ba691c62849d1214c69194 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Wed, 28 Dec 2016 15:17:57 +0530
Subject: [PATCH 19/36] Assign both values and n_values to self._values and
 remove redundant checking

---
 sklearn/preprocessing/data.py            | 67 +++++++++++-------------
 sklearn/preprocessing/tests/test_data.py | 11 ++--
 2 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 9f5d4ea205c28..533ef8e020f92 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1720,8 +1720,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     handle_unknown : str, 'error' or 'ignore'
 
         - 'ignore': Ignore all unknown feature values.
-        - 'error': Raise an error when the value of a feature is unseen during
-          `fit` and out of range of values seen during `fit`.
+        - 'error': Raise an error when the value of a feature is more than the
+          maximum value seen during fit.
         - 'error-strict': Raise an error when the value of a feature is unseen
           during`fit`.
 
@@ -1851,29 +1851,17 @@ def _fit(self, X):
         self._n_features = n_features
         self._label_encoders = [LabelEncoder() for i in range(n_features)]
         # Maximum value for each featue
-        self._max_values = [None for i in range(n_features)]
+        self._max_values = [None] * n_features
 
         if self.n_values is not None:
-            warnings.warn('The parameter `n_values` is deprecated, use the'
+            warnings.warn('`n_values` has been renamed to `values`.'
+                          'The parameter `n_values` is deprecated, use the'
                           'parameter `values` instead and specify the '
                           'expected values for each feature')
 
-            if isinstance(self.n_values, numbers.Integral):
-                if (np.max(X, axis=0) >= self.n_values).any():
-                    raise ValueError("Feature out of bounds for n_values=%d"
-                                     % self.n_values)
-                self.values = self.n_values
-            else:
-                try:
-                    n_values = np.asarray(self.n_values, dtype=int)
-                except (ValueError, TypeError):
-                    raise TypeError("Wrong type for parameter `n_values`."
-                                    " Expected 'auto', int or array of ints,"
-                                    "got %r" % type(X))
-                if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
-                    raise ValueError("Shape mismatch: if n_values is an array,"
-                                     " it has to be of shape (n_features,).")
-                self.values = list(self.n_values)
+            self._values = self.n_values
+        else:
+            self._values = self.values
 
         error_msg = ("`values` should be 'auto', an integer, a list of"
                      " integers or a list of list")
@@ -1882,25 +1870,32 @@ def _fit(self, X):
             le = self._label_encoders[i]
 
             self._max_values[i] = np.max(X[:, i])
-            if self.values == 'auto':
+
+            if isinstance(self._values, numbers.Integral):
+                self._values = np.ones(n_features, dtype=np.int) * self._values
+
+            if self._values == 'auto':
                 le.fit(X[:, i])
-            elif isinstance(self.values, numbers.Integral):
-                if (np.max(X, axis=0) >= self.values).any():
-                    raise ValueError("Feature out of bounds for n_values=%d"
-                                     % self.values)
-                le.fit(np.arange(self.values, dtype=np.int))
-            elif isinstance(self.values, list):
-                if len(self.values) != X.shape[1]:
-                    raise ValueError("Shape mismatch: if n_values is a list,"
+
+            elif (isinstance(self._values, list) or
+                  isinstance(self._values, np.ndarray)):
+                if len(self._values) != X.shape[1]:
+                    raise ValueError("Shape mismatch: if values is a list,"
                                      " it has to be of length (n_features).")
-                if isinstance(self.values[i], list):
-                    le.fit(self.values[i])
-                elif isinstance(self.values[i], numbers.Integral):
-                    le.fit(np.arange(self.values[i], dtype=np.int))
+                if isinstance(self._values[i], list):
+                    le.fit(self._values[i])
+                elif np.isscalar(self._values[i]):
+                    le.fit(np.arange(self._values[i], dtype=np.int))
+                    X_feature_max = np.max(X, axis=0)
+                    mask = X_feature_max >= self._values
+                    if mask.any():
+                        msg = 'Value(s) %s out of bounds for feature(s) %s'
+                        raise ValueError(msg % (X_feature_max[mask],
+                                                np.where(mask)[0]))
                 else:
                     raise ValueError(error_msg)
             else:
-                raise ValueError(error_msg)
+                raise TypeError(error_msg)
 
     def transform(self, X, y=None):
         """Encode the selected categorical features using the one-hot scheme.
@@ -1943,7 +1938,9 @@ def _transform(self, X):
                         if np.all(diff <= self._max_values[i]):
                             msg = ('Values %s for feature %d are unknown but '
                                    'in range. This will raise an error in '
-                                   'future versions.' % (str(diff), i))
+                                   'future versions where "error-strict" will '
+                                   'be default for `handle_unknown` parameter'
+                                   % (str(diff), i))
                             warnings.warn(FutureWarning(msg))
                             X_mask[:, i] = valid_mask
                             le = self._label_encoders[i]
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 512866a5475f0..f451176ce4eeb 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1491,8 +1491,12 @@ def test_one_hot_encoder_sparse():
     assert_raises(ValueError, enc.transform, X_too_large)
     error_msg = re.escape("Unknown feature(s) [2] in column 1")
     assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large)
-    assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)
-    assert_raises(ValueError, OneHotEncoder(values=2).fit_transform, X)
+
+    error_msg = re.escape("Value(s) [2] out of bounds for feature(s) [0]")
+    assert_raises_regex(ValueError, error_msg,
+                        OneHotEncoder(n_values=2).fit_transform, X)
+    assert_raises_regex(ValueError, error_msg,
+                        OneHotEncoder(values=2).fit_transform, X)
 
     # test that error is raised when wrong number of features
     assert_raises(ValueError, enc.transform, X[:, :-1])
@@ -1657,7 +1661,8 @@ def test_one_hot_encoder_unknown_transform():
     oh = OneHotEncoder(handle_unknown='error')
     oh.fit(X)
     msg = ('Values [0] for feature 2 are unknown but in range. '
-           'This will raise an error in future versions.')
+           'This will raise an error in future versions where "error-strict"'
+           ' will be default for `handle_unknown` parameter')
     assert_warns_message(FutureWarning, msg, oh.transform,
                          np.array([['mouse', 0, 0]], dtype=np.object))
 

From 1d2ca1aea5fb8d9a5b6689739750c55f98366001 Mon Sep 17 00:00:00 2001
From: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
Date: Wed, 28 Dec 2016 16:38:54 +0530
Subject: [PATCH 20/36] removed extra spaces for flake8 compat

---
 sklearn/preprocessing/tests/test_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index f451176ce4eeb..7752231e66e70 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1671,7 +1671,7 @@ def test_one_hot_encoder_unknown_transform():
     oh.fit(X)
     assert_array_equal(
         oh.transform(y).toarray(),
-        np.array([[0.,  0.,  0., 0.,  0.,  1.,  0.,  0.]]))
+        np.array([[0., 0., 0., 0., 0., 1., 0., 0.]]))
 
     # Raise error if handle_unknown is neither ignore nor error.
     oh = OneHotEncoder(handle_unknown='42')

From 93ae49e1a6aefaf955a2919028cfbf31a930218a Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Tue, 25 Apr 2017 13:08:17 -0500
Subject: [PATCH 21/36] REF Refactor OHE and avoid copies

Refactor the OneHotEncoder for easier reading. Avoid mandatory copies of input data in both the `fit` and `transform` steps. Add a test that the input data aren't modified after fitting or transforming.
---
 sklearn/preprocessing/data.py            | 252 ++++++++++++-----------
 sklearn/preprocessing/tests/test_data.py |  35 +++-
 2 files changed, 160 insertions(+), 127 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 533ef8e020f92..99c7f16025c48 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -19,7 +19,7 @@
 from ..utils import check_array
 from ..utils.extmath import row_norms
 from ..utils.extmath import _incremental_mean_and_var
-from ..utils.fixes import bincount
+from ..utils.fixes import bincount, sparse_min_max
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale,
@@ -1642,7 +1642,6 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
     -------
     X : array or sparse matrix, shape=(n_samples, n_features_new)
     """
-
     if copy:
         X = X.copy()
 
@@ -1653,7 +1652,6 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
         return X
 
     n_features = X.shape[1]
-    ind = np.arange(n_features)
     sel = np.zeros(n_features, dtype=bool)
     sel[np.asarray(selected)] = True
     not_sel = np.logical_not(sel)
@@ -1666,10 +1664,10 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
         # All features selected.
         return transform(X)
     else:
-        X_sel = transform(X[:, ind[sel]])
-        X_not_sel = X[:, ind[not_sel]].astype(dtype)
+        X_sel = transform(X[:, sel])
 
         if return_val:
+            X_not_sel = X[:, not_sel].astype(dtype)
             if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
                 return sparse.hstack((X_sel, X_not_sel))
             else:
@@ -1717,7 +1715,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.
 
-    handle_unknown : str, 'error' or 'ignore'
+    handle_unknown : str, 'error', 'error-strict', or 'ignore'
 
         - 'ignore': Ignore all unknown feature values.
         - 'error': Raise an error when the value of a feature is more than the
@@ -1725,9 +1723,6 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         - 'error-strict': Raise an error when the value of a feature is unseen
           during`fit`.
 
-    copy : bool, default=True
-        If unset, `X` maybe modified in space.
-
     Attributes
     ----------
     feature_index_range_ : array, shape [n_feature, 2]
@@ -1748,7 +1743,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     >>> enc = OneHotEncoder()
     >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) \
         # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-        OneHotEncoder(categorical_features='all', copy=True,
+        OneHotEncoder(categorical_features='all',
            dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True, values='auto')
     >>> enc.transform([['dog', 4]]).toarray()
@@ -1768,20 +1763,18 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     sklearn.preprocessing.LabelEncoder : encodes labels with values between 0
       and n_classes-1.
     """
-
     def __init__(self, values='auto', categorical_features="all",
                  n_values=None, dtype=np.float64, sparse=True,
-                 handle_unknown='error', copy=True):
+                 handle_unknown='error'):
         self.values = values
         self.categorical_features = categorical_features
         self.dtype = dtype
         self.sparse = sparse
         self.handle_unknown = handle_unknown
         self.n_values = n_values
-        self.copy = copy
 
     def fit(self, X, y=None):
-        """Fit the CategoricalEncoder to X.
+        """Fit the OneHotEncoder to X.
 
         Parameters
         ----------
@@ -1792,13 +1785,11 @@ def fit(self, X, y=None):
         -------
         self
         """
-
-        X = check_array(X, dtype=np.object, accept_sparse='csc',
-                        copy=self.copy)
+        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=False)
         n_samples, n_features = X.shape
 
         _apply_selected(X, self._fit, dtype=self.dtype,
-                        selected=self.categorical_features, copy=True,
+                        selected=self.categorical_features, copy=False,
                         return_val=False)
 
         self.feature_index_range_ = np.zeros((n_features, 2), dtype=np.int)
@@ -1812,7 +1803,7 @@ def fit(self, X, y=None):
             categorical[np.asarray(self.categorical_features)] = True
 
         num_cat = np.sum(categorical)
-        start = 0
+        start, end = 0, 0
         cat_index = 0
 
         for i in range(n_features):
@@ -1828,12 +1819,8 @@ def fit(self, X, y=None):
         indices += 1
         self.feature_index_range_[~categorical, 1] = indices
 
-        if len(indices) > 0:
-            output_cols = indices[-1]
-        else:
-            output_cols = start
-
-        self.one_hot_feature_index_ = np.empty(output_cols, dtype=np.int)
+        n_expanded_cols = end + n_features - num_cat
+        self.one_hot_feature_index_ = np.empty(n_expanded_cols, dtype=np.int)
 
         for i in range(n_features):
             s, e = self.feature_index_range_[i]
@@ -1841,61 +1828,110 @@ def fit(self, X, y=None):
 
         return self
 
-    def _fit(self, X):
-        "Assumes `X` contains only catergorical features."
+    def _check_values(self, values, n_features, max_values):
+        """Verify that the input `values` is valid
+
+        Raises ValueError or TypeError for bad `values`.
+        """
+        error_msg = ("`values` should be 'auto', an integer, a list of"
+                     " integers or a list of list")
+        if isinstance(values, six.string_types):
+            # Input "auto": determine values automatically
+            if values != 'auto':
+                raise ValueError(error_msg)
+        elif (isinstance(values, list) or
+                isinstance(values, np.ndarray)):
+            if len(values) != n_features:
+                raise ValueError("Shape mismatch: if values is a list,"
+                                 " it has to be of length (n_features).")
+
+            # Either all entries are scalars or none are
+            scalar_vals = [np.isscalar(val) for val in values]
+            if not (all(scalar_vals) or not any(scalar_vals)):
+                raise ValueError(error_msg)
+        elif not np.isscalar(values):
+            raise TypeError(error_msg)
+
+        # Validate input data against user-supplied categories
+        if not np.isscalar(values) and np.isscalar(values[0]):
+            too_big = np.zeros(n_features, dtype=bool)
+            for i_col in range(n_features):
+                if not np.isfinite(max_values[i_col]):
+                    # String features; don't bounds-check
+                    continue
+                if max_values[i_col] >= values[i_col]:
+                    too_big[i_col] = True
+
+            if too_big.any():
+                msg = 'Value(s) %s out of bounds for feature(s) %s'
+                raise ValueError(msg % (max_values[too_big],
+                                        np.where(too_big)[0]))
+
+    def _check_features_greater_than_zero(self, X):
+        """Raise ValueError if any numeric column of X has a value below 0"""
+        if sparse.issparse(X):
+            min_values, _ = sparse_min_max(X, axis=0)
+        else:
+            min_values = np.min(X, axis=0)
+        lt_zero = np.zeros(X.shape[1], dtype=bool)
+        for i_value, value in enumerate(min_values):
+            if isinstance(value, six.string_types):
+                continue  # string-valued minimum: no sign check
+            elif value < 0:
+                lt_zero[i_value] = True
+
+        if np.any(lt_zero):
+            raise ValueError('Column(s) %s have numerical values less '
+                             'than zero.' % (np.where(lt_zero)[0],))
 
-        if not np.issubdtype(X.dtype.type, np.integer):
-            X = check_array(X, dtype=np.object)
+    def _fit(self, X):
+        """Assumes `X` contains only categorical features"""
         n_samples, n_features = X.shape
 
         self._n_features = n_features
         self._label_encoders = [LabelEncoder() for i in range(n_features)]
-        # Maximum value for each featue
-        self._max_values = [None] * n_features
+        self._set_max_values(X)
+        self._check_features_greater_than_zero(X)
 
+        # Set up and check user-input categories.
         if self.n_values is not None:
             warnings.warn('`n_values` has been renamed to `values`.'
-                          'The parameter `n_values` is deprecated, use the'
+                          'The parameter `n_values` has been deprecated '
+                          'and will be removed in version 0.21, use the'
                           'parameter `values` instead and specify the '
                           'expected values for each feature')
-
             self._values = self.n_values
         else:
             self._values = self.values
+        if (not isinstance(self._values, six.string_types) and
+                np.isscalar(self._values)):
+            # Scalar `values`: every feature holds integers in [0, values)
+            self._values = np.ones(n_features, dtype=np.int) * self._values
+        self._check_values(self._values, n_features, self._max_values)
 
-        error_msg = ("`values` should be 'auto', an integer, a list of"
-                     " integers or a list of list")
-
+        # Fit on categorical features in the data
         for i in range(n_features):
             le = self._label_encoders[i]
 
-            self._max_values[i] = np.max(X[:, i])
-
-            if isinstance(self._values, numbers.Integral):
-                self._values = np.ones(n_features, dtype=np.int) * self._values
-
-            if self._values == 'auto':
+            if np.isscalar(self._values) and self._values == 'auto':
                 le.fit(X[:, i])
-
-            elif (isinstance(self._values, list) or
-                  isinstance(self._values, np.ndarray)):
-                if len(self._values) != X.shape[1]:
-                    raise ValueError("Shape mismatch: if values is a list,"
-                                     " it has to be of length (n_features).")
+            else:
                 if isinstance(self._values[i], list):
                     le.fit(self._values[i])
                 elif np.isscalar(self._values[i]):
                     le.fit(np.arange(self._values[i], dtype=np.int))
-                    X_feature_max = np.max(X, axis=0)
-                    mask = X_feature_max >= self._values
-                    if mask.any():
-                        msg = 'Value(s) %s out of bounds for feature(s) %s'
-                        raise ValueError(msg % (X_feature_max[mask],
-                                                np.where(mask)[0]))
-                else:
-                    raise ValueError(error_msg)
-            else:
-                raise TypeError(error_msg)
+
+    def _set_max_values(self, X):
+        """Inspect input data to determine the maximum value in each column"""
+        if sparse.issparse(X):
+            min_values, max_values = sparse_min_max(X, axis=0)
+        else:
+            max_values = np.max(X, axis=0)
+        self._max_values = np.zeros(len(max_values)) + np.nan
+        for i_value, value in enumerate(max_values):
+            if isinstance(value, six.string_types):
+                continue
+            self._max_values[i_value] = value
 
     def transform(self, X, y=None):
         """Encode the selected categorical features using the one-hot scheme.
@@ -1910,57 +1946,47 @@ def transform(self, X, y=None):
         out : array, shape[n_samples, n_features_new]
             `X` encoded using the one-hot scheme.
         """
-        X = check_array(X, dtype=np.object)
+        if self.handle_unknown not in ['ignore', 'error', 'error-strict']:
+            template = ("handle_unknown should be either 'error', "
+                        "'error-strict', or 'ignore', got %s")
+            raise ValueError(template % self.handle_unknown)
 
-        return _apply_selected(X, self._transform, copy=True,
+        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=False)
+
+        return _apply_selected(X, self._transform, copy=False,
                                selected=self.categorical_features)
 
     def _transform(self, X):
-        "Assumes `X` contains only categorical features."
-
-        X = check_array(X, accept_sparse='csc', dtype=np.object)
+        """Assumes `X` contains only categorical features."""
         n_samples, n_features = X.shape
         X_int = np.zeros_like(X, dtype=np.int)
         X_mask = np.ones_like(X, dtype=np.bool)
 
         for i in range(n_features):
-
-            valid_mask = in1d(X[:, i], self._label_encoders[i].classes_)
-
+            le = self._label_encoders[i]
+            valid_mask = in1d(X[:, i], le.classes_)
             if not np.all(valid_mask):
                 if self.handle_unknown in ['error', 'error-strict']:
-                    le = self._label_encoders[i]
                     diff = setdiff1d(X[:, i], le.classes_)
-                    if self.handle_unknown == 'error-strict':
+                    if (self.handle_unknown == 'error-strict' or
+                            np.isfinite(self._max_values[i]) and
+                            np.any(diff >= self._max_values[i]) or
+                            np.any(diff < 0)):
                         msg = 'Unknown feature(s) %s in column %d' % (diff, i)
                         raise ValueError(msg)
                     else:
-                        if np.all(diff <= self._max_values[i]):
-                            msg = ('Values %s for feature %d are unknown but '
-                                   'in range. This will raise an error in '
-                                   'future versions where "error-strict" will '
-                                   'be default for `handle_unknown` parameter'
-                                   % (str(diff), i))
-                            warnings.warn(FutureWarning(msg))
-                            X_mask[:, i] = valid_mask
-                            le = self._label_encoders[i]
-                            X[:, i][~valid_mask] = le.classes_[0]
-                        else:
-                            msg = ('Unknown feature(s) %s in column %d' %
-                                   (diff, i))
-                            raise ValueError(msg)
-                elif self.handle_unknown == 'ignore':
-                    # Set the problematic rows to an acceptable value and
-                    # continue. The rows are marked in `X_mask` and will be
-                    # removed later.
-                    X_mask[:, i] = valid_mask
-                    X[:, i][~valid_mask] = self._label_encoders[i].classes_[0]
-                else:
-                    template = ("handle_unknown should be either 'error' or "
-                                "'ignore', got %s")
-                    raise ValueError(template % self.handle_unknown)
-
-            X_int[:, i] = self._label_encoders[i].transform(X[:, i])
+                        msg = ('Values %s for feature %d are unknown but '
+                               'in range. This will raise an error in '
+                               'future versions where "error-strict" will '
+                               'be default for `handle_unknown` parameter'
+                               % (str(diff), i))
+                        warnings.warn(FutureWarning(msg))
+
+                X_mask[:, i] = valid_mask
+                X_int[valid_mask, i] = (self._label_encoders[i]
+                                        .transform(X[valid_mask, i]))
+            else:
+                X_int[:, i] = self._label_encoders[i].transform(X[:, i])
 
         mask = X_mask.ravel()
         n_values = [le.classes_.shape[0] for le in self._label_encoders]
@@ -1970,43 +1996,37 @@ def _transform(self, X):
         column_indices = (X_int + indices[:-1]).ravel()[mask]
         row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                 n_features)[mask]
-        data = np.ones(n_samples * n_features)[mask]
+        data = np.ones(np.sum(mask))
 
         out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                 shape=(n_samples, indices[-1]),
                                 dtype=self.dtype).tocsr()
 
-        if (isinstance(self.n_values, six.string_types) and
-                self.n_values == 'auto'):
-            out = out[:, self.active_features_]
-
         return out if self.sparse else out.toarray()
 
     @property
     def active_features_(self):
         warnings.warn('The property `active_features_` is deprecated and'
-                      ' will be removed in version 0.20')
-        if self.n_values is None:
-            classes = [le.classes_ for le in self._label_encoders]
-            classes_max = [np.max(cls) + 1 for cls in classes]
-            cum_idx = np.cumsum([0] + classes_max)
-            active_idx = [self._label_encoders[i].classes_.astype(np.int) +
-                          cum_idx[i]
-                          for i in range(self._n_features)]
-
-            return np.concatenate(active_idx, axis=0).astype(np.int)
-        else:
-            raise AttributeError()
+                      ' will be removed in version 0.21')
+        n_features_out = sum([len(le.classes_) for le in self._label_encoders])
+        return np.arange(n_features_out)
 
     @property
     def feature_indices_(self):
         warnings.warn('The property `feature_indices_` is deprecated and'
-                      ' will be removed in version 0.20')
-        classes_max = [np.max(le.classes_) + 1 for le in self._label_encoders]
-        return np.cumsum([0] + classes_max)
+                      ' will be removed in version 0.21')
+        n_categories = [len(le.classes_) for le in self._label_encoders]
+        return np.cumsum([0] + n_categories)
 
     @property
     def n_values_(self):
         warnings.warn('The property `n_values_` is deprecated and'
-                      ' will be removed in version 0.20')
-        return np.array([le.classes_.shape[0] for le in self._label_encoders])
+                      ' will be removed in version 0.21')
+        # The effective number of categories is different depending on
+        # whether or not we're using the old-style behavior
+        if self.handle_unknown == 'error':
+            return np.array([np.max(le.classes_) + 1
+                             for le in self._label_encoders])
+        else:
+            return np.array([le.classes_.shape[0]
+                             for le in self._label_encoders])
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 7752231e66e70..357951b87a370 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1461,9 +1461,8 @@ def test_one_hot_encoder_sparse():
     # discover max values automatically
     X_trans = enc.fit_transform(X).toarray()
     assert_equal(X_trans.shape, (2, 5))
-    assert_array_equal(enc.active_features_,
-                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
-    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])
+    assert_array_equal(enc.active_features_, np.arange(5))
+    assert_array_equal(enc.feature_indices_, [0, 2, 4, 5])
 
     # check outcome
     assert_array_equal(X_trans,
@@ -1471,13 +1470,13 @@ def test_one_hot_encoder_sparse():
                         [1., 0., 1., 0., 1.]])
 
     # max value given as 3
-    enc = OneHotEncoder(n_values=4)
+    enc = OneHotEncoder(values=4)
     X_trans = enc.fit_transform(X)
     assert_equal(X_trans.shape, (2, 4 * 3))
     assert_array_equal(enc.feature_indices_, [0, 4, 8, 12])
 
     # max value given per feature
-    enc = OneHotEncoder(n_values=[3, 2, 2])
+    enc = OneHotEncoder(values=[3, 2, 2])
     X = [[1, 0, 1], [0, 1, 1]]
     X_trans = enc.fit_transform(X)
     assert_equal(X_trans.shape, (2, 3 + 2 + 2))
@@ -1492,11 +1491,11 @@ def test_one_hot_encoder_sparse():
     error_msg = re.escape("Unknown feature(s) [2] in column 1")
     assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large)
 
-    error_msg = re.escape("Value(s) [2] out of bounds for feature(s) [0]")
+    error_msg = re.escape("Value(s) [ 2.] out of bounds for feature(s) [0]")
     assert_raises_regex(ValueError, error_msg,
-                        OneHotEncoder(n_values=2).fit_transform, X)
+                        OneHotEncoder(n_values=2).fit, X)
     assert_raises_regex(ValueError, error_msg,
-                        OneHotEncoder(values=2).fit_transform, X)
+                        OneHotEncoder(values=2).fit, X)
 
     # test that error is raised when wrong number of features
     assert_raises(ValueError, enc.transform, X[:, :-1])
@@ -1507,6 +1506,16 @@ def test_one_hot_encoder_sparse():
     assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)
 
 
+def test_one_hot_encoder_error_on_negative():
+    # Negative numerical values in inputs should raise an exception
+    X_bad = [[-1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]]
+    X_good = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]]
+    assert_raises(ValueError, OneHotEncoder().fit, X_bad)
+
+    ohe = OneHotEncoder().fit(X_good)
+    assert_raises(ValueError, ohe.transform, X_bad)
+
+
 def test_one_hot_encoder_attr():
     X = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]]
 
@@ -1533,9 +1542,8 @@ def test_one_hot_encoder_dense():
     # discover max values automatically
     X_trans = enc.fit_transform(X)
     assert_equal(X_trans.shape, (2, 5))
-    assert_array_equal(enc.active_features_,
-                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
-    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])
+    assert_array_equal(enc.active_features_, np.arange(5))
+    assert_array_equal(enc.feature_indices_, [0, 2, 4, 5])
 
     # check outcome
     assert_array_equal(X_trans,
@@ -1632,12 +1640,14 @@ def test_one_hot_encoder_categorical_features():
 def test_one_hot_encoder_unknown_transform():
     X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
     y = np.array([[4, 1, 1]])
+    X_orig = X.copy()  # Verify X is not modified
 
     # Test that one hot encoder raises error for unknown features
     # present during transform.
     oh = OneHotEncoder(handle_unknown='error-strict')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
+    assert_array_equal(X, X_orig)
 
     # Test the ignore option, ignores unknown features.
     oh = OneHotEncoder(handle_unknown='ignore')
@@ -1645,10 +1655,12 @@ def test_one_hot_encoder_unknown_transform():
     assert_array_equal(
         oh.transform(y).toarray(),
         np.array([[0.,  0.,  0.,  0.,  1.,  0.,  0.]]))
+    assert_array_equal(X, X_orig)
 
     X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]],
                  dtype=np.object)
     y = np.array([['ET', 1, 1]], dtype=np.object)
+    X_orig = X.copy()  # Verify X is not modified
 
     # Test that one hot encoder raises error for unknown features
     # present during transform.
@@ -1672,6 +1684,7 @@ def test_one_hot_encoder_unknown_transform():
     assert_array_equal(
         oh.transform(y).toarray(),
         np.array([[0., 0., 0., 0., 0., 1., 0., 0.]]))
+    assert_array_equal(X, X_orig)
 
     # Raise error if handle_unknown is neither ignore nor error.
     oh = OneHotEncoder(handle_unknown='42')

From fd11366f3e4e238e7188488c9b376c445b165c45 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 26 Apr 2017 14:16:03 -0500
Subject: [PATCH 22/36] WIP

---
 sklearn/preprocessing/data.py            | 136 +++++++++++++----------
 sklearn/preprocessing/tests/test_data.py |  10 +-
 2 files changed, 86 insertions(+), 60 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 99c7f16025c48..1785e85fd6214 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1692,7 +1692,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    values : 'auto', int, list of ints, or list of lists of objects
+    values : 'auto', 'auto-strict', int, List[int], or List[List[objects]]
         - 'auto' : determine set of values from training data. See the
           documentation of `handle_unknown` for which values are considered
           acceptable.
@@ -1785,43 +1785,39 @@ def fit(self, X, y=None):
         -------
         self
         """
-        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=False)
+        X = check_array(X, dtype=None, accept_sparse='csc', copy=False)
         n_samples, n_features = X.shape
 
         _apply_selected(X, self._fit, dtype=self.dtype,
                         selected=self.categorical_features, copy=False,
                         return_val=False)
 
+        # Record which columns of output data
+        # correspond to each column of input data
         self.feature_index_range_ = np.zeros((n_features, 2), dtype=np.int)
 
         if isinstance(self.categorical_features, six.string_types) and \
            self.categorical_features == "all":
-
             categorical = np.ones(n_features, dtype=bool)
         else:
             categorical = np.zeros(n_features, dtype=bool)
             categorical[np.asarray(self.categorical_features)] = True
 
-        num_cat = np.sum(categorical)
         start, end = 0, 0
-        cat_index = 0
-
-        for i in range(n_features):
-            if categorical[i]:
-                le = self._label_encoders[cat_index]
-                end = start + len(le.classes_)
-                self.feature_index_range_[i] = start, end
-                start = end
-                cat_index += 1
-
-        indices = np.arange(start, start + n_features - num_cat)
-        self.feature_index_range_[~categorical, 0] = indices
-        indices += 1
-        self.feature_index_range_[~categorical, 1] = indices
+        for i_cat, i_feat in enumerate(np.where(categorical)[0]):
+            le = self._label_encoders[i_cat]
+            end = start + len(le.classes_)
+            self.feature_index_range_[i_feat] = start, end
+            start = end
+        num_cat = np.sum(categorical)
+        non_cat_indices = np.arange(start, start + n_features - num_cat)
+        self.feature_index_range_[~categorical, 0] = non_cat_indices
+        self.feature_index_range_[~categorical, 1] = non_cat_indices + 1
 
+        # Record which column of input data corresponds
+        # to each column of output data
         n_expanded_cols = end + n_features - num_cat
         self.one_hot_feature_index_ = np.empty(n_expanded_cols, dtype=np.int)
-
         for i in range(n_features):
             s, e = self.feature_index_range_[i]
             self.one_hot_feature_index_[s:e] = i
@@ -1832,12 +1828,14 @@ def _check_values(self, values, n_features, max_values):
         """Verify that the input `values` is valid
 
         Raises ValueError or TypeError for bad `values`.
+        Assume that lists of integers have been converted
+        to lists of arrays before getting here.
         """
         error_msg = ("`values` should be 'auto', an integer, a list of"
                      " integers or a list of list")
         if isinstance(values, six.string_types):
             # Input "auto": determine values automatically
-            if values != 'auto':
+            if values not in ['auto', 'auto-strict']:
                 raise ValueError(error_msg)
         elif (isinstance(values, list) or
                 isinstance(values, np.ndarray)):
@@ -1845,30 +1843,34 @@ def _check_values(self, values, n_features, max_values):
                 raise ValueError("Shape mismatch: if values is a list,"
                                  " it has to be of length (n_features).")
 
-            # Either all entries are scalars or none are
+            # All entries are arrays or lists
             scalar_vals = [np.isscalar(val) for val in values]
-            if not (all(scalar_vals) or not any(scalar_vals)):
+            if any(scalar_vals):
                 raise ValueError(error_msg)
         elif not np.isscalar(values):
             raise TypeError(error_msg)
-
+        """
         # Validate input data against user-supplied categories
-        if not np.isscalar(values) and np.isscalar(values[0]):
+        if not np.isscalar(values):
             too_big = np.zeros(n_features, dtype=bool)
             for i_col in range(n_features):
                 if not np.isfinite(max_values[i_col]):
                     # String features; don't bounds-check
                     continue
-                if max_values[i_col] >= values[i_col]:
+                if max_values[i_col] > max(values[i_col]):
                     too_big[i_col] = True
 
             if too_big.any():
                 msg = 'Value(s) %s out of bounds for feature(s) %s'
                 raise ValueError(msg % (max_values[too_big],
                                         np.where(too_big)[0]))
-
+        """
     def _check_features_greater_than_zero(self, X):
         """Raise a ValueError if X has numerical values less than 0"""
+        if X.dtype.kind == 'U':
+            # Don't check string arrays
+            return
+
         if sparse.issparse(X):
             min_values, _ = sparse_min_max(X, axis=0)
         else:
@@ -1884,15 +1886,7 @@ def _check_features_greater_than_zero(self, X):
             raise ValueError('Column(s) %s have numerical values less '
                              'than zero.', np.where(lt_zero)[0])
 
-    def _fit(self, X):
-        """Assumes `X` contains only categorical features"""
-        n_samples, n_features = X.shape
-
-        self._n_features = n_features
-        self._label_encoders = [LabelEncoder() for i in range(n_features)]
-        self._set_max_values(X)
-        self._check_features_greater_than_zero(X)
-
+    def _initialize_values(self):
         # Set up and check user-input categories.
         if self.n_values is not None:
             warnings.warn('`n_values` has been renamed to `values`.'
@@ -1900,26 +1894,60 @@ def _fit(self, X):
                           'and will be removed in version 0.21, use the'
                           'parameter `values` instead and specify the '
                           'expected values for each feature')
-            self._values = self.n_values
+            values = self.n_values
         else:
-            self._values = self.values
-        if (not isinstance(self._values, six.string_types) and
-                np.isscalar(self._values)):
-            # Expect all categoricals to be integers with max `values`
-            self._values = np.ones(n_features, dtype=np.int) * self._values
+            values = self.values
+
+        # Convert `int` and `Sequence[int]` inputs to `List[Array[int]]`
+        if (not isinstance(values, six.string_types) and
+                np.isscalar(values)):
+            values = np.ones(self._n_features, dtype=int) * values
+        if (not isinstance(values, six.string_types) and
+                np.isscalar(values[0])):
+            values = [np.arange(v, dtype=np.int) for v in values]
+
+        return values
+
+    def _fit(self, X):
+        """Assumes `X` contains only categorical features"""
+        n_samples, n_features = X.shape
+
+        self._n_features = n_features
+        self._label_encoders = [LabelEncoder() for i in range(n_features)]
+        self._set_max_values(X)
+        self._check_features_greater_than_zero(X)
+        self._values = self._initialize_values()
         self._check_values(self._values, n_features, self._max_values)
 
+        _auto_int_classes = n_features * [None]
+
         # Fit on categorical features in the data
         for i in range(n_features):
             le = self._label_encoders[i]
 
             if np.isscalar(self._values) and self._values == 'auto':
-                le.fit(X[:, i])
+                if (not isinstance(X[0, i], six.string_types) and
+                        int(X[0, i]) == X[0, i]):
+                    _auto_int_classes[i] = np.unique(X[:, i])
+                    n_classes = np.max(_auto_int_classes[i]) + 1
+                    le.fit(np.arange(n_classes))
+                else:
+                    le.fit(X[:, i])
             else:
-                if isinstance(self._values[i], list):
-                    le.fit(self._values[i])
-                elif np.isscalar(self._values[i]):
-                    le.fit(np.arange(self._values[i], dtype=np.int))
+                le.fit(self._values[i])
+
+        if np.isscalar(self._values) and self._values == 'auto':
+            active_features = []
+            for i_col, int_classes in enumerate(_auto_int_classes):
+                if int_classes is None:
+                    n_classes = len(self._label_encoders[i_col].classes_)
+                    active_features.append(np.ones(n_classes, dtype=bool))
+                else:
+                    n_classes = max(self._label_encoders[i_col].classes_) + 1
+                    this_col_mask = np.zeros(n_classes, dtype=bool)
+                    this_col_mask[int_classes] = True
+                    active_features.append(this_col_mask)
+            self.active_features_ = np.where(np.hstack(active_features))[0]
 
     def _set_max_values(self, X):
         """Inspect input data to determine the maximum value in each column"""
@@ -1951,10 +1979,10 @@ def transform(self, X, y=None):
                         "'error-strict', or 'ignore', got %s")
             raise ValueError(template % self.handle_unknown)
 
-        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=False)
+        X = check_array(X, accept_sparse='csc', dtype=None, copy=False)
 
-        return _apply_selected(X, self._transform, copy=False,
-                               selected=self.categorical_features)
+        return _apply_selected(X, self._transform, dtype=self.dtype,
+                               selected=self.categorical_features, copy=False)
 
     def _transform(self, X):
         """Assumes `X` contains only categorical features."""
@@ -2002,14 +2030,10 @@ def _transform(self, X):
                                 shape=(n_samples, indices[-1]),
                                 dtype=self.dtype).tocsr()
 
-        return out if self.sparse else out.toarray()
+        if np.isscalar(self._values) and self._values == 'auto':
+            out = out[:, self.active_features_]
 
-    @property
-    def active_features_(self):
-        warnings.warn('The property `active_features_` is deprecated and'
-                      ' will be removed in version 0.21')
-        n_features_out = sum([len(le.classes_) for le in self._label_encoders])
-        return np.arange(n_features_out)
+        return out if self.sparse else out.toarray()
 
     @property
     def feature_indices_(self):
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 357951b87a370..90f0d1670441c 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1461,8 +1461,9 @@ def test_one_hot_encoder_sparse():
     # discover max values automatically
     X_trans = enc.fit_transform(X).toarray()
     assert_equal(X_trans.shape, (2, 5))
-    assert_array_equal(enc.active_features_, np.arange(5))
-    assert_array_equal(enc.feature_indices_, [0, 2, 4, 5])
+    assert_array_equal(enc.active_features_,
+                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
+    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])
 
     # check outcome
     assert_array_equal(X_trans,
@@ -1542,8 +1543,9 @@ def test_one_hot_encoder_dense():
     # discover max values automatically
     X_trans = enc.fit_transform(X)
     assert_equal(X_trans.shape, (2, 5))
-    assert_array_equal(enc.active_features_, np.arange(5))
-    assert_array_equal(enc.feature_indices_, [0, 2, 4, 5])
+    assert_array_equal(enc.active_features_,
+                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
+    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])
 
     # check outcome
     assert_array_equal(X_trans,

From b96a8d2d0a7cf5a25a7f1fc5e106024797fbb894 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 26 Apr 2017 16:07:32 -0500
Subject: [PATCH 23/36] Remove error-strict, add auto-strict

Also restore `n_values_` and `active_features_` attributes.
---
 sklearn/preprocessing/data.py            | 222 ++++++++++-------------
 sklearn/preprocessing/tests/test_data.py |  44 ++---
 2 files changed, 120 insertions(+), 146 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 1785e85fd6214..f8b251ea3499f 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1693,9 +1693,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     Parameters
     ----------
     values : 'auto', 'auto-strict', int, List[int], or List[List[objects]]
-        - 'auto' : determine set of values from training data. See the
-          documentation of `handle_unknown` for which values are considered
-          acceptable.
+        - 'auto' : Determine set of values from training data.
+            If values are integers, then allowed values will be between
+            0 and the maximum value in the data.
+        - 'auto-strict' : Determine set of values from the training data.
+            Only values in the original training data are valid.
         - int : values are in ``range(values)`` for all features
         - list of ints : values for feature ``i`` are in ``range(values[i])``
         - list of lists : values for feature ``i`` are in ``values[i]``
@@ -1715,13 +1717,10 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.
 
-    handle_unknown : str, 'error', 'error-strict', or 'ignore'
-
+    handle_unknown : str, 'error' or 'ignore'
         - 'ignore': Ignore all unknown feature values.
-        - 'error': Raise an error when the value of a feature is more than the
-          maximum value seen during fit.
-        - 'error-strict': Raise an error when the value of a feature is unseen
-          during`fit`.
+        - 'error': Raise an error when the value of a feature was not
+            in the original fit data (or given through ``values``).
 
     Attributes
     ----------
@@ -1733,6 +1732,14 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         ``one_hot_feature_index_[i]`` specifies which feature of the input
         is encoded by column `i` in the one-hot encoded array.
 
+    active_features_ : array
+        Indices for active features, meaning values that actually occur
+        in the training set. Only available when ``values`` is ``'auto'``.
+
+    n_values_ : array of shape (n_features,)
+        Number of categories per feature. Has value `0` for
+        non-categorical features.
+
     Examples
     --------
     Given a dataset with three features and four samples, we let the encoder
@@ -1741,11 +1748,18 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     >>> from sklearn.preprocessing import OneHotEncoder
     >>> enc = OneHotEncoder()
-    >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) \
+    >>> enc.fit(np.array([['cat', 4], ['mouse', 15], ['dog', 17]], dtype='O'))\
         # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
         OneHotEncoder(categorical_features='all',
            dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True, values='auto')
+    >>> enc.n_values_
+    array([3, 18])
+    >>> enc.feature_index_range_
+    array([[ 0, 3],
+           [ 3, 6]])
+    >>> enc.one_hot_feature_index_
+    array([0, 0, 0, 1, 1, 1])
     >>> enc.transform([['dog', 4]]).toarray()
     array([[ 0.,  1.,  0.,  1.,  0.,  0.]])
 
@@ -1787,10 +1801,10 @@ def fit(self, X, y=None):
         """
         X = check_array(X, dtype=None, accept_sparse='csc', copy=False)
         n_samples, n_features = X.shape
+        self.n_features_ = n_features
 
-        _apply_selected(X, self._fit, dtype=self.dtype,
-                        selected=self.categorical_features, copy=False,
-                        return_val=False)
+        _apply_selected(X, self._fit, dtype=self.dtype, return_val=False,
+                        selected=self.categorical_features, copy=False)
 
         # Record which columns of output data
         # correspond to each column of input data
@@ -1805,8 +1819,10 @@ def fit(self, X, y=None):
 
         start, end = 0, 0
         for i_cat, i_feat in enumerate(np.where(categorical)[0]):
-            le = self._label_encoders[i_cat]
-            end = start + len(le.classes_)
+            if np.isscalar(self._values) and self._values == 'auto':
+                end = start + self.n_active_features_[i_cat]
+            else:
+                end = start + len(self._label_encoders[i_cat].classes_)
             self.feature_index_range_[i_feat] = start, end
             start = end
         num_cat = np.sum(categorical)
@@ -1822,17 +1838,24 @@ def fit(self, X, y=None):
             s, e = self.feature_index_range_[i]
             self.one_hot_feature_index_[s:e] = i
 
+        # Count categories per feature
+        n_val = len(non_cat_indices) * [0]
+        if hasattr(self, '_label_encoders'):
+            n_val = [len(le.classes_) for le in self._label_encoders] + n_val
+        self.n_values_ = np.array(n_val)
+
         return self
 
-    def _check_values(self, values, n_features, max_values):
+    def _check_values(self, values, n_features):
         """Verify that the input `values` is valid
 
         Raises ValueError or TypeError for bad `values`.
-        Assume that lists of integers have been converted
-        to lists of arrays before getting here.
+        Assume that integers or lists of integers have been
+        converted to lists of arrays before getting here.
+        This should run after `_initialize_values`.
         """
-        error_msg = ("`values` should be 'auto', an integer, a list of"
-                     " integers or a list of list")
+        error_msg = ("`values` should be 'auto', 'auto-strict', an integer, "
+                     "a list of integers or a list of list")
         if isinstance(values, six.string_types):
             # Input "auto": determine values automatically
             if values not in ['auto', 'auto-strict']:
@@ -1847,47 +1870,11 @@ def _check_values(self, values, n_features, max_values):
             scalar_vals = [np.isscalar(val) for val in values]
             if any(scalar_vals):
                 raise ValueError(error_msg)
-        elif not np.isscalar(values):
-            raise TypeError(error_msg)
-        """
-        # Validate input data against user-supplied categories
-        if not np.isscalar(values):
-            too_big = np.zeros(n_features, dtype=bool)
-            for i_col in range(n_features):
-                if not np.isfinite(max_values[i_col]):
-                    # String features; don't bounds-check
-                    continue
-                if max_values[i_col] > max(values[i_col]):
-                    too_big[i_col] = True
-
-            if too_big.any():
-                msg = 'Value(s) %s out of bounds for feature(s) %s'
-                raise ValueError(msg % (max_values[too_big],
-                                        np.where(too_big)[0]))
-        """
-    def _check_features_greater_than_zero(self, X):
-        """Raise a ValueError if X has numerical values less than 0"""
-        if X.dtype.kind == 'U':
-            # Don't check string arrays
-            return
-
-        if sparse.issparse(X):
-            min_values, _ = sparse_min_max(X, axis=0)
         else:
-            min_values = np.min(X, axis=0)
-        lt_zero = np.zeros(X.shape[1], dtype=bool)
-        for i_value, value in enumerate(min_values):
-            if isinstance(value, six.string_types):
-                continue
-            elif value < 0:
-                lt_zero[i_value] = True
-
-        if np.any(lt_zero):
-            raise ValueError('Column(s) %s have numerical values less '
-                             'than zero.', np.where(lt_zero)[0])
+            raise TypeError(error_msg)
 
     def _initialize_values(self):
-        # Set up and check user-input categories.
+        """Standardize the `values` input"""
         if self.n_values is not None:
             warnings.warn('`n_values` has been renamed to `values`.'
                           'The parameter `n_values` has been deprecated '
@@ -1901,7 +1888,7 @@ def _initialize_values(self):
         # Convert `int` and `Sequence[int]` inputs to `List[Array[int]]`
         if (not isinstance(values, six.string_types) and
                 np.isscalar(values)):
-            values = np.ones(self._n_features, dtype=int) * values
+            values = np.ones(self.n_features_cat_, dtype=int) * values
         if (not isinstance(values, six.string_types) and
                 np.isscalar(values[0])):
             values = [np.arange(v, dtype=np.int) for v in values]
@@ -1911,32 +1898,41 @@ def _initialize_values(self):
     def _fit(self, X):
         """Assumes `X` contains only categorical features"""
         n_samples, n_features = X.shape
-
-        self._n_features = n_features
+        self.n_features_cat_ = n_features
         self._label_encoders = [LabelEncoder() for i in range(n_features)]
-        self._set_max_values(X)
-        self._check_features_greater_than_zero(X)
-        self._values = self._initialize_values()
-        self._check_values(self._values, n_features, self._max_values)
 
-        _auto_int_classes = n_features * [None]
+        self._values = self._initialize_values()
+        self._check_values(self._values, n_features)
 
         # Fit on categorical features in the data
+        _auto_int_classes = n_features * [None]
         for i in range(n_features):
             le = self._label_encoders[i]
 
             if np.isscalar(self._values) and self._values == 'auto':
+                # For integer features, allow integers between
+                # 0 and column max. The transform will still only
+                # return dummy columns for integers present in training data.
                 if (not isinstance(X[0, i], six.string_types) and
                         int(X[0, i]) == X[0, i]):
-                    _auto_int_classes[i] = np.unique(X[:, i])
+                    _auto_int_classes[i] = np.unique(X[:, i]).astype(int)
+                    if np.min(_auto_int_classes[i]) < 0:
+                        msg = ('Column %s has value(s) less than zero; all '
+                               'integer columns must have minimum value '
+                               '0 when value="auto".')
+                        raise ValueError(msg)
                     n_classes = np.max(_auto_int_classes[i]) + 1
                     le.fit(np.arange(n_classes))
                 else:
                     le.fit(X[:, i])
+            elif np.isscalar(self._values) and self._values == 'auto-strict':
+                le.fit(X[:, i])
             else:
                 le.fit(self._values[i])
 
         if np.isscalar(self._values) and self._values == 'auto':
+            # Record which integer features were present in training
+            # data so we can restrict output columns.
             active_features = []
             for i_col, int_classes in enumerate(_auto_int_classes):
                 if int_classes is None:
@@ -1947,20 +1943,10 @@ def _fit(self, X):
                     this_col_mask = np.zeros(n_classes, dtype=bool)
                     this_col_mask[int_classes] = True
                     active_features.append(this_col_mask)
+            self.n_active_features_ = np.array([a.sum()
+                                                for a in active_features])
             self.active_features_ = np.where(np.hstack(active_features))[0]
 
-    def _set_max_values(self, X):
-        """Inspect input data to determine the maximum value in each column"""
-        if sparse.issparse(X):
-            min_values, max_values = sparse_min_max(X, axis=0)
-        else:
-            max_values = np.max(X, axis=0)
-        self._max_values = np.zeros(len(max_values)) + np.nan
-        for i_value, value in enumerate(max_values):
-            if isinstance(value, six.string_types):
-                continue
-            self._max_values[i_value] = value
-
     def transform(self, X, y=None):
         """Encode the selected categorical features using the one-hot scheme.
 
@@ -1974,12 +1960,15 @@ def transform(self, X, y=None):
         out : array, shape[n_samples, n_features_new]
             `X` encoded using the one-hot scheme.
         """
-        if self.handle_unknown not in ['ignore', 'error', 'error-strict']:
-            template = ("handle_unknown should be either 'error', "
-                        "'error-strict', or 'ignore', got %s")
+        if self.handle_unknown not in ['ignore', 'error']:
+            template = ("handle_unknown should be either 'error' "
+                        "or 'ignore', got %s")
             raise ValueError(template % self.handle_unknown)
 
         X = check_array(X, accept_sparse='csc', dtype=None, copy=False)
+        if X.shape[1] != self.n_features_:
+            raise ValueError("Input data must have %s "
+                             "features." % self.n_features_)
 
         return _apply_selected(X, self._transform, dtype=self.dtype,
                                selected=self.categorical_features, copy=False)
@@ -1987,44 +1976,37 @@ def transform(self, X, y=None):
     def _transform(self, X):
         """Assumes `X` contains only categorical features."""
         n_samples, n_features = X.shape
-        X_int = np.zeros_like(X, dtype=np.int)
-        X_mask = np.ones_like(X, dtype=np.bool)
+        X_int = np.zeros_like(X, dtype=np.int32)
 
-        for i in range(n_features):
-            le = self._label_encoders[i]
-            valid_mask = in1d(X[:, i], le.classes_)
-            if not np.all(valid_mask):
-                if self.handle_unknown in ['error', 'error-strict']:
+        # Recode all columns of input data as integers
+        if self.handle_unknown == 'error':
+            for i, le in enumerate(self._label_encoders):
+                try:
+                    X_int[:, i] = le.transform(X[:, i])
+                except ValueError:
                     diff = setdiff1d(X[:, i], le.classes_)
-                    if (self.handle_unknown == 'error-strict' or
-                            np.isfinite(self._max_values[i]) and
-                            np.any(diff >= self._max_values[i]) or
-                            np.any(diff < 0)):
-                        msg = 'Unknown feature(s) %s in column %d' % (diff, i)
-                        raise ValueError(msg)
-                    else:
-                        msg = ('Values %s for feature %d are unknown but '
-                               'in range. This will raise an error in '
-                               'future versions where "error-strict" will '
-                               'be default for `handle_unknown` parameter'
-                               % (str(diff), i))
-                        warnings.warn(FutureWarning(msg))
-
-                X_mask[:, i] = valid_mask
-                X_int[valid_mask, i] = (self._label_encoders[i]
-                                        .transform(X[valid_mask, i]))
-            else:
-                X_int[:, i] = self._label_encoders[i].transform(X[:, i])
+                    msg = 'Unknown feature(s) %s in column %d' % (diff, i)
+                    raise ValueError(msg)
+            mask = slice(None)
+        else:
+            X_mask = np.ones_like(X, dtype=np.bool)
+            for i, le in enumerate(self._label_encoders):
+                valid_mask = in1d(X[:, i], le.classes_)
+                if not np.all(valid_mask):
+                    X_mask[:, i] = valid_mask
+                    X_int[valid_mask, i] = le.transform(X[valid_mask, i])
+                else:
+                    X_int[:, i] = le.transform(X[:, i])
+            mask = X_mask.ravel()
 
-        mask = X_mask.ravel()
-        n_values = [le.classes_.shape[0] for le in self._label_encoders]
-        n_values = np.hstack([[0], n_values])
+        # Convert integer columns to sparse array of binary indicators
+        n_values = [0] + [le.classes_.shape[0] for le in self._label_encoders]
         indices = np.cumsum(n_values)
 
         column_indices = (X_int + indices[:-1]).ravel()[mask]
         row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                 n_features)[mask]
-        data = np.ones(np.sum(mask))
+        data = np.ones(len(row_indices), dtype=self.dtype)
 
         out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                 shape=(n_samples, indices[-1]),
@@ -2037,20 +2019,10 @@ def _transform(self, X):
 
     @property
     def feature_indices_(self):
+        # This is very similar to the current attribute
+        # `feature_index_range_`, but only applies to the
+        # subset of categorical features.
         warnings.warn('The property `feature_indices_` is deprecated and'
                       ' will be removed in version 0.21')
         n_categories = [len(le.classes_) for le in self._label_encoders]
         return np.cumsum([0] + n_categories)
-
-    @property
-    def n_values_(self):
-        warnings.warn('The property `n_values_` is deprecated and'
-                      ' will be removed in version 0.21')
-        # The effective number of categories is different depending on
-        # whether or not we're using the old-style behavior
-        if self.handle_unknown == 'error':
-            return np.array([np.max(le.classes_) + 1
-                             for le in self._label_encoders])
-        else:
-            return np.array([le.classes_.shape[0]
-                             for le in self._label_encoders])
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 90f0d1670441c..fb43e519eaaf3 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1492,11 +1492,11 @@ def test_one_hot_encoder_sparse():
     error_msg = re.escape("Unknown feature(s) [2] in column 1")
     assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large)
 
-    error_msg = re.escape("Value(s) [ 2.] out of bounds for feature(s) [0]")
+    error_msg = re.escape("Unknown feature(s) [2] in column 0")
     assert_raises_regex(ValueError, error_msg,
-                        OneHotEncoder(n_values=2).fit, X)
+                        OneHotEncoder(n_values=2).fit_transform, X)
     assert_raises_regex(ValueError, error_msg,
-                        OneHotEncoder(values=2).fit, X)
+                        OneHotEncoder(values=2).fit_transform, X)
 
     # test that error is raised when wrong number of features
     assert_raises(ValueError, enc.transform, X[:, :-1])
@@ -1509,8 +1509,8 @@ def test_one_hot_encoder_sparse():
 
 def test_one_hot_encoder_error_on_negative():
     # Negative numerical values in inputs should raise an exception
-    X_bad = [[-1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]]
-    X_good = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]]
+    X_bad = np.array([[-1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object)
+    X_good = np.array([[1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object)
     assert_raises(ValueError, OneHotEncoder().fit, X_bad)
 
     ohe = OneHotEncoder().fit(X_good)
@@ -1518,7 +1518,7 @@ def test_one_hot_encoder_error_on_negative():
 
 
 def test_one_hot_encoder_attr():
-    X = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]]
+    X = np.array([[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]], dtype='O')
 
     enc = OneHotEncoder()
     enc.fit(X)
@@ -1639,14 +1639,14 @@ def test_one_hot_encoder_categorical_features():
     _check_one_hot(X, X2, cat, 5)
 
 
-def test_one_hot_encoder_unknown_transform():
+def test_one_hot_encoder_unknown_transform_int():
     X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
-    y = np.array([[4, 1, 1]])
+    y = np.array([[0, 3, 1]])
     X_orig = X.copy()  # Verify X is not modified
 
     # Test that one hot encoder raises error for unknown features
     # present during transform.
-    oh = OneHotEncoder(handle_unknown='error-strict')
+    oh = OneHotEncoder(handle_unknown='error')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
     assert_array_equal(X, X_orig)
@@ -1656,9 +1656,21 @@ def test_one_hot_encoder_unknown_transform():
     oh.fit(X)
     assert_array_equal(
         oh.transform(y).toarray(),
-        np.array([[0.,  0.,  0.,  0.,  1.,  0.,  0.]]))
+        np.array([[1.,  0.,  0.,  0.,  1.,  0.,  0.]]))
     assert_array_equal(X, X_orig)
 
+    # Test that there's no error for integer features in the auto range
+    y = [[0, 1, 1]]
+    assert_array_equal(oh.transform(y).toarray(),
+                       np.array([[1.,  0.,  0.,  0.,  1.,  0.,  0.]]))
+
+    # But we do error when fit with "auto-strict"
+    oh = OneHotEncoder(values='auto-strict', handle_unknown='error')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+
+def test_one_hot_encoder_unknown_transform_object():
     X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]],
                  dtype=np.object)
     y = np.array([['ET', 1, 1]], dtype=np.object)
@@ -1666,19 +1678,9 @@ def test_one_hot_encoder_unknown_transform():
 
     # Test that one hot encoder raises error for unknown features
     # present during transform.
-    oh = OneHotEncoder(handle_unknown='error-strict')
-    oh.fit(X)
-    assert_raises(ValueError, oh.transform, y)
-
-    # Test that one hot encoder raises warning for unknown but in range
-    # features
     oh = OneHotEncoder(handle_unknown='error')
     oh.fit(X)
-    msg = ('Values [0] for feature 2 are unknown but in range. '
-           'This will raise an error in future versions where "error-strict"'
-           ' will be default for `handle_unknown` parameter')
-    assert_warns_message(FutureWarning, msg, oh.transform,
-                         np.array([['mouse', 0, 0]], dtype=np.object))
+    assert_raises(ValueError, oh.transform, y)
 
     # Test the ignore option, ignores unknown features.
     oh = OneHotEncoder(handle_unknown='ignore')

From 7902352bcd75613fb15a050d92117ef8ac5eff1e Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 26 Apr 2017 16:30:04 -0500
Subject: [PATCH 24/36] Fixes for test failures

---
 sklearn/preprocessing/data.py | 8 ++++----
 sklearn/utils/fixes.py        | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index f8b251ea3499f..c562539d4c9c4 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -19,7 +19,7 @@
 from ..utils import check_array
 from ..utils.extmath import row_norms
 from ..utils.extmath import _incremental_mean_and_var
-from ..utils.fixes import bincount, sparse_min_max
+from ..utils.fixes import bincount
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale,
@@ -1754,10 +1754,10 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
            dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True, values='auto')
     >>> enc.n_values_
-    array([3, 18])
+    array([ 3, 18])
     >>> enc.feature_index_range_
-    array([[ 0, 3],
-           [ 3, 6]])
+    array([[0, 3],
+           [3, 6]])
     >>> enc.one_hot_feature_index_
     array([0, 0, 0, 1, 1, 1])
     >>> enc.transform([['dog', 4]]).toarray()
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index fe37e9469c720..d44555503eaa2 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -240,7 +240,8 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
         ar1 = np.asarray(ar1).ravel()
         ar2 = np.asarray(ar2).ravel()
 
-        if ar1.dtype == object or ar2.dtype == object:
+        if (ar1.dtype == object or ar2.dtype == object or
+                ar1.dtype.kind == 'U' or ar2.dtype.kind == 'U'):
             return _in1d_object(ar1, ar2, invert)
 
         # This code is significantly faster when the condition is satisfied.

From 4206d797128f39902639f528e23ac2fadb991a2b Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 26 Apr 2017 17:54:21 -0500
Subject: [PATCH 25/36] ENH Handle object and string types in
 LabelEncoder.transform

NumPy v1.6's `setdiff1d` doesn't handle string types; use the backported version in `utils.fixes`.
---
 sklearn/preprocessing/label.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index f2f7d9afad347..3957cb4a63e56 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -18,7 +18,7 @@
 from ..utils.fixes import np_version
 from ..utils.fixes import sparse_min_max
 from ..utils.fixes import astype
-from ..utils.fixes import in1d
+from ..utils.fixes import in1d, setdiff1d
 from ..utils import column_or_1d
 from ..utils.validation import check_array
 from ..utils.validation import check_is_fitted
@@ -149,7 +149,7 @@ def transform(self, y):
         classes = np.unique(y)
         _check_numpy_unicode_bug(classes)
         if len(np.intersect1d(classes, self.classes_)) < len(classes):
-            diff = np.setdiff1d(classes, self.classes_)
+            diff = setdiff1d(classes, self.classes_)
             raise ValueError("y contains new labels: %s" % str(diff))
         return np.searchsorted(self.classes_, y)
 

From d96fbc63da3c106c75c72d831dead43eaa6b74d7 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 26 Apr 2017 17:56:06 -0500
Subject: [PATCH 26/36] Fix tests

---
 doc/modules/preprocessing.rst            |  2 +-
 sklearn/preprocessing/data.py            | 12 +++++++-----
 sklearn/preprocessing/tests/test_data.py |  4 ++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index f1bfba00dde01..d9d6209f123d6 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -400,7 +400,7 @@ Continuing the example above::
   >>> enc.fit([['female', 'from US', 'uses Chrome'],
   ... ['male', 'from Asia', 'uses Firefox']])  \
   ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-  OneHotEncoder(categorical_features='all', copy=True,
+  OneHotEncoder(categorical_features='all',
          dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
          sparse=True, values='auto')
   >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray()
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index c562539d4c9c4..cf6f3b214a01a 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -7,7 +7,6 @@
 # License: BSD 3 clause
 
 from itertools import chain, combinations
-import numbers
 import warnings
 from itertools import combinations_with_replacement as combinations_w_r
 
@@ -27,7 +26,7 @@
                                  min_max_axis)
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
 from .label import LabelEncoder
-from ..utils.fixes import in1d, setdiff1d
+from ..utils.fixes import in1d
 
 
 zip = six.moves.zip
@@ -1983,9 +1982,12 @@ def _transform(self, X):
             for i, le in enumerate(self._label_encoders):
                 try:
                     X_int[:, i] = le.transform(X[:, i])
-                except ValueError:
-                    diff = setdiff1d(X[:, i], le.classes_)
-                    msg = 'Unknown feature(s) %s in column %d' % (diff, i)
+                except ValueError as err:
+                    orig_msg = str(err)
+                    if not orig_msg.startswith('y contains'):
+                        raise
+                    else:
+                        msg = 'Column %d %s' % (i, orig_msg[2:])
                     raise ValueError(msg)
             mask = slice(None)
         else:
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index fb43e519eaaf3..34077f2694388 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1489,10 +1489,10 @@ def test_one_hot_encoder_sparse():
     # test that an error is raised when out of bounds:
     X_too_large = [[0, 2, 1], [0, 1, 1]]
     assert_raises(ValueError, enc.transform, X_too_large)
-    error_msg = re.escape("Unknown feature(s) [2] in column 1")
+    error_msg = re.escape("Column 1 contains new labels: [2]")
     assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large)
 
-    error_msg = re.escape("Unknown feature(s) [2] in column 0")
+    error_msg = re.escape("Column 0 contains new labels: [2]")
     assert_raises_regex(ValueError, error_msg,
                         OneHotEncoder(n_values=2).fit_transform, X)
     assert_raises_regex(ValueError, error_msg,

From 0807604f59b4a769c3ce3dfff4a26a5475497370 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 26 Apr 2017 19:31:59 -0500
Subject: [PATCH 27/36] Fix for doc test and scipy 0.11 sparse behavior

---
 doc/modules/preprocessing.rst | 2 +-
 sklearn/preprocessing/data.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index d9d6209f123d6..cf5c312eb5e06 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -426,7 +426,7 @@ features, one has to explicitly set ``n_values``. For example,
     >>> enc.fit([['female', 'from US', 'uses Chrome'],
     ... ['male', 'from Asia', 'uses Internet Explorer']]) \
     ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    OneHotEncoder(categorical_features='all', copy=True,
+    OneHotEncoder(categorical_features='all',
            dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True,
            values=[...])
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index cf6f3b214a01a..1696d6846b419 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1663,10 +1663,11 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
         # All features selected.
         return transform(X)
     else:
-        X_sel = transform(X[:, sel])
+        ind = np.arange(n_features)
+        X_sel = transform(X[:, ind[sel]])
 
         if return_val:
-            X_not_sel = X[:, not_sel].astype(dtype)
+            X_not_sel = X[:, ind[not_sel]].astype(dtype)
             if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
                 return sparse.hstack((X_sel, X_not_sel))
             else:

From b6d198ad291e6574527fd50b8e036435be76778e Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 26 Apr 2017 19:51:10 -0500
Subject: [PATCH 28/36] ENH Enforce dtypes in _apply_selected

---
 sklearn/preprocessing/data.py            | 11 +++++++----
 sklearn/preprocessing/tests/test_data.py |  9 +++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 1696d6846b419..ac705501f3180 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1645,10 +1645,11 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
         X = X.copy()
 
     if isinstance(selected, six.string_types) and selected == "all":
-        return transform(X)
+        X_trans = transform(X)
+        return X_trans.astype(dtype) if return_val else None
 
     if len(selected) == 0:
-        return X
+        return X.astype(dtype) if return_val else None
 
     n_features = X.shape[1]
     sel = np.zeros(n_features, dtype=bool)
@@ -1658,15 +1659,17 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
 
     if n_selected == 0:
         # No features selected.
-        return X
+        return X.astype(dtype) if return_val else None
     elif n_selected == n_features:
         # All features selected.
-        return transform(X)
+        X_trans = transform(X)
+        return X_trans.astype(dtype) if return_val else None
     else:
         ind = np.arange(n_features)
         X_sel = transform(X[:, ind[sel]])
 
         if return_val:
+            X_sel = X_sel.astype(dtype)
             X_not_sel = X[:, ind[not_sel]].astype(dtype)
             if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
                 return sparse.hstack((X_sel, X_not_sel))
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 34077f2694388..770b5ff2e4af8 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1553,10 +1553,11 @@ def test_one_hot_encoder_dense():
                                  [1., 0., 1., 0., 1.]]))
 
 
-def _check_apply_selected(X, X_expected, sel):
+def _check_apply_selected(X, X_expected, sel, dtype=np.float):
     for M in (X, sparse.csr_matrix(X)):
-        Xtr = _apply_selected(M, Binarizer().transform, sel)
+        Xtr = _apply_selected(M, Binarizer().transform, sel, dtype=dtype)
         assert_array_equal(toarray(Xtr), X_expected)
+        assert_equal(toarray(Xtr).dtype, dtype)
 
 
 def test_transform_selected():
@@ -1565,14 +1566,18 @@ def test_transform_selected():
     X_expected = [[1, 2, 1], [0, 1, 1]]
     _check_apply_selected(X, X_expected, [0])
     _check_apply_selected(X, X_expected, [True, False, False])
+    _check_apply_selected(X, X_expected, [True, False, False], dtype=np.int)
 
     X_expected = [[1, 1, 1], [0, 1, 1]]
     _check_apply_selected(X, X_expected, [0, 1, 2])
+    _check_apply_selected(X, X_expected, [0, 1, 2], dtype=np.int)
     _check_apply_selected(X, X_expected, [True, True, True])
     _check_apply_selected(X, X_expected, "all")
+    _check_apply_selected(X, X_expected, "all", dtype=np.int)
 
     _check_apply_selected(X, X, [])
     _check_apply_selected(X, X, [False, False, False])
+    _check_apply_selected(X, X, [False, False, False], dtype=np.int)
 
 
 def test_transform_selected_copy_arg():

From 7db5cedb62f055835876692dd3481cbe22990af4 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 26 Apr 2017 20:30:03 -0500
Subject: [PATCH 29/36] TST More tests for OneHotEncoder

---
 sklearn/preprocessing/tests/test_data.py | 81 ++++++++++++++++--------
 1 file changed, 56 insertions(+), 25 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 770b5ff2e4af8..3d361d0d8af91 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1524,16 +1524,19 @@ def test_one_hot_encoder_attr():
     enc.fit(X)
     assert_array_equal(enc.feature_index_range_, [[0, 3], [3, 5], [5, 7]])
     assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 1, 1, 2, 2])
+    assert_array_equal(enc.n_values_, [11, 16, 2])
 
-    enc = OneHotEncoder(categorical_features=[True, False, True])
-    enc.fit(X)
-    assert_array_equal(enc.feature_index_range_, [[0, 3], [5, 6], [3, 5]])
-    assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 2, 2, 1])
+    oh = OneHotEncoder('auto-strict', categorical_features=[True, False, True])
+    oh.fit(X)
+    assert_array_equal(oh.feature_index_range_, [[0, 3], [5, 6], [3, 5]])
+    assert_array_equal(oh.one_hot_feature_index_, [0, 0, 0, 2, 2, 1])
+    assert_array_equal(oh.n_values_, [3, 2, 0])
 
     enc = OneHotEncoder(categorical_features=[False, False, True])
     enc.fit(X)
     assert_array_equal(enc.feature_index_range_, [[2, 3], [3, 4], [0, 2]])
     assert_array_equal(enc.one_hot_feature_index_, [2, 2, 0, 1])
+    assert_array_equal(enc.n_values_, [2, 0, 0])
 
 
 def test_one_hot_encoder_dense():
@@ -1604,7 +1607,7 @@ def _run_one_hot(X, X2, cat):
     return Xtr, X2tr
 
 
-def _check_one_hot(X, X2, cat, n_features):
+def _check_one_hot(X, X2, cat, n_features, X_exp, X2_exp):
     ind = np.where(cat)[0]
     # With mask
     A, B = _run_one_hot(X, X2, cat)
@@ -1619,6 +1622,9 @@ def _check_one_hot(X, X2, cat, n_features):
     assert_array_equal(toarray(A), toarray(C))
     assert_array_equal(toarray(B), toarray(D))
 
+    assert_array_equal(toarray(A), X_exp)
+    assert_array_equal(toarray(B), X2_exp)
+
 
 def test_one_hot_encoder_string():
     X = [['cat', 'domestic'], ['wolf', 'wild']]
@@ -1633,15 +1639,30 @@ def test_one_hot_encoder_categorical_features():
     X2 = np.array([[1, 1, 1]])
 
     cat = [True, False, False]
-    _check_one_hot(X, X2, cat, 4)
+    X_exp = [[0, 1, 2, 1], [1, 0, 1, 1]]
+    X2_exp = [[0, 0, 1, 1]]
+    _check_one_hot(X, X2, cat, 4, X_exp, X2_exp)
 
     # Edge case: all non-categorical
     cat = [False, False, False]
-    _check_one_hot(X, X2, cat, 3)
+    _check_one_hot(X, X2, cat, 3, X, X2)
 
     # Edge case: all categorical
+    X_exp = [[0, 1, 0, 1, 1], [1, 0, 1, 0, 1]]
+    X2_exp = [[0, 0, 1, 0, 1]]
     cat = [True, True, True]
-    _check_one_hot(X, X2, cat, 5)
+    _check_one_hot(X, X2, cat, 5, X_exp, X2_exp)
+
+
+def test_one_hot_encoder_dtypes():
+    # Verify that we can control the output dtype of the transform
+    X = np.array([['cat', 2.1, 1], ['dog', 1, 3], ['mouse', 1, 2]], dtype='O')
+
+    for dtype in [np.int8, np.float, np.bool]:
+        for sparse in [True, False]:
+            oh = OneHotEncoder('auto-strict', dtype=dtype, sparse=sparse)
+            X_tr = oh.fit_transform(X)
+            assert_equal(X_tr.dtype, dtype)
 
 
 def test_one_hot_encoder_unknown_transform_int():
@@ -1656,29 +1677,26 @@ def test_one_hot_encoder_unknown_transform_int():
     assert_raises(ValueError, oh.transform, y)
     assert_array_equal(X, X_orig)
 
-    # Test the ignore option, ignores unknown features.
-    oh = OneHotEncoder(handle_unknown='ignore')
-    oh.fit(X)
-    assert_array_equal(
-        oh.transform(y).toarray(),
-        np.array([[1.,  0.,  0.,  0.,  1.,  0.,  0.]]))
-    assert_array_equal(X, X_orig)
-
     # Test that there's no error for integer features in the auto range
     y = [[0, 1, 1]]
-    assert_array_equal(oh.transform(y).toarray(),
-                       np.array([[1.,  0.,  0.,  0.,  1.,  0.,  0.]]))
+    assert_array_equal(toarray(oh.transform(y)), [[1,  0,  0,  0,  1,  0,  0]])
 
     # But we do error when fit with "auto-strict"
     oh = OneHotEncoder(values='auto-strict', handle_unknown='error')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
 
+    # Test the ignore option, ignores unknown features.
+    oh = OneHotEncoder(handle_unknown='ignore')
+    oh.fit(X)
+    assert_array_equal(toarray(oh.transform(y)), [[1,  0,  0,  0,  1,  0,  0]])
+    assert_array_equal(X, X_orig)
+
 
 def test_one_hot_encoder_unknown_transform_object():
-    X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]],
+    X = np.array([['cat', 2.1, 1], ['dog', 1.1, 3], ['mouse', 1.1, 2]],
                  dtype=np.object)
-    y = np.array([['ET', 1, 1]], dtype=np.object)
+    y = np.array([['ET', 2.1, 1]], dtype=np.object)
     X_orig = X.copy()  # Verify X is not modified
 
     # Test that one hot encoder raises error for unknown features
@@ -1686,18 +1704,31 @@ def test_one_hot_encoder_unknown_transform_object():
     oh = OneHotEncoder(handle_unknown='error')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
+    assert_array_equal(X, X_orig)
 
     # Test the ignore option, ignores unknown features.
     oh = OneHotEncoder(handle_unknown='ignore')
     oh.fit(X)
-    assert_array_equal(
-        oh.transform(y).toarray(),
-        np.array([[0., 0., 0., 0., 0., 1., 0., 0.]]))
+    assert_array_equal(oh.transform(y).toarray(), [[0, 0, 0, 0, 1, 1, 0, 0]])
     assert_array_equal(X, X_orig)
 
     # Raise error if handle_unknown is neither ignore nor error.
-    oh = OneHotEncoder(handle_unknown='42')
-    oh.fit(X)
+    oh = OneHotEncoder(handle_unknown='42').fit(X)
+    assert_raises(ValueError, oh.transform, y)
+    assert_array_equal(X, X_orig)
+
+    # Check that in-range integer features are okay in object arrays
+    y = np.array([['cat', 2.1, 0]], dtype=np.object)
+    oh = OneHotEncoder(handle_unknown='error').fit(X)
+    assert_array_equal(oh.transform(y).toarray(), [[1, 0, 0, 0, 1, 0, 0, 0]])
+
+    # "in-range" but not in-training-data float features will error
+    y = np.array([['cat', 1.8, 1]], dtype=np.object)
+    oh = OneHotEncoder(handle_unknown='error').fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+    # A transform on in-range integers errors in 'auto-strict' mode.
+    oh = OneHotEncoder(values='auto-strict', handle_unknown='error').fit(X)
     assert_raises(ValueError, oh.transform, y)
 
 

From ac9e455c88763f66835e0967fe19df76d484e2b3 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 26 Apr 2017 20:54:21 -0500
Subject: [PATCH 30/36] DOC Add What's new and polish docstring for OHE

---
 doc/whats_new.rst                        | 12 ++++++++-
 sklearn/preprocessing/data.py            | 33 +++++++++++++-----------
 sklearn/preprocessing/tests/test_data.py |  4 +--
 3 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 9a092310f4924..d86f5fa0cc7ed 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -171,6 +171,16 @@ Enhancements
      removed by setting it to `None`.
      :issue:`7674` by:user:`Yichuan Liu <yl565>`.
 
+   - :class:`preprocessing.OneHotEncoder` now fits and transforms inputs of
+     any numerical or string type instead of only integer arrays.
+     It has addtional fitted attributes ``feature_index_range_`` and
+     ``one_hot_feature_index_``. The ``feature_indices_`` has been deprecated.
+     The ``n_values`` parameter is deprecated in favor of ``values``.
+     In addition to previous allowed values, ``values`` accepts "auto-strict"
+     to fit to only observed categories as well as lists of lists of categories.
+     :issue:`7327` and :issue:`8793` by :user:`Vighnesh Birodkar <vighneshbirodkar>`
+     and :user:`Stephen Hoover <stephen-hoover>`.
+
 Bug fixes
 .........
    - Fixed a bug where :class:`sklearn.ensemble.IsolationForest` uses an
@@ -5070,4 +5080,4 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Anish Shah: https://github.com/AnishShah
 
 .. _Neeraj Gangwar: http://neerajgangwar.in
-.. _Arthur Mensch: https://amensch.fr
\ No newline at end of file
+.. _Arthur Mensch: https://amensch.fr
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index ac705501f3180..1d37521d4e786 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1629,6 +1629,8 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
         Dense array or sparse matrix.
     transform : callable
         A callable transform(X) -> X_transformed
+    dtype : dtype
+        Cast outputs to this data type
     copy : boolean, optional
         Copy X even if it could be avoided.
     selected: "all" or array of indices or mask
@@ -1678,7 +1680,7 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
 
 
 class OneHotEncoder(BaseEstimator, TransformerMixin):
-    """Encode categorical integer features using a one-hot aka one-of-K scheme.
+    """Encode categorical features using a one-hot aka one-of-K scheme.
 
     The input to this transformer should be a matrix of integers or strings,
     denoting the values taken on by categorical (discrete) features. The
@@ -1696,7 +1698,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     Parameters
     ----------
     values : 'auto', 'auto-strict', int, List[int], or List[List[objects]]
-        - 'auto' : Determine set of values from training data.
+        - 'auto' (default) : Determine set of values from training data.
             If values are integers, then allowed values will be between
             0 and the maximum value in the data.
         - 'auto-strict' : Determine set of values from the training data.
@@ -1714,16 +1716,15 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
         Non-categorical features are always stacked to the right of the matrix.
 
-    dtype : number type, default=np.float
+    dtype : number type, default=np.float64
         Desired dtype of output.
 
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.
 
     handle_unknown : str, 'error' or 'ignore'
-        - 'ignore': Ignore all unknown feature values.
-        - 'error': Raise an error when the value of a feature was not
-            in the original fit data (or given through ``values``).
+        Whether to raise an error or ignore if an unknown categorical
+        feature is present during transform.
 
     Attributes
     ----------
@@ -1745,8 +1746,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Examples
     --------
-    Given a dataset with three features and four samples, we let the encoder
-    find the maximum value per feature and transform the data to a binary
+    Given a dataset with two features and three samples, we let the encoder
+    find the categories in each feature and transform the data to a binary
     one-hot encoding.
 
     >>> from sklearn.preprocessing import OneHotEncoder
@@ -1863,21 +1864,22 @@ def _check_values(self, values, n_features):
             # Input "auto": determine values automatically
             if values not in ['auto', 'auto-strict']:
                 raise ValueError(error_msg)
-        elif (isinstance(values, list) or
-                isinstance(values, np.ndarray)):
+        elif isinstance(values, list) or isinstance(values, np.ndarray):
             if len(values) != n_features:
                 raise ValueError("Shape mismatch: if values is a list,"
                                  " it has to be of length (n_features).")
 
-            # All entries are arrays or lists
-            scalar_vals = [np.isscalar(val) for val in values]
-            if any(scalar_vals):
+            # All entries must be either arrays or lists here
+            if any([np.isscalar(val) for val in values]):
                 raise ValueError(error_msg)
         else:
             raise TypeError(error_msg)
 
     def _initialize_values(self):
-        """Standardize the `values` input"""
+        """Standardize the `values` input
+
+        Output is either a string or a list of arrays.
+        """
         if self.n_values is not None:
             warnings.warn('`n_values` has been renamed to `values`.'
                           'The parameter `n_values` has been deprecated '
@@ -1961,7 +1963,8 @@ def transform(self, X, y=None):
         Returns
         -------
         out : array, shape[n_samples, n_features_new]
-            `X` encoded using the one-hot scheme.
+            `X` encoded using the one-hot scheme. Will be a CSR sparse
+            array if `self.sparse` is True.
         """
         if self.handle_unknown not in ['ignore', 'error']:
             template = ("handle_unknown should be either 'error' "
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 3d361d0d8af91..f857fb943898e 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1659,8 +1659,8 @@ def test_one_hot_encoder_dtypes():
     X = np.array([['cat', 2.1, 1], ['dog', 1, 3], ['mouse', 1, 2]], dtype='O')
 
     for dtype in [np.int8, np.float, np.bool]:
-        for sparse in [True, False]:
-            oh = OneHotEncoder('auto-strict', dtype=dtype, sparse=sparse)
+        for sp in [True, False]:
+            oh = OneHotEncoder('auto-strict', dtype=dtype, sparse=sp)
             X_tr = oh.fit_transform(X)
             assert_equal(X_tr.dtype, dtype)
 

From 25250197650faa3e3ac2294eed4fdf7841f0adb7 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 3 May 2017 19:42:53 -0500
Subject: [PATCH 31/36] Deprecate active_features_

---
 sklearn/preprocessing/data.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 1d37521d4e786..56a1ff3ada579 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1736,10 +1736,6 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         ``one_hot_feature_index_[i]`` specifies which feature of the input
         is encoded by column `i` in the one-hot encoded array.
 
-    active_features_ : array
-        Indices for active features, meaning values that actually occur
-        in the training set. Only available when n_values is ``'auto'``.
-
     n_values_ : array of shape (n_features,)
         Number of categories per feature. Has value `0` for
         non-categorical features.
@@ -1824,7 +1820,7 @@ def fit(self, X, y=None):
         start, end = 0, 0
         for i_cat, i_feat in enumerate(np.where(categorical)[0]):
             if np.isscalar(self._values) and self._values == 'auto':
-                end = start + self.n_active_features_[i_cat]
+                end = start + self._n_active_features_[i_cat]
             else:
                 end = start + len(self._label_encoders[i_cat].classes_)
             self.feature_index_range_[i_feat] = start, end
@@ -1948,9 +1944,9 @@ def _fit(self, X):
                     this_col_mask = np.zeros(n_classes, dtype=bool)
                     this_col_mask[int_classes] = True
                     active_features.append(this_col_mask)
-            self.n_active_features_ = np.array([a.sum()
-                                                for a in active_features])
-            self.active_features_ = np.where(np.hstack(active_features))[0]
+            self._n_active_features_ = np.array([a.sum()
+                                                 for a in active_features])
+            self._active_features_ = np.where(np.hstack(active_features))[0]
 
     def transform(self, X, y=None):
         """Encode the selected categorical features using the one-hot scheme.
@@ -2022,10 +2018,19 @@ def _transform(self, X):
                                 dtype=self.dtype).tocsr()
 
         if np.isscalar(self._values) and self._values == 'auto':
-            out = out[:, self.active_features_]
+            out = out[:, self._active_features_]
 
         return out if self.sparse else out.toarray()
 
+    @property
+    def active_features_(self):
+        warnings.warn('The property `active_features_` is deprecated and'
+                      ' will be removed in version 0.21')
+        if not hasattr(self, '_active_features_'):
+            raise AttributeError("'OneHotEncoder' object has no attribute "
+                                 "'active_features_'.")
+        return self._active_features_
+
     @property
     def feature_indices_(self):
         # This is very similar to the current attribute

From 7a53fe843ca776a23796279aafac768fc56e0191 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 3 May 2017 19:56:56 -0500
Subject: [PATCH 32/36] Switch from auto-strict to error-strict

---
 sklearn/preprocessing/data.py            | 50 +++++++++++++-----------
 sklearn/preprocessing/tests/test_data.py | 18 +++++----
 2 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 56a1ff3ada579..3a40fdb5c4dd5 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1697,12 +1697,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    values : 'auto', 'auto-strict', int, List[int], or List[List[objects]]
+    values : 'auto', int, List[int], or List[List[objects]]
         - 'auto' (default) : Determine set of values from training data.
-            If values are integers, then allowed values will be between
-            0 and the maximum value in the data.
-        - 'auto-strict' : Determine set of values from the training data.
-            Only values in the original training data are valid.
         - int : values are in ``range(values)`` for all features
         - list of ints : values for feature ``i`` are in ``range(values[i])``
         - list of lists : values for feature ``i`` are in ``values[i]``
@@ -1722,9 +1718,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.
 
-    handle_unknown : str, 'error' or 'ignore'
-        Whether to raise an error or ignore if an unknown categorical
-        feature is present during transform.
+    handle_unknown : {'error', 'error-strict', 'ignore'}
+        - 'ignore': Ignore all unknown feature values.
+        - 'error': Raise an error when the value of an integer feature is more
+            than the maximum value seen during fit or less than zero, or when
+            the value of a non-integer feature was unseen during ``fit``.
+        - 'error-strict': Raise an error when the value of a feature is unseen
+            during ``fit``.
 
     Attributes
     ----------
@@ -1799,6 +1799,15 @@ def fit(self, X, y=None):
         -------
         self
         """
+        if self.handle_unknown not in ['ignore', 'error', 'error-strict']:
+            template = ("handle_unknown should be either 'error', "
+                        "'error-strict', or 'ignore', got %s")
+            raise ValueError(template % self.handle_unknown)
+        elif self.handle_unknown == 'error':
+            warnings.warn('The behavior of handle_unknown="error" is '
+                          'deprecated and will be changed to be the same '
+                          'as "error-strict" in version 0.21')
+
         X = check_array(X, dtype=None, accept_sparse='csc', copy=False)
         n_samples, n_features = X.shape
         self.n_features_ = n_features
@@ -1819,7 +1828,7 @@ def fit(self, X, y=None):
 
         start, end = 0, 0
         for i_cat, i_feat in enumerate(np.where(categorical)[0]):
-            if np.isscalar(self._values) and self._values == 'auto':
+            if np.isscalar(self._values) and self.handle_unknown == 'error':
                 end = start + self._n_active_features_[i_cat]
             else:
                 end = start + len(self._label_encoders[i_cat].classes_)
@@ -1854,11 +1863,10 @@ def _check_values(self, values, n_features):
         converted to lists of arrays before getting here.
         This should run after `_initialize_values`.
         """
-        error_msg = ("`values` should be 'auto', 'auto-strict', an integer, "
+        error_msg = ("`values` should be 'auto', an integer, "
                      "a list of integers or a list of list")
         if isinstance(values, six.string_types):
-            # Input "auto": determine values automatically
-            if values not in ['auto', 'auto-strict']:
+            if values != 'auto':
                 raise ValueError(error_msg)
         elif isinstance(values, list) or isinstance(values, np.ndarray):
             if len(values) != n_features:
@@ -1910,7 +1918,7 @@ def _fit(self, X):
         for i in range(n_features):
             le = self._label_encoders[i]
 
-            if np.isscalar(self._values) and self._values == 'auto':
+            if np.isscalar(self._values) and self.handle_unknown == 'error':
                 # For integer features, allow integers between
                 # 0 and column max. The transform will still only
                 # return dummy columns for integers present in training data.
@@ -1920,18 +1928,19 @@ def _fit(self, X):
                     if np.min(_auto_int_classes[i]) < 0:
                         msg = ('Column %s has value(s) less than zero; all '
                                'integer columns must have minimum value '
-                               '0 when value="auto".')
+                               '0 when value="auto" and '
+                               'handle_unknown="error".')
                         raise ValueError(msg)
                     n_classes = np.max(_auto_int_classes[i]) + 1
                     le.fit(np.arange(n_classes))
                 else:
                     le.fit(X[:, i])
-            elif np.isscalar(self._values) and self._values == 'auto-strict':
+            elif np.isscalar(self._values):
                 le.fit(X[:, i])
             else:
                 le.fit(self._values[i])
 
-        if np.isscalar(self._values) and self._values == 'auto':
+        if np.isscalar(self._values) and self.handle_unknown == 'error':
             # Record which integer features were present in training
             # data so we can restrict output columns.
             active_features = []
@@ -1962,11 +1971,6 @@ def transform(self, X, y=None):
             `X` encoded using the one-hot scheme. Will be a CSR sparse
             array if `self.sparse` is True.
         """
-        if self.handle_unknown not in ['ignore', 'error']:
-            template = ("handle_unknown should be either 'error' "
-                        "or 'ignore', got %s")
-            raise ValueError(template % self.handle_unknown)
-
         X = check_array(X, accept_sparse='csc', dtype=None, copy=False)
         if X.shape[1] != self.n_features_:
             raise ValueError("Input data must have %s "
@@ -1981,7 +1985,7 @@ def _transform(self, X):
         X_int = np.zeros_like(X, dtype=np.int32)
 
         # Recode all columns of input data as integers
-        if self.handle_unknown == 'error':
+        if self.handle_unknown in ['error', 'error-strict']:
             for i, le in enumerate(self._label_encoders):
                 try:
                     X_int[:, i] = le.transform(X[:, i])
@@ -2017,7 +2021,7 @@ def _transform(self, X):
                                 shape=(n_samples, indices[-1]),
                                 dtype=self.dtype).tocsr()
 
-        if np.isscalar(self._values) and self._values == 'auto':
+        if np.isscalar(self._values) and self.handle_unknown == 'error':
             out = out[:, self._active_features_]
 
         return out if self.sparse else out.toarray()
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index f857fb943898e..628db2d4f0d08 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1526,7 +1526,8 @@ def test_one_hot_encoder_attr():
     assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 1, 1, 2, 2])
     assert_array_equal(enc.n_values_, [11, 16, 2])
 
-    oh = OneHotEncoder('auto-strict', categorical_features=[True, False, True])
+    oh = OneHotEncoder('auto', handle_unknown='error-strict',
+                       categorical_features=[True, False, True])
     oh.fit(X)
     assert_array_equal(oh.feature_index_range_, [[0, 3], [5, 6], [3, 5]])
     assert_array_equal(oh.one_hot_feature_index_, [0, 0, 0, 2, 2, 1])
@@ -1660,7 +1661,8 @@ def test_one_hot_encoder_dtypes():
 
     for dtype in [np.int8, np.float, np.bool]:
         for sp in [True, False]:
-            oh = OneHotEncoder('auto-strict', dtype=dtype, sparse=sp)
+            oh = OneHotEncoder('auto', handle_unknown='error-strict',
+                               dtype=dtype, sparse=sp)
             X_tr = oh.fit_transform(X)
             assert_equal(X_tr.dtype, dtype)
 
@@ -1681,8 +1683,8 @@ def test_one_hot_encoder_unknown_transform_int():
     y = [[0, 1, 1]]
     assert_array_equal(toarray(oh.transform(y)), [[1,  0,  0,  0,  1,  0,  0]])
 
-    # But we do error when fit with "auto-strict"
-    oh = OneHotEncoder(values='auto-strict', handle_unknown='error')
+    # But we do error when set to "error-strict"
+    oh = OneHotEncoder(values='auto', handle_unknown='error-strict')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
 
@@ -1713,8 +1715,8 @@ def test_one_hot_encoder_unknown_transform_object():
     assert_array_equal(X, X_orig)
 
     # Raise error if handle_unknown is neither ignore nor error.
-    oh = OneHotEncoder(handle_unknown='42').fit(X)
-    assert_raises(ValueError, oh.transform, y)
+    oh = OneHotEncoder(handle_unknown='42')
+    assert_raises(ValueError, oh.fit, X)
     assert_array_equal(X, X_orig)
 
     # Check that in-range integer features are okay in object arrays
@@ -1727,8 +1729,8 @@ def test_one_hot_encoder_unknown_transform_object():
     oh = OneHotEncoder(handle_unknown='error').fit(X)
     assert_raises(ValueError, oh.transform, y)
 
-    # A transform on in-range integers errors in 'auto-strict' mode.
-    oh = OneHotEncoder(values='auto-strict', handle_unknown='error').fit(X)
+    # A transform on in-range integers errors in 'error-strict' mode.
+    oh = OneHotEncoder(values='auto', handle_unknown='error-strict').fit(X)
     assert_raises(ValueError, oh.transform, y)
 
 

From 05af448450ed3a494493d9b67a0d3e27d8b98de5 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 3 May 2017 20:04:48 -0500
Subject: [PATCH 33/36] Deprecate integer and list of integer inputs to
 `values`

---
 sklearn/preprocessing/data.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 3a40fdb5c4dd5..c120be65bb73f 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1697,10 +1697,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    values : 'auto', int, List[int], or List[List[objects]]
+    values : 'auto' or List[List[objects]]
         - 'auto' (default) : Determine set of values from training data.
-        - int : values are in ``range(values)`` for all features
-        - list of ints : values for feature ``i`` are in ``range(values[i])``
         - list of lists : values for feature ``i`` are in ``values[i]``
 
     categorical_features : "all" or array of indices or mask
@@ -1887,9 +1885,9 @@ def _initialize_values(self):
         if self.n_values is not None:
             warnings.warn('`n_values` has been renamed to `values`.'
                           'The parameter `n_values` has been deprecated '
-                          'and will be removed in version 0.21, use the'
+                          'and will be removed in version 0.21; use the '
                           'parameter `values` instead and specify the '
-                          'expected values for each feature')
+                          'expected values for each feature.')
             values = self.n_values
         else:
             values = self.values
@@ -1897,9 +1895,15 @@ def _initialize_values(self):
         # Convert `int` and `Sequence[int]` inputs to `List[Array[int]]`
         if (not isinstance(values, six.string_types) and
                 np.isscalar(values)):
+            warnings.warn('Integer input to `values` is deprecated and'
+                          ' will be removed in version 0.21. Specify a '
+                          'list of allowed values for each feature instead.')
             values = np.ones(self.n_features_cat_, dtype=int) * values
         if (not isinstance(values, six.string_types) and
                 np.isscalar(values[0])):
+            warnings.warn('List of integer input to `values` is deprecated and'
+                          ' will be removed in version 0.21. Specify a '
+                          'list of allowed values for each feature instead.')
             values = [np.arange(v, dtype=np.int) for v in values]
 
         return values

From d9d77aed69a4c58b640f4aa500bc189101c2275a Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 3 May 2017 21:58:25 -0500
Subject: [PATCH 34/36] Address CR

---
 doc/modules/preprocessing.rst            | 24 ++++-----
 doc/whats_new.rst                        | 23 ++++++---
 sklearn/preprocessing/data.py            | 62 +++++++++++++++---------
 sklearn/preprocessing/tests/test_data.py | 58 ++++++++++++++++++----
 4 files changed, 116 insertions(+), 51 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index cf5c312eb5e06..bf76d499b464b 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -378,10 +378,10 @@ Encoding categorical features
 Often features are not given as continuous values but categorical.
 For example a person could have features ``["male", "female"]``,
 ``["from Europe", "from US", "from Asia"]``,
-``["uses Firefox", "uses Chrome", "uses Safari", "uses Internet Explorer"]``.
+``["Firefox", "Chrome", "Safari", "Internet Explorer"]``.
 Such features can be efficiently coded as integers, for instance
-``["male", "from US", "uses Internet Explorer"]`` could be expressed as
-``[0, 1, 3]`` while ``["female", "from Asia", "uses Chrome"]`` would be
+``["male", "from US", "Internet Explorer"]`` could be expressed as
+``[0, 1, 3]`` while ``["female", "from Asia", "Chrome"]`` would be
 ``[1, 2, 1]``.
 
 Such integer representation can not be used directly with scikit-learn estimators, as these
@@ -397,13 +397,13 @@ only one active.
 Continuing the example above::
 
   >>> enc = preprocessing.OneHotEncoder()
-  >>> enc.fit([['female', 'from US', 'uses Chrome'],
-  ... ['male', 'from Asia', 'uses Firefox']])  \
+  >>> enc.fit([['female', 'from US', 'Chrome'],
+  ... ['male', 'from Asia', 'Firefox']])  \
   ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
   OneHotEncoder(categorical_features='all',
          dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
          sparse=True, values='auto')
-  >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray()
+  >>> enc.transform([['female', 'from Asia', 'Firefox']]).toarray()
   array([[ 1.,  0.,  1.,  0.,  0.,  1.]])
 
 By default, how many values each feature can take is inferred automatically from the dataset.
@@ -417,21 +417,21 @@ values are the continents and the last values are web browsers.
 Note that, if there is a possibilty that the training data might have missing categorical
 features, one has to explicitly set ``n_values``. For example,
 
-    >>> browsers = ['uses Internet Explorer', 'uses Chrome' , 'uses Safari', 'uses Firefox']
+    >>> browsers = ['Internet Explorer', 'Chrome' , 'Safari', 'Firefox']
     >>> genders = ['male', 'female']
     >>> locations = ['from Europe', 'from Asia', 'from US']
     >>> enc = preprocessing.OneHotEncoder(values=[genders, locations, browsers])
-    >>> # Note that for there are missing categorical values for the 2nd and 3rd
-    >>> # feature
-    >>> enc.fit([['female', 'from US', 'uses Chrome'],
-    ... ['male', 'from Asia', 'uses Internet Explorer']]) \
+    >>> # Note that there are missing categorical values for the
+    >>> # 2nd and 3rd feature
+    >>> enc.fit([['female', 'from US', 'Chrome'],
+    ... ['male', 'from Asia', 'Internet Explorer']]) \
     ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features='all',
            dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True,
            values=[...])
 
-    >>> enc.transform([['male', 'from Europe', 'uses Safari']]).toarray()
+    >>> enc.transform([['male', 'from Europe', 'Safari']]).toarray()
     array([[ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
 
 See :ref:`dict_feature_extraction` for categorical features that are represented
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index d86f5fa0cc7ed..bd3a5def36675 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -173,13 +173,13 @@ Enhancements
 
    - :class:`preprocessing.OneHotEncoder` now fits and transforms inputs of
      any numerical or string type instead of only integer arrays.
-     It has addtional fitted attributes ``feature_index_range_`` and
-     ``one_hot_feature_index_``. The ``feature_indices_`` has been deprecated.
-     The ``n_values`` parameter is deprecated in favor of ``values``.
-     In addition to previous allowed values, ``values`` accepts "auto-strict"
-     to fit to only observed categories as well as lists of lists of categories.
-     :issue:`7327` and :issue:`8793` by :user:`Vighnesh Birodkar <vighneshbirodkar>`
-     and :user:`Stephen Hoover <stephen-hoover>`.
+     It has additional fitted attributes ``feature_index_range_``,
+     ``one_hot_feature_index_``, and ``categories_``.
+     In addition to previous allowed values, ``handle_unknown`` accepts "error-strict"
+     to error if any unknown values are seen during transformation.
+     :issue:`7327` and :issue:`8793` by
+     :user:`Vighnesh Birodkar <vighneshbirodkar>` and
+     :user:`Stephen Hoover <stephen-hoover>`.
 
 Bug fixes
 .........
@@ -339,6 +339,15 @@ API changes summary
      the weighted impurity decrease from splitting is no longer alteast
      ``min_impurity_decrease``.  :issue:`8449` by `Raghav RV_`
 
+   - In :class:`preprocessing.OneHotEncoder`, deprecate the
+     ``feature_indices_`` and ``active_features_`` attributes.
+     Deprecate integer and list of integer inputs to ``values``
+     in favor of lists of lists of categories.
+     The present behavior of ``handle_unknown="error"`` will
+     change to be the same as ``handle_unknown="error-strict"`` in v0.21.
+     :issue:`7327` and :issue:`8793` by
+     :user:`Vighnesh Birodkar <vighneshbirodkar>` and
+     :user:`Stephen Hoover <stephen-hoover>`.
 
 .. _changes_0_18_1:
 
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index c120be65bb73f..d02b9015408f7 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1698,7 +1698,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     Parameters
     ----------
     values : 'auto' or List[List[objects]]
-        - 'auto' (default) : Determine set of values from training data.
+        - 'auto' (default) : Encoded values are those found in training data.
         - list of lists : values for feature ``i`` are in ``values[i]``
 
     categorical_features : "all" or array of indices or mask
@@ -1726,16 +1726,20 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    feature_index_range_ : array, shape [n_feature, 2]
+    feature_index_range_ : array, shape (n_feature, 2)
         ``feature_index_range_[i]`` specifies the range of column indices
         occupied by the input feature `i` in the one-hot encoded array.
 
-    one_hot_feature_index_ : array, shape [n_features_new]
+    one_hot_feature_index_ : array, shape (n_features_new,)
         ``one_hot_feature_index_[i]`` specifies which feature of the input
-        is encoded by column `i` in the one-hot encoded array.
+        is encoded by column ``i`` in the one-hot encoded array.
+
+    categories_ : array, shape (n_features_new,)
+        np.object array containing the category encoded in each feature
+        of the output (or None for non-categorical features)
 
     n_values_ : array of shape (n_features,)
-        Number of categories per feature. Has value `0` for
+        Number of encoded categories per feature. Has value `0` for
         non-categorical features.
 
     Examples
@@ -1752,12 +1756,14 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
            dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True, values='auto')
     >>> enc.n_values_
-    array([ 3, 18])
+    array([ 3, 3])
     >>> enc.feature_index_range_
     array([[0, 3],
            [3, 6]])
     >>> enc.one_hot_feature_index_
     array([0, 0, 0, 1, 1, 1])
+    >>> enc.categories_
+    array(['cat', 'dog', 'mouse', 4, 15, 17], dtype=object)
     >>> enc.transform([['dog', 4]]).toarray()
     array([[ 0.,  1.,  0.,  1.,  0.,  0.]])
 
@@ -1804,7 +1810,7 @@ def fit(self, X, y=None):
         elif self.handle_unknown == 'error':
             warnings.warn('The behavior of handle_unknown="error" is '
                           'deprecated and will be changed to be the same '
-                          'as "error-strict" in version 0.21')
+                          'as "error-strict" in version 0.21', FutureWarning)
 
         X = check_array(X, dtype=None, accept_sparse='csc', copy=False)
         n_samples, n_features = X.shape
@@ -1832,24 +1838,29 @@ def fit(self, X, y=None):
                 end = start + len(self._label_encoders[i_cat].classes_)
             self.feature_index_range_[i_feat] = start, end
             start = end
-        num_cat = np.sum(categorical)
-        non_cat_indices = np.arange(start, start + n_features - num_cat)
+        num_cat_cols = np.sum(categorical)
+        non_cat_indices = np.arange(start, start + n_features - num_cat_cols)
         self.feature_index_range_[~categorical, 0] = non_cat_indices
         self.feature_index_range_[~categorical, 1] = non_cat_indices + 1
 
         # Record which column of input data corresponds
         # to each column of output data
-        n_expanded_cols = end + n_features - num_cat
-        self.one_hot_feature_index_ = np.empty(n_expanded_cols, dtype=np.int)
-        for i in range(n_features):
-            s, e = self.feature_index_range_[i]
-            self.one_hot_feature_index_[s:e] = i
+        n_cats = np.diff(self.feature_index_range_, axis=1).ravel()
+        inp_order = np.argsort(self.feature_index_range_[:, 0])
+        self.one_hot_feature_index_ = np.repeat(inp_order, n_cats[inp_order])
 
         # Count categories per feature
-        n_val = len(non_cat_indices) * [0]
-        if hasattr(self, '_label_encoders'):
-            n_val = [len(le.classes_) for le in self._label_encoders] + n_val
-        self.n_values_ = np.array(n_val)
+        self.n_values_ = n_cats.copy()
+        self.n_values_[~categorical] = 0
+
+        # Store categories for each output feature
+        if num_cat_cols == 0:
+            cats = []
+        else:
+            cats = np.concatenate([le.classes_ for le in self._label_encoders])
+        if hasattr(self, '_active_features_'):
+            cats = cats[self._active_features_]
+        self.categories_ = np.hstack([cats, len(non_cat_indices) * [None]])
 
         return self
 
@@ -1887,7 +1898,7 @@ def _initialize_values(self):
                           'The parameter `n_values` has been deprecated '
                           'and will be removed in version 0.21; use the '
                           'parameter `values` instead and specify the '
-                          'expected values for each feature.')
+                          'expected values for each feature.', FutureWarning)
             values = self.n_values
         else:
             values = self.values
@@ -1897,13 +1908,15 @@ def _initialize_values(self):
                 np.isscalar(values)):
             warnings.warn('Integer input to `values` is deprecated and'
                           ' will be removed in version 0.21. Specify a '
-                          'list of allowed values for each feature instead.')
+                          'list of allowed values for each feature instead.',
+                          FutureWarning)
             values = np.ones(self.n_features_cat_, dtype=int) * values
         if (not isinstance(values, six.string_types) and
                 np.isscalar(values[0])):
             warnings.warn('List of integer input to `values` is deprecated and'
                           ' will be removed in version 0.21. Specify a '
-                          'list of allowed values for each feature instead.')
+                          'list of allowed values for each feature instead.',
+                          FutureWarning)
             values = [np.arange(v, dtype=np.int) for v in values]
 
         return values
@@ -2033,7 +2046,7 @@ def _transform(self, X):
     @property
     def active_features_(self):
         warnings.warn('The property `active_features_` is deprecated and'
-                      ' will be removed in version 0.21')
+                      ' will be removed in version 0.21', FutureWarning)
         if not hasattr(self, '_active_features_'):
             raise AttributeError("'OneHotEncoder' object has no attribute "
                                  "'active_features_'.")
@@ -2045,6 +2058,9 @@ def feature_indices_(self):
         # `feature_index_range_`, but only applies to the
         # subset of categorical features.
         warnings.warn('The property `feature_indices_` is deprecated and'
-                      ' will be removed in version 0.21')
+                      ' will be removed in version 0.21', FutureWarning)
+        if not hasattr(self, '_label_encoders'):
+            raise AttributeError("'OneHotEncoder' object has no attribute "
+                                 "'feature_indices_'.")
         n_categories = [len(le.classes_) for le in self._label_encoders]
         return np.cumsum([0] + n_categories)
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 628db2d4f0d08..d526b842c961d 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -26,6 +26,7 @@
 from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_false
+from sklearn.utils.testing import assert_warns
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import assert_allclose
@@ -1507,7 +1508,7 @@ def test_one_hot_encoder_sparse():
     assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)
 
 
-def test_one_hot_encoder_error_on_negative():
+def test_one_hot_encoder_with_negative_integers():
     # Negative numerical values in inputs should raise an exception
     X_bad = np.array([[-1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object)
     X_good = np.array([[1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object)
@@ -1516,6 +1517,9 @@ def test_one_hot_encoder_error_on_negative():
     ohe = OneHotEncoder().fit(X_good)
     assert_raises(ValueError, ohe.transform, X_bad)
 
+    # Negative values are okay with "error-strict"
+    OneHotEncoder(handle_unknown='error-strict').fit_transform(X_bad)
+
 
 def test_one_hot_encoder_attr():
     X = np.array([[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]], dtype='O')
@@ -1524,20 +1528,56 @@ def test_one_hot_encoder_attr():
     enc.fit(X)
     assert_array_equal(enc.feature_index_range_, [[0, 3], [3, 5], [5, 7]])
     assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 1, 1, 2, 2])
-    assert_array_equal(enc.n_values_, [11, 16, 2])
+    assert_array_equal(enc.n_values_, [3, 2, 2])
+    assert_array_equal(enc.categories_,
+                       np.array([1, 5, 10, 7, 15, 'cat', 'mouse'], dtype='O'))
 
-    oh = OneHotEncoder('auto', handle_unknown='error-strict',
-                       categorical_features=[True, False, True])
-    oh.fit(X)
-    assert_array_equal(oh.feature_index_range_, [[0, 3], [5, 6], [3, 5]])
-    assert_array_equal(oh.one_hot_feature_index_, [0, 0, 0, 2, 2, 1])
-    assert_array_equal(oh.n_values_, [3, 2, 0])
+    enc = OneHotEncoder('auto', handle_unknown='error-strict',
+                        categorical_features=[True, False, True])
+    enc.fit(X)
+    assert_array_equal(enc.feature_index_range_, [[0, 3], [5, 6], [3, 5]])
+    assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 2, 2, 1])
+    assert_array_equal(enc.n_values_, [3, 0, 2])
+    assert_array_equal(enc.categories_,
+                       np.array([1, 5, 10, 'cat', 'mouse', None], dtype='O'))
 
     enc = OneHotEncoder(categorical_features=[False, False, True])
     enc.fit(X)
     assert_array_equal(enc.feature_index_range_, [[2, 3], [3, 4], [0, 2]])
     assert_array_equal(enc.one_hot_feature_index_, [2, 2, 0, 1])
-    assert_array_equal(enc.n_values_, [2, 0, 0])
+    assert_array_equal(enc.n_values_, [0, 0, 2])
+    assert_array_equal(enc.categories_,
+                       np.array(['cat', 'mouse', None, None], dtype='O'))
+
+
+def test_one_hot_encoder_deprecations():
+    # Check that deprecated features raise warnings
+    X = [[3, 2, 1], [0, 1, 1]]
+
+    # `handle_unknown`="error" will change in v0.21
+    ohe = OneHotEncoder(handle_unknown='error')
+    assert_warns(FutureWarning, ohe.fit, X)
+
+    # `n_values` is deprecated
+    ohe = OneHotEncoder(n_values='auto', handle_unknown='ignore')
+    assert_warns(FutureWarning, ohe.fit, X)
+
+    # Integer input for `values` is deprecated
+    ohe = OneHotEncoder(values=5, handle_unknown='ignore')
+    assert_warns(FutureWarning, ohe.fit, X)
+
+    # List of integer input for `values` is deprecated
+    ohe = OneHotEncoder(values=[5, 5, 5], handle_unknown='ignore')
+    assert_warns(FutureWarning, ohe.fit, X)
+
+    # `active_features_` is deprecated (and is only available
+    # when `handle_unknown`="error")
+    ohe = OneHotEncoder(handle_unknown='error').fit(X)
+    assert_warns(FutureWarning, getattr, ohe, 'active_features_')
+
+    # `feature_indices_` is deprecated
+    ohe = OneHotEncoder(handle_unknown='ignore').fit(X)
+    assert_warns(FutureWarning, getattr, ohe, 'feature_indices_')
 
 
 def test_one_hot_encoder_dense():

From 840382e49d280fe1db46bb48b6a66bc305159eb8 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 3 May 2017 22:21:15 -0500
Subject: [PATCH 35/36] Fix whitespace in doc test

---
 sklearn/preprocessing/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index d02b9015408f7..bbe8f54c260b1 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1756,7 +1756,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
            dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
            sparse=True, values='auto')
     >>> enc.n_values_
-    array([ 3, 3])
+    array([3, 3])
     >>> enc.feature_index_range_
     array([[0, 3],
            [3, 6]])

From ff4b30bfc27b931c219ed13b466afab66b4f8aff Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 3 May 2017 22:56:50 -0500
Subject: [PATCH 36/36] Fix doctest for Python 2.7

---
 sklearn/preprocessing/data.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index bbe8f54c260b1..ee21c6726e620 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1762,8 +1762,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
            [3, 6]])
     >>> enc.one_hot_feature_index_
     array([0, 0, 0, 1, 1, 1])
-    >>> enc.categories_
-    array(['cat', 'dog', 'mouse', 4, 15, 17], dtype=object)
+    >>> (enc.categories_ ==
+    ...  np.array(['cat', 'dog', 'mouse', 4, 15, 17], dtype='O')).all()
+    True
     >>> enc.transform([['dog', 4]]).toarray()
     array([[ 0.,  1.,  0.,  1.,  0.,  0.]])