From ea98484484fe76d55a642e58aaa8ec8013ff06b7 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Tue, 29 Mar 2016 22:30:15 -0400 Subject: [PATCH 01/36] Refactored OneHotEncoder to work with strings --- doc/modules/preprocessing.rst | 34 ++- sklearn/preprocessing/data.py | 346 +++++++++++++---------- sklearn/preprocessing/tests/test_data.py | 66 +++-- 3 files changed, 264 insertions(+), 182 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 709239687158e..24df41f2966fa 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -397,31 +397,37 @@ only one active. Continuing the example above:: >>> enc = preprocessing.OneHotEncoder() - >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS + >>> enc.fit([['female', 'from US', 'uses Chrome'], + ... ['male', 'from Asia', 'uses Firefox']]) # doctest: +ELLIPSIS OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, - handle_unknown='error', n_values='auto', sparse=True) - >>> enc.transform([[0, 1, 3]]).toarray() - array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]]) + handle_unknown='error', n_values=None, sparse=True, values='auto') + >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray() + array([[ 1., 0., 1., 0., 0., 1.]]) By default, how many values each feature can take is inferred automatically from the dataset. -It is possible to specify this explicitly using the parameter ``n_values``. +It is possible to specify this explicitly using the parameter ``values``. There are two genders, three possible continents and four web browsers in our dataset. Then we fit the estimator, and transform a data point. -In the result, the first two numbers encode the gender, the next set of three -numbers the continent and the last four the web browser. +In the result, the first two values are genders, the next set of three +values are the continents and the last values are web browsers. 
Note that, if there is a possibilty that the training data might have missing categorical features, one has to explicitly set ``n_values``. For example, - >>> enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4]) - >>> # Note that there are missing categorical values for the 2nd and 3rd - >>> # features - >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS + >>> browsers = ['uses Internet Explorer', 'uses Chrome' , 'uses Safari', 'uses Firefox'] + >>> genders = ['male', 'female'] + >>> locations = ['from Europe', 'from Asia', 'from US'] + >>> enc = preprocessing.OneHotEncoder(values=[genders, locations, browsers]) + >>> # Note that there are missing categorical values for the 2nd and 3rd + >>> # feature + >>> enc.fit([['female', 'from US', 'uses Chrome'], + ... ['male', 'from Asia', 'uses Internet Explorer']]) # doctest: +ELLIPSIS OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, - handle_unknown='error', n_values=[2, 3, 4], sparse=True) - >>> enc.transform([[1, 0, 0]]).toarray() - array([[ 0., 1., 1., 0., 0., 1., 0., 0., 0.]]) + handle_unknown='error', n_values=None, sparse=True, + values=[...]) + >>> enc.transform([['male', 'from Europe', 'uses Safari']]).toarray() + array([[ 0., 1., 0., 1., 0., 0., 0., 0., 1.]]) See :ref:`dict_feature_extraction` for categorical features that are represented as a dict, not as integers. 
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 093137d078000..1fe0741d9db13 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -26,6 +26,8 @@ mean_variance_axis, incr_mean_variance_axis, min_max_axis) from ..utils.validation import check_is_fitted, FLOAT_DTYPES +from .label import LabelEncoder +from ..utils.fixes import np_version zip = six.moves.zip @@ -1618,28 +1620,29 @@ def add_dummy_feature(X, value=1.0): return np.hstack((np.ones((n_samples, 1)) * value, X)) -def _transform_selected(X, transform, selected="all", copy=True): - """Apply a transform function to portion of selected features - +def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, + return_val=True): + """Apply a function to portion of selected features Parameters ---------- - X : {array-like, sparse matrix}, shape [n_samples, n_features] + X : {array, sparse matrix}, shape [n_samples, n_features] Dense array or sparse matrix. - transform : callable A callable transform(X) -> X_transformed - copy : boolean, optional Copy X even if it could be avoided. - selected: "all" or array of indices or mask Specify which features to apply the transform to. - + return_val : boolean, optional + Whether to return the transformed matrix. If not set `None` is + returned. 
Returns ------- - X : array or sparse matrix, shape=(n_samples, n_features_new) + X : array or sparse matrix, shape=(n_samples, n_features_new) """ - X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES) + + if copy: + X = X.copy() if isinstance(selected, six.string_types) and selected == "all": return transform(X) @@ -1662,22 +1665,22 @@ def _transform_selected(X, transform, selected="all", copy=True): return transform(X) else: X_sel = transform(X[:, ind[sel]]) - X_not_sel = X[:, ind[not_sel]] + X_not_sel = X[:, ind[not_sel]].astype(dtype) - if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): - return sparse.hstack((X_sel, X_not_sel)) - else: - return np.hstack((X_sel, X_not_sel)) + if return_val: + if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): + return sparse.hstack((X_sel, X_not_sel)) + else: + return np.hstack((X_sel, X_not_sel)) class OneHotEncoder(BaseEstimator, TransformerMixin): """Encode categorical integer features using a one-hot aka one-of-K scheme. - The input to this transformer should be a matrix of integers, denoting - the values taken on by categorical (discrete) features. The output will be - a sparse matrix where each column corresponds to one possible value of one - feature. It is assumed that input features take on values in the range - [0, n_values). + The input to this transformer should be a matrix of integers or strings, + denoting the values taken on by categorical (discrete) features. The + output will be a sparse matrix where each column corresponds to one + possible value of one feature. This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. @@ -1689,15 +1692,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - n_values : 'auto', int or array of ints - Number of values per feature. - - - 'auto' : determine value range from training data. 
- - int : number of categorical values per feature. - Each feature value should be in ``range(n_values)`` - - array : ``n_values[i]`` is the number of categorical values in - ``X[:, i]``. Each feature value should be - in ``range(n_values[i])`` + values : 'auto', int, list of ints, or list of lists of objects + - 'auto' : determine set of values from training data. + - int : values are in ``range(values)`` for all features + - list of ints : values for feature ``i`` are in ``range(values[i])`` + - list of lists : values for feature ``i`` are in ``values[i]`` categorical_features : "all" or array of indices or mask Specify what features are treated as categorical. @@ -1720,18 +1719,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Attributes ---------- - active_features_ : array - Indices for active features, meaning values that actually occur - in the training set. Only available when n_values is ``'auto'``. - - feature_indices_ : array of shape (n_features,) - Indices to feature ranges. - Feature ``i`` in the original data is mapped to features - from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - (and then potentially masked by `active_features_` afterwards) - - n_values_ : array of shape (n_features,) - Maximum number of values per feature. + label_encoders_ : list of size n_features. + The :class:`sklearn.preprocessing.LabelEncoder` objects used to encode + the features. ``self.label_encoders_[i]`` is the LabelEncoder object + used to encode the ith column. The unique features found on column + ``i`` can be accessed using ``self.label_encoders_[i].classes_``. Examples -------- @@ -1741,16 +1733,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import OneHotEncoder >>> enc = OneHotEncoder() - >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ -[1, 0, 2]]) # doctest: +ELLIPSIS + >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) # doctest: +ELLIPSIS OneHotEncoder(categorical_features='all', dtype=<... 
'numpy.float64'>, - handle_unknown='error', n_values='auto', sparse=True) - >>> enc.n_values_ - array([2, 3, 4]) - >>> enc.feature_indices_ - array([0, 2, 5, 9]) - >>> enc.transform([[0, 1, 1]]).toarray() - array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) + handle_unknown='error', n_values=None, sparse=True, values='auto') + >>> list(enc.label_encoders_[0].classes_) + ['cat', 'dog', 'mouse'] + >>> enc.transform([['dog', 4]]).toarray() + array([[ 0., 1., 0., 1., 0., 0.]]) See also -------- @@ -1766,138 +1755,207 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 and n_classes-1. """ - def __init__(self, n_values="auto", categorical_features="all", - dtype=np.float64, sparse=True, handle_unknown='error'): - self.n_values = n_values + + def __init__(self, categorical_features="all", n_values=None, + values='auto', dtype=np.float64, sparse=True, + handle_unknown='error'): self.categorical_features = categorical_features self.dtype = dtype self.sparse = sparse self.handle_unknown = handle_unknown + self.n_values = n_values + self.values = values def fit(self, X, y=None): - """Fit OneHotEncoder to X. + """Fit the CategoricalEncoder to X. Parameters ---------- X : array-like, shape [n_samples, n_feature] - Input array of type int. + Array of ints or strings or both. Returns ------- self """ - self.fit_transform(X) + + X = check_array(X, dtype=np.object, accept_sparse='csc') + n_samples, n_features = X.shape + + _apply_selected(X, self._fit, dtype=self.dtype, + selected=self.categorical_features, copy=True, + return_val=False) return self - def _fit_transform(self, X): - """Assumes X contains only categorical features.""" - X = check_array(X, dtype=np.int) - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") + def _fit(self, X): + "Assumes `X` contains only cetergorical features." 
+ + X = check_array(X, dtype=np.object) n_samples, n_features = X.shape - if (isinstance(self.n_values, six.string_types) and - self.n_values == 'auto'): - n_values = np.max(X, axis=0) + 1 - elif isinstance(self.n_values, numbers.Integral): - if (np.max(X, axis=0) >= self.n_values).any(): - raise ValueError("Feature out of bounds for n_values=%d" - % self.n_values) - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self.n_values) - else: - try: - n_values = np.asarray(self.n_values, dtype=int) - except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`. Expected" - " 'auto', int or array of ints, got %r" - % type(X)) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - - self.n_values_ = n_values - n_values = np.hstack([[0], n_values]) - indices = np.cumsum(n_values) - self.feature_indices_ = indices - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() + self._n_features = n_features + self.label_encoders_ = [LabelEncoder() for i in range(n_features)] - if (isinstance(self.n_values, six.string_types) and - self.n_values == 'auto'): - mask = np.array(out.sum(axis=0)).ravel() != 0 - active_features = np.where(mask)[0] - out = out[:, active_features] - self.active_features_ = active_features + if self.n_values is not None: + warnings.warn('The parameter `n_values` is deprecated, use the' + 'parameter `classes_` instead and specify the ' + 'expected values for each feature') - return out if self.sparse else out.toarray() + if isinstance(self.n_values, numbers.Integral): + if (np.max(X, axis=0) >= self.n_values).any(): + raise ValueError("Feature out of bounds for n_values=%d" + % 
self.n_values) + self.values = self.n_values + else: + try: + n_values = np.asarray(self.n_values, dtype=int) + except (ValueError, TypeError): + raise TypeError("Wrong type for parameter `n_values`." + " Expected 'auto', int or array of ints," + "got %r" % type(X)) + if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if n_values is an array," + " it has to be of shape (n_features,).") + self.values = list(self.n_values) + + error_msg = ("`values` should be 'auto', an integer, a list of" + " integers or a list of list") + + for i in range(n_features): + le = self.label_encoders_[i] + if self.values == 'auto': + le.fit(X[:, i]) + elif isinstance(self.values, numbers.Integral): + if (np.max(X, axis=0) >= self.values).any(): + raise ValueError("Feature out of bounds for n_values=%d" + % self.values) + le.fit(np.arange(self.values, dtype=np.int)) + elif isinstance(self.values, list): + if len(self.values) != X.shape[1]: + raise ValueError("Shape mismatch: if n_values is a list," + " it has to be of length (n_features).") + if isinstance(self.values[i], list): + le.fit(self.values[i]) + elif isinstance(self.values[i], numbers.Integral): + le.fit(np.arange(self.values[i], dtype=np.int)) + else: + raise ValueError(error_msg) + else: + raise ValueError(error_msg) + + def transform(self, X, y=None): + """Encode the selected categorical features using the one-hot scheme. - def fit_transform(self, X, y=None): - """Fit OneHotEncoder to X, then transform X. + Parameters + ---------- + X : array-like, shape [n_samples, n_feature] + Array of ints or strings or both. - Equivalent to self.fit(X).transform(X), but more convenient and more - efficient. See fit for the parameters, transform for the return value. + Returns + ------- + out : array, shape[n_samples, n_features_new] + `X` encoded using the one-hot scheme. 
""" - return _transform_selected(X, self._fit_transform, - self.categorical_features, copy=True) + X = check_array(X, dtype=np.object) + + return _apply_selected(X, self._transform, copy=True, + selected=self.categorical_features) def _transform(self, X): - """Assumes X contains only categorical features.""" - X = check_array(X, dtype=np.int) - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") + "Assumes `X` contains only categorical features." + + X = check_array(X, accept_sparse='csc', dtype=np.object) n_samples, n_features = X.shape + X_int = np.zeros_like(X, dtype=np.int) + X_mask = np.ones_like(X, dtype=np.bool) + + for i in range(n_features): + if np_version < (1, 8): + # in1d is not supported for object datatype in np < 1.8 + valid_mask = np.ones_like(X[:, i], dtype=np.bool) + found_classes = set(np.unique(X[:, i])) + valid_classes = set(self.label_encoders_[i].classes_) + invalid_classes = found_classes - valid_classes + + for item in invalid_classes: + mask = X[:, i] == item + np.logical_not(mask, mask) + np.logical_and(valid_mask, mask, valid_mask) + + else: + valid_mask = np.in1d(X[:, i], self.label_encoders_[i].classes_) + + if not np.all(valid_mask): + + if self.handle_unknown == 'error': + if np_version < (1, 8): + valid_classes = set(self.label_encoders_[i].classes_) + diff = set(X[:, i]) - valid_classes + diff = list(diff) + else: + diff = np.setdiff1d(X[:, i], + self.label_encoders_[i].classes_) + msg = 'Unknown feature(s) %s in column %d' % (diff, i) + raise ValueError(msg) + elif self.handle_unknown == 'ignore': + # Set the problematic rows to an acceptable value and + # continue `The rows are marked in `X_mask` and will be + # removed later. 
+ X_mask[:, i] = valid_mask + X[:, i][~valid_mask] = self.label_encoders_[i].classes_[0] + else: + template = ("handle_unknown should be either 'error' or " + "'ignore', got %s") + raise ValueError(template % self.handle_unknown) + + X_int[:, i] = self.label_encoders_[i].transform(X[:, i]) - indices = self.feature_indices_ - if n_features != indices.shape[0] - 1: - raise ValueError("X has different shape than during fitting." - " Expected %d, got %d." - % (indices.shape[0] - 1, n_features)) - - # We use only those categorical features of X that are known using fit. - # i.e lesser than n_values_ using mask. - # This means, if self.handle_unknown is "ignore", the row_indices and - # col_indices corresponding to the unknown categorical feature are - # ignored. - mask = (X < self.n_values_).ravel() - if np.any(~mask): - if self.handle_unknown not in ['error', 'ignore']: - raise ValueError("handle_unknown should be either error or " - "unknown got %s" % self.handle_unknown) - if self.handle_unknown == 'error': - raise ValueError("unknown categorical feature present %s " - "during transform." % X.ravel()[~mask]) - - column_indices = (X + indices[:-1]).ravel()[mask] + mask = X_mask.ravel() + n_values = [le.classes_.shape[0] for le in self.label_encoders_] + n_values = np.hstack([[0], n_values]) + indices = np.cumsum(n_values) + + column_indices = (X_int + indices[:-1]).ravel()[mask] row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), n_features)[mask] - data = np.ones(np.sum(mask)) + data = np.ones(n_samples * n_features)[mask] + out = sparse.coo_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() + if (isinstance(self.n_values, six.string_types) and self.n_values == 'auto'): out = out[:, self.active_features_] return out if self.sparse else out.toarray() - def transform(self, X): - """Transform X using one-hot encoding. 
+ @property + def active_features_(self): + warnings.warn('The property `active_features_` is deprecated and' + ' will be removed in version 0.20') + if self.n_values is None: + #TODO: What to do when classes are strings ? + classes = [le.classes_ for le in self.label_encoders_] + classes_max = [np.max(cls) + 1 for cls in classes] + cum_idx = np.cumsum([0] + classes_max) + active_idx = [self.label_encoders_[i].classes_.astype(np.int) + + cum_idx[i] + for i in range(self._n_features)] + + return np.concatenate(active_idx, axis=0).astype(np.int) + else: + raise AttributeError() - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - Input array of type int. + @property + def feature_indices_(self): + warnings.warn('The property `feature_indices_` is deprecated and' + ' will be removed in version 0.20') + classes_max = [np.max(le.classes_) + 1 for le in self.label_encoders_] + return np.cumsum([0] + classes_max) - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array, dtype=int - Transformed input. 
- """ - return _transform_selected(X, self._transform, - self.categorical_features, copy=True) + @property + def n_values_(self): + warnings.warn('The property `n_values_` is deprecated and' + ' will be removed in version 0.20') + return np.array([le.classes_.shape[0] for le in self.label_encoders_]) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 7a51049b60242..13bcf6c8e04ac 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -6,6 +6,7 @@ # License: BSD 3 clause import warnings +import re import numpy as np import numpy.linalg as la from scipy import sparse @@ -31,7 +32,7 @@ from sklearn.utils.testing import skip_if_32bit from sklearn.utils.sparsefuncs import mean_variance_axis -from sklearn.preprocessing.data import _transform_selected +from sklearn.preprocessing.data import _apply_selected from sklearn.preprocessing.data import _handle_zeros_in_scale from sklearn.preprocessing.data import Binarizer from sklearn.preprocessing.data import KernelCenterer @@ -1488,9 +1489,10 @@ def test_one_hot_encoder_sparse(): # test that an error is raised when out of bounds: X_too_large = [[0, 2, 1], [0, 1, 1]] assert_raises(ValueError, enc.transform, X_too_large) - error_msg = "unknown categorical feature present \[2\] during transform." 
+ error_msg = re.escape("Unknown feature(s) [2] in column 1") assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X) + assert_raises(ValueError, OneHotEncoder(values=2).fit_transform, X) # test that error is raised when wrong number of features assert_raises(ValueError, enc.transform, X[:, :-1]) @@ -1500,14 +1502,6 @@ def test_one_hot_encoder_sparse(): # test exception on wrong init param assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) - enc = OneHotEncoder() - # test negative input to fit - assert_raises(ValueError, enc.fit, [[0], [-1]]) - - # test negative input to transform - enc.fit([[0], [1]]) - assert_raises(ValueError, enc.transform, [[0], [-1]]) - def test_one_hot_encoder_dense(): # check for sparse=False @@ -1526,26 +1520,26 @@ def test_one_hot_encoder_dense(): [1., 0., 1., 0., 1.]])) -def _check_transform_selected(X, X_expected, sel): +def _check_apply_selected(X, X_expected, sel): for M in (X, sparse.csr_matrix(X)): - Xtr = _transform_selected(M, Binarizer().transform, sel) + Xtr = _apply_selected(M, Binarizer().transform, sel) assert_array_equal(toarray(Xtr), X_expected) def test_transform_selected(): - X = [[3, 2, 1], [0, 1, 1]] + X = np.array([[3, 2, 1], [0, 1, 1]]) X_expected = [[1, 2, 1], [0, 1, 1]] - _check_transform_selected(X, X_expected, [0]) - _check_transform_selected(X, X_expected, [True, False, False]) + _check_apply_selected(X, X_expected, [0]) + _check_apply_selected(X, X_expected, [True, False, False]) X_expected = [[1, 1, 1], [0, 1, 1]] - _check_transform_selected(X, X_expected, [0, 1, 2]) - _check_transform_selected(X, X_expected, [True, True, True]) - _check_transform_selected(X, X_expected, "all") + _check_apply_selected(X, X_expected, [0, 1, 2]) + _check_apply_selected(X, X_expected, [True, True, True]) + _check_apply_selected(X, X_expected, "all") - _check_transform_selected(X, X, []) - _check_transform_selected(X, X, [False, False, 
False]) + _check_apply_selected(X, X, []) + _check_apply_selected(X, X, [False, False, False]) def test_transform_selected_copy_arg(): @@ -1558,8 +1552,8 @@ def _mutating_transformer(X): expected_Xtr = [[2, 2], [3, 4]] X = original_X.copy() - Xtr = _transform_selected(X, _mutating_transformer, copy=True, - selected='all') + Xtr = _apply_selected(X, _mutating_transformer, copy=True, + selected='all') assert_array_equal(toarray(X), toarray(original_X)) assert_array_equal(toarray(Xtr), expected_Xtr) @@ -1588,9 +1582,17 @@ def _check_one_hot(X, X2, cat, n_features): assert_array_equal(toarray(B), toarray(D)) +def test_one_hot_encoder_string(): + X = [['cat', 'domestic'], ['wolf', 'wild']] + enc = OneHotEncoder() + enc.fit(X) + Xtr = enc.transform([['cat', 'wild']]) + assert_array_equal(toarray(Xtr), [[1, 0, 0, 1]]) + + def test_one_hot_encoder_categorical_features(): X = np.array([[3, 2, 1], [0, 1, 1]]) - X2 = np.array([[1, 1, 1]]) + X2 = np.array([[3, 1, 1]]) cat = [True, False, False] _check_one_hot(X, X2, cat, 4) @@ -1621,7 +1623,23 @@ def test_one_hot_encoder_unknown_transform(): oh.transform(y).toarray(), np.array([[0., 0., 0., 0., 1., 0., 0.]])) - # Raise error if handle_unknown is neither ignore or error. + X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]]) + y = np.array([['ET', 1, 1]]) + + # Test that one hot encoder raises error for unknown features + # present during transform. + oh = OneHotEncoder(handle_unknown='error') + oh.fit(X) + assert_raises(ValueError, oh.transform, y) + + # Test the ignore option, ignores unknown features. + oh = OneHotEncoder(handle_unknown='ignore') + oh.fit(X) + assert_array_equal( + oh.transform(y).toarray(), + np.array([[0., 0., 0., 0., 0., 1., 0., 0.]])) + + # Raise error if handle_unknown is neither ignore nor error. 
oh = OneHotEncoder(handle_unknown='42') oh.fit(X) assert_raises(ValueError, oh.transform, y) From e03b5c7e3e0be4b53c4d9b22deaab3a51557f87b Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Mon, 2 May 2016 11:45:26 -0400 Subject: [PATCH 02/36] ported functions to fixes.py --- sklearn/preprocessing/data.py | 32 +++++-------------------- sklearn/utils/fixes.py | 44 +++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 1fe0741d9db13..421e4d66073e5 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -27,7 +27,7 @@ min_max_axis) from ..utils.validation import check_is_fitted, FLOAT_DTYPES from .label import LabelEncoder -from ..utils.fixes import np_version +from ..utils.fixes import in1d, setdiff1d zip = six.moves.zip @@ -1651,10 +1651,8 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, return X n_features = X.shape[1] - ind = np.arange(n_features) sel = np.zeros(n_features, dtype=bool) sel[np.asarray(selected)] = True - not_sel = np.logical_not(sel) n_selected = np.sum(sel) if n_selected == 0: @@ -1664,8 +1662,8 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, # All features selected. return transform(X) else: - X_sel = transform(X[:, ind[sel]]) - X_not_sel = X[:, ind[not_sel]].astype(dtype) + X_sel = transform(X[:, sel]) + X_not_sel = X[:, ~sel].astype(dtype) if return_val: if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): @@ -1788,7 +1786,7 @@ def fit(self, X, y=None): return self def _fit(self, X): - "Assumes `X` contains only cetergorical features." + "Assumes `X` contains only catergorical features." 
X = check_array(X, dtype=np.object) n_samples, n_features = X.shape @@ -1870,31 +1868,13 @@ def _transform(self, X): X_mask = np.ones_like(X, dtype=np.bool) for i in range(n_features): - if np_version < (1, 8): - # in1d is not supported for object datatype in np < 1.8 - valid_mask = np.ones_like(X[:, i], dtype=np.bool) - found_classes = set(np.unique(X[:, i])) - valid_classes = set(self.label_encoders_[i].classes_) - invalid_classes = found_classes - valid_classes - - for item in invalid_classes: - mask = X[:, i] == item - np.logical_not(mask, mask) - np.logical_and(valid_mask, mask, valid_mask) - else: - valid_mask = np.in1d(X[:, i], self.label_encoders_[i].classes_) + valid_mask = in1d(X[:, i], self.label_encoders_[i].classes_) if not np.all(valid_mask): if self.handle_unknown == 'error': - if np_version < (1, 8): - valid_classes = set(self.label_encoders_[i].classes_) - diff = set(X[:, i]) - valid_classes - diff = list(diff) - else: - diff = np.setdiff1d(X[:, i], - self.label_encoders_[i].classes_) + diff = setdiff1d(X[:, i], self.label_encoders_[i].classes_) msg = 'Unknown feature(s) %s in column %d' % (diff, i) raise ValueError(msg) elif self.handle_unknown == 'ignore': diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index d789d5f525cd4..08c7edc3c28a1 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -217,13 +217,32 @@ def frombuffer_empty(buf, dtype): frombuffer_empty = np.frombuffer +def _in1d_object(ar1, ar2, invert=False): + # np.argsort(kind='mergesort') is only supported for object types after + # version 1.8. 
Hence in1d for object arrays needs to be handled differently + values1 = set(ar1) + values2 = set(ar2) + abset_values = values1 - values2 + + present = np.ones_like(ar1, dtype=np.bool) + + for value in abset_values: + present[ar1 == value] = False + + return ~present if invert else present + + if np_version < (1, 8): def in1d(ar1, ar2, assume_unique=False, invert=False): # Backport of numpy function in1d 1.8.1 to support numpy 1.6.2 # Ravel both arrays, behavior for the first array could be different + ar1 = np.asarray(ar1).ravel() ar2 = np.asarray(ar2).ravel() + if ar1.dtype == object or ar2.dtype == object: + return _in1d_object(ar1, ar2, invert) + # This code is significantly faster when the condition is satisfied. if len(ar2) < 10 * len(ar1) ** 0.145: if invert: @@ -408,3 +427,28 @@ def norm(X, ord=None, axis=None): else: norm = np.linalg.norm + + +if np_version < (1, 8): + # Backport of setdiff1d function as it relies on in1d + def setdiff1d(ar1, ar2, assume_unique=False): + # copy-paste from numpy except for the object type if clause + if assume_unique: + ar1 = np.asarray(ar1).ravel() + else: + # Unique is not supported for object arrays till np version 1.8 + # due to mergesort + if ar1.dtype == object: + ar1 = np.array(set(ar1)) + else: + ar1 = np.unique(ar1) + + if ar2.dtype == object: + ar2 = np.array(set(ar2)) + else: + ar1 = np.unique(ar2) + + return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)] + +else: + from numpy import setdiff1d From 06e6d3adb272bc99943d50eff0a17f0aaa50623b Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Mon, 2 May 2016 14:01:45 -0400 Subject: [PATCH 03/36] unique arrays are now sorted --- sklearn/utils/fixes.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 08c7edc3c28a1..9c50b38bee4ef 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -222,11 +222,11 @@ def _in1d_object(ar1, ar2, invert=False): # version 1.8. 
Hence in1d for object arrays needs to be handled differently values1 = set(ar1) values2 = set(ar2) - abset_values = values1 - values2 + absent_values = values1 - values2 present = np.ones_like(ar1, dtype=np.bool) - for value in abset_values: + for value in absent_values: present[ar1 == value] = False return ~present if invert else present @@ -439,16 +439,16 @@ def setdiff1d(ar1, ar2, assume_unique=False): # Unique is not supported for object arrays till np version 1.8 # due to mergesort if ar1.dtype == object: - ar1 = np.array(set(ar1)) + ar1 = np.array(sorted(set(ar1))) else: ar1 = np.unique(ar1) if ar2.dtype == object: - ar2 = np.array(set(ar2)) + ar2 = np.array(sorted(set(ar2))) else: - ar1 = np.unique(ar2) + ar2 = np.unique(ar2) - return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)] + return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)] else: from numpy import setdiff1d From 074f194e096000b243fb21dcf81cc7da1fc76e5d Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Mon, 2 May 2016 14:59:58 -0400 Subject: [PATCH 04/36] revert selection logic --- sklearn/preprocessing/data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 421e4d66073e5..eee4fe96bf12a 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1651,8 +1651,10 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, return X n_features = X.shape[1] + ind = np.arange(n_features) sel = np.zeros(n_features, dtype=bool) sel[np.asarray(selected)] = True + not_sel = np.logical_not(sel) n_selected = np.sum(sel) if n_selected == 0: @@ -1662,8 +1664,8 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, # All features selected. 
return transform(X) else: - X_sel = transform(X[:, sel]) - X_not_sel = X[:, ~sel].astype(dtype) + X_sel = transform(X[:, ind[sel]]) + X_not_sel = X[:, ind[not_sel]].astype(dtype) if return_val: if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): From 083142ed381bad23b97c395016a5f5d014a891e8 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Mon, 2 May 2016 16:49:41 -0400 Subject: [PATCH 05/36] Added copy argument --- sklearn/preprocessing/data.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index eee4fe96bf12a..139f3cac202fd 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1717,6 +1717,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Whether to raise an error or ignore if a unknown categorical feature is present during transform. + copy : bool, default=True + If False, `X` may be modified in place. + Attributes ---------- label_encoders_ : list of size n_features. @@ -1756,15 +1759,16 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): and n_classes-1. """ - def __init__(self, categorical_features="all", n_values=None, - values='auto', dtype=np.float64, sparse=True, - handle_unknown='error'): + def __init__(self, values='auto', categorical_features="all", + n_values=None, dtype=np.float64, sparse=True, + handle_unknown='error', copy=True): + self.values = values self.categorical_features = categorical_features self.dtype = dtype self.sparse = sparse self.handle_unknown = handle_unknown self.n_values = n_values - self.values = values + self.copy = copy def fit(self, X, y=None): """Fit the CategoricalEncoder to X. 
@@ -1779,7 +1783,7 @@ def fit(self, X, y=None): self """ - X = check_array(X, dtype=np.object, accept_sparse='csc') + X = check_array(X, dtype=np.object, accept_sparse='csc', copy=self.copy) n_samples, n_features = X.shape _apply_selected(X, self._fit, dtype=self.dtype, From f768f3bee800d72cff7315800b159cef3e68405c Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Wed, 31 Aug 2016 11:43:13 -0400 Subject: [PATCH 06/36] Inbetween adding the seen option --- sklearn/preprocessing/data.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 139f3cac202fd..ddeac726bc8bc 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1692,8 +1692,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - values : 'auto', int, list of ints, or list of lists of objects - - 'auto' : determine set of values from training data. + values : 'auto', 'seen', int, list of ints, or list of lists of objects + - 'auto' : determine set of values from training data. If the input + is an int array, values are determined from range in + training data. For all other inputs, only values observed + during `fit` are considered valid values for each feature. + - 'seen': Only values observed during `fit` are considered valid + values for each feature. 
- int : values are in ``range(values)`` for all features - list of ints : values for feature ``i`` are in ``range(values[i])`` - list of lists : values for feature ``i`` are in ``values[i]`` @@ -1783,7 +1788,8 @@ def fit(self, X, y=None): self """ - X = check_array(X, dtype=np.object, accept_sparse='csc', copy=self.copy) + X = check_array(X, dtype=np.object, accept_sparse='csc', + copy=self.copy) n_samples, n_features = X.shape _apply_selected(X, self._fit, dtype=self.dtype, @@ -1828,6 +1834,8 @@ def _fit(self, X): for i in range(n_features): le = self.label_encoders_[i] if self.values == 'auto': + le.fit(np.arange(1 + np.max(X[:, i]))) + elif self.values == 'seen': le.fit(X[:, i]) elif isinstance(self.values, numbers.Integral): if (np.max(X, axis=0) >= self.values).any(): From 1e34caef4a7682f8240503de52007b885b054641 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Thu, 1 Sep 2016 13:31:35 -0400 Subject: [PATCH 07/36] remove seen argument and support range case with FutureWarning --- sklearn/preprocessing/data.py | 46 ++++++++++++++++-------- sklearn/preprocessing/tests/test_data.py | 15 ++++++-- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index ddeac726bc8bc..43c24f0a5f6c8 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1693,12 +1693,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- values : 'auto', 'seen', int, list of ints, or list of lists of objects - - 'auto' : determine set of values from training data. If the input - is an int array, values are determined from range in - training data. For all other inputs, only values observed - during `fit` are considered valid values for each feature. - - 'seen': Only values observed during `fit` are considered valid - values for each feature. + - 'auto' : determine set of values from training data. 
See the + documentation of `handle_unknown` for which values are considered + acceptable. - int : values are in ``range(values)`` for all features - list of ints : values for feature ``i`` are in ``range(values[i])`` - list of lists : values for feature ``i`` are in ``values[i]`` @@ -1719,8 +1716,12 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Will return sparse matrix if set True else will return an array. handle_unknown : str, 'error' or 'ignore' - Whether to raise an error or ignore if a unknown categorical feature is - present during transform. + + - 'ignore': Ignore all unknown feature values. + - 'error': Raise an error when the value of a feature is unseen during + `fit` and out of range of values seen during `fit`. + - 'error-strict': Raise an error when the value of a feature is unseen + during`fit`. copy : bool, default=True If unset, `X` maybe modified in space. @@ -1805,6 +1806,8 @@ def _fit(self, X): self._n_features = n_features self.label_encoders_ = [LabelEncoder() for i in range(n_features)] + # Maximum value for each featue + self._max_values = [None for i in range(n_features)] if self.n_values is not None: warnings.warn('The parameter `n_values` is deprecated, use the' @@ -1833,9 +1836,9 @@ def _fit(self, X): for i in range(n_features): le = self.label_encoders_[i] + + self._max_values[i] = np.max(X[:, i]) if self.values == 'auto': - le.fit(np.arange(1 + np.max(X[:, i]))) - elif self.values == 'seen': le.fit(X[:, i]) elif isinstance(self.values, numbers.Integral): if (np.max(X, axis=0) >= self.values).any(): @@ -1886,14 +1889,27 @@ def _transform(self, X): valid_mask = in1d(X[:, i], self.label_encoders_[i].classes_) if not np.all(valid_mask): - - if self.handle_unknown == 'error': + if self.handle_unknown in ['error', 'error-strict']: diff = setdiff1d(X[:, i], self.label_encoders_[i].classes_) - msg = 'Unknown feature(s) %s in column %d' % (diff, i) - raise ValueError(msg) + if self.handle_unknown == 'error-strict': + msg = 'Unknown 
feature(s) %s in column %d' % (diff, i) + raise ValueError(msg) + else: + if np.all(diff <= self._max_values[i]): + msg = ('Values %s for feature %d are unknown but ' + 'in range. This will raise an error in ' + 'future versions.' % (str(diff), i)) + warnings.warn(FutureWarning(msg)) + X_mask[:, i] = valid_mask + le = self.label_encoders_[i] + X[:, i][~valid_mask] = le.classes_[0] + else: + msg = ('Unknown feature(s) %s in column %d' % + (diff, i)) + raise ValueError(msg) elif self.handle_unknown == 'ignore': # Set the problematic rows to an acceptable value and - # continue `The rows are marked in `X_mask` and will be + # continue. The rows are marked in `X_mask` and will be # removed later. X_mask[:, i] = valid_mask X[:, i][~valid_mask] = self.label_encoders_[i].classes_[0] diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 13bcf6c8e04ac..a576a3058a9c9 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1592,7 +1592,7 @@ def test_one_hot_encoder_string(): def test_one_hot_encoder_categorical_features(): X = np.array([[3, 2, 1], [0, 1, 1]]) - X2 = np.array([[3, 1, 1]]) + X2 = np.array([[1, 1, 1]]) cat = [True, False, False] _check_one_hot(X, X2, cat, 4) @@ -1612,7 +1612,7 @@ def test_one_hot_encoder_unknown_transform(): # Test that one hot encoder raises error for unknown features # present during transform. - oh = OneHotEncoder(handle_unknown='error') + oh = OneHotEncoder(handle_unknown='error-strict') oh.fit(X) assert_raises(ValueError, oh.transform, y) @@ -1628,10 +1628,19 @@ def test_one_hot_encoder_unknown_transform(): # Test that one hot encoder raises error for unknown features # present during transform. 
- oh = OneHotEncoder(handle_unknown='error') + oh = OneHotEncoder(handle_unknown='error-strict') oh.fit(X) assert_raises(ValueError, oh.transform, y) + # Test that one hot encoder raises warning for unknown but in range + # features + oh = OneHotEncoder(handle_unknown='error') + oh.fit(X) + msg = ('Values [0] for feature 2 are unknown but in range. ' + 'This will raise an error in future versions.') + assert_warns_message(FutureWarning, msg, oh.transform, + np.array([[0, 0, 0]])) + # Test the ignore option, ignores unknown features. oh = OneHotEncoder(handle_unknown='ignore') oh.fit(X) From fed795953e93c7e8c7646b5ba51948fcbdaddaae Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Fri, 2 Sep 2016 04:37:10 -0400 Subject: [PATCH 08/36] Made label_encoders_ private --- sklearn/preprocessing/data.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 43c24f0a5f6c8..9ba3dc4b3e572 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1623,6 +1623,7 @@ def add_dummy_feature(X, value=1.0): def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, return_val=True): """Apply a function to portion of selected features + Parameters ---------- X : {array, sparse matrix}, shape [n_samples, n_features] @@ -1636,9 +1637,10 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, return_val : boolean, optional Whether to return the transformed matrix. If not set `None` is returned. + Returns ------- - X : array or sparse matrix, shape=(n_samples, n_features_new) + X : array or sparse matrix, shape=(n_samples, n_features_new) """ if copy: @@ -1728,11 +1730,6 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Attributes ---------- - label_encoders_ : list of size n_features. - The :class:`sklearn.preprocessing.LabelEncoder` objects used to encode - the features. 
``self.label_encoders[i]_`` is the LabelEncoder object - used to encode the ith column. The unique features found on column - ``i`` can be accessed using ``self.label_encoders_[i].classes_``. Examples -------- @@ -1805,7 +1802,7 @@ def _fit(self, X): n_samples, n_features = X.shape self._n_features = n_features - self.label_encoders_ = [LabelEncoder() for i in range(n_features)] + self._label_encoders = [LabelEncoder() for i in range(n_features)] # Maximum value for each featue self._max_values = [None for i in range(n_features)] @@ -1835,7 +1832,7 @@ def _fit(self, X): " integers or a list of list") for i in range(n_features): - le = self.label_encoders_[i] + le = self._label_encoders[i] self._max_values[i] = np.max(X[:, i]) if self.values == 'auto': @@ -1886,11 +1883,12 @@ def _transform(self, X): for i in range(n_features): - valid_mask = in1d(X[:, i], self.label_encoders_[i].classes_) + valid_mask = in1d(X[:, i], self._label_encoders[i].classes_) if not np.all(valid_mask): if self.handle_unknown in ['error', 'error-strict']: - diff = setdiff1d(X[:, i], self.label_encoders_[i].classes_) + le = self._label_encoders[i] + diff = setdiff1d(X[:, i], le.classes_) if self.handle_unknown == 'error-strict': msg = 'Unknown feature(s) %s in column %d' % (diff, i) raise ValueError(msg) @@ -1901,7 +1899,7 @@ def _transform(self, X): 'future versions.' % (str(diff), i)) warnings.warn(FutureWarning(msg)) X_mask[:, i] = valid_mask - le = self.label_encoders_[i] + le = self._label_encoders[i] X[:, i][~valid_mask] = le.classes_[0] else: msg = ('Unknown feature(s) %s in column %d' % @@ -1912,16 +1910,16 @@ def _transform(self, X): # continue. The rows are marked in `X_mask` and will be # removed later. 
X_mask[:, i] = valid_mask - X[:, i][~valid_mask] = self.label_encoders_[i].classes_[0] + X[:, i][~valid_mask] = self._label_encoders[i].classes_[0] else: template = ("handle_unknown should be either 'error' or " "'ignore', got %s") raise ValueError(template % self.handle_unknown) - X_int[:, i] = self.label_encoders_[i].transform(X[:, i]) + X_int[:, i] = self._label_encoders[i].transform(X[:, i]) mask = X_mask.ravel() - n_values = [le.classes_.shape[0] for le in self.label_encoders_] + n_values = [le.classes_.shape[0] for le in self._label_encoders] n_values = np.hstack([[0], n_values]) indices = np.cumsum(n_values) @@ -1946,10 +1944,10 @@ def active_features_(self): ' will be removed in version 0.20') if self.n_values is None: #TODO: What to do when classes are strings ? - classes = [le.classes_ for le in self.label_encoders_] + classes = [le.classes_ for le in self._label_encoders] classes_max = [np.max(cls) + 1 for cls in classes] cum_idx = np.cumsum([0] + classes_max) - active_idx = [self.label_encoders_[i].classes_.astype(np.int) + active_idx = [self._label_encoders[i].classes_.astype(np.int) + cum_idx[i] for i in range(self._n_features)] @@ -1961,11 +1959,11 @@ def active_features_(self): def feature_indices_(self): warnings.warn('The property `feature_indices_` is deprecated and' ' will be removed in version 0.20') - classes_max = [np.max(le.classes_) + 1 for le in self.label_encoders_] + classes_max = [np.max(le.classes_) + 1 for le in self._label_encoders] return np.cumsum([0] + classes_max) @property def n_values_(self): warnings.warn('The property `n_values_` is deprecated and' ' will be removed in version 0.20') - return np.array([le.classes_.shape[0] for le in self.label_encoders_]) + return np.array([le.classes_.shape[0] for le in self._label_encoders]) From c62d2badc4ec1ca25138cd23fbd79fb05ced2b72 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Fri, 2 Sep 2016 06:27:30 -0400 Subject: [PATCH 09/36] Added new attributes and tests for OHE --- 
sklearn/preprocessing/data.py | 46 ++++++++++++++++++++++++ sklearn/preprocessing/tests/test_data.py | 19 ++++++++++ 2 files changed, 65 insertions(+) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 9ba3dc4b3e572..cad7b61d92a5a 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1730,6 +1730,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Attributes ---------- + feature_index_range_ : array, shape [n_feature, 2] + `feature_index_range_[i]` specifies the range of column indices + occupied by the feature `i` in the one-hot encoded array. + + one_hot_feature_index_ : array, shape [n_features_new] + `one_hot_feature_index_[i]` specifies which feature of the input + is encoded by column `i` in the one-hot encoded array. Examples -------- @@ -1793,6 +1800,45 @@ def fit(self, X, y=None): _apply_selected(X, self._fit, dtype=self.dtype, selected=self.categorical_features, copy=True, return_val=False) + + self.feature_index_range_ = np.zeros((n_features, 2), dtype=np.int) + + if (isinstance(self.categorical_features, six.string_types) and + self.categorical_features == "all"): + categorical = np.ones(n_features, dtype=bool) + else: + categorical = np.zeros(n_features, dtype=bool) + categorical[np.asarray(self.categorical_features)] = True + + num_cat = np.sum(categorical) + start = 0 + cat_index = 0 + #print(categorical, self.categorical_features) + for i in range(n_features): + if categorical[i]: + le = self._label_encoders[cat_index] + end = start + len(le.classes_) + self.feature_index_range_[i] = start, end + start += len(le.classes_) + cat_index += 1 + + indices = np.arange(start, start + n_features - num_cat) + self.feature_index_range_[~categorical, 0] = indices + indices += 1 + self.feature_index_range_[~categorical, 1] = indices + + if len(indices) > 0: + output_cols = indices[-1] + else: + output_cols = start + + print(output_cols) + self.one_hot_feature_index_ = np.empty(output_cols, 
dtype=np.int) + + for i in range(n_features): + s, e = self.feature_index_range_[i] + self.one_hot_feature_index_[s:e] = i + return self def _fit(self, X): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a576a3058a9c9..9bb183528c963 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1503,6 +1503,25 @@ def test_one_hot_encoder_sparse(): assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) +def test_one_hot_encoder_attr(): + X = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]] + + enc = OneHotEncoder() + enc.fit(X) + assert_array_equal(enc.feature_index_range_, [[0, 3], [3, 5], [5, 7]]) + assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 1, 1, 2, 2]) + + enc = OneHotEncoder(categorical_features=[True, False, True]) + enc.fit(X) + assert_array_equal(enc.feature_index_range_, [[0, 3], [5, 6], [3, 5]]) + assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 2, 2, 1]) + + enc = OneHotEncoder(categorical_features=[False, False, True]) + enc.fit(X) + assert_array_equal(enc.feature_index_range_, [[2, 3], [3, 4], [0, 2]]) + assert_array_equal(enc.one_hot_feature_index_, [2, 2, 0, 1]) + + def test_one_hot_encoder_dense(): # check for sparse=False X = [[3, 2, 1], [0, 1, 1]] From e929f23838651773ee4a6e675be2acb3f8cb748d Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Fri, 2 Sep 2016 06:39:04 -0400 Subject: [PATCH 10/36] Fixed doctests --- doc/modules/preprocessing.rst | 2 +- sklearn/preprocessing/data.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 24df41f2966fa..c624cb836dc07 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -405,7 +405,7 @@ Continuing the example above:: array([[ 1., 0., 1., 0., 0., 1.]]) By default, how many values each feature can take is inferred automatically from the dataset. 
-It is possible to specify this explicitly using the parameter ``xvalues``. +It is possible to specify this explicitly using the parameter ``values``. There are two genders, three possible continents and four web browsers in our dataset. Then we fit the estimator, and transform a data point. diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index cad7b61d92a5a..121d4b8b7e9cd 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1747,10 +1747,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import OneHotEncoder >>> enc = OneHotEncoder() >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) # doctest: +ELLIPSIS - OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, - handle_unknown='error', n_values=None, sparse=True, values='auto') - >>> list(enc.label_encoders_[0].classes_) - ['cat', 'dog', 'mouse'] + OneHotEncoder(categorical_features='all', copy=True, + dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, + sparse=True, values='auto') >>> enc.transform([['dog', 4]]).toarray() array([[ 0., 1., 0., 1., 0., 0.]]) @@ -1813,7 +1812,7 @@ def fit(self, X, y=None): num_cat = np.sum(categorical) start = 0 cat_index = 0 - #print(categorical, self.categorical_features) + for i in range(n_features): if categorical[i]: le = self._label_encoders[cat_index] @@ -1832,7 +1831,6 @@ def fit(self, X, y=None): else: output_cols = start - print(output_cols) self.one_hot_feature_index_ = np.empty(output_cols, dtype=np.int) for i in range(n_features): From bc7a26bfa7c30c719c185c4b903c3426c8878801 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Fri, 2 Sep 2016 08:07:00 -0400 Subject: [PATCH 11/36] Fixed rst doc tests --- doc/modules/preprocessing.rst | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index c624cb836dc07..9b4068c4710e1 100644 --- 
a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -399,8 +399,9 @@ Continuing the example above:: >>> enc = preprocessing.OneHotEncoder() >>> enc.fit([['female', 'from US', 'uses Chrome'], ... ['male', 'from Asia', 'uses Firefox']]) # doctest: +ELLIPSIS - OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, - handle_unknown='error', n_values=None, sparse=True, values='auto') + OneHotEncoder(categorical_features='all', copy=True, + dtype=, handle_unknown='error', n_values=None, + sparse=True, values='auto') >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray() array([[ 1., 0., 1., 0., 0., 1.]]) @@ -423,9 +424,11 @@ features, one has to explicitly set ``n_values``. For example, >>> # feature >>> enc.fit([['female', 'from US', 'uses Chrome'], ... ['male', 'from Asia', 'uses Internet Explorer']]) # doctest: +ELLIPSIS - OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, - handle_unknown='error', n_values=None, sparse=True, - values=[...]) + OneHotEncoder(categorical_features='all', copy=True, + dtype=, handle_unknown='error', n_values=None, + sparse=True, + values=[...]) + >>> enc.transform([['male', 'from Europe', 'uses Safari']]).toarray() array([[ 0., 1., 0., 1., 0., 0., 0., 0., 1.]]) From feaf0148033ac5d9f5c3b34edcc7e1512ae0d6b3 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Fri, 2 Sep 2016 08:09:41 -0400 Subject: [PATCH 12/36] Replaced type in array with ellipsis --- doc/modules/preprocessing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 9b4068c4710e1..68edaf934ac18 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -400,7 +400,7 @@ Continuing the example above:: >>> enc.fit([['female', 'from US', 'uses Chrome'], ... 
['male', 'from Asia', 'uses Firefox']]) # doctest: +ELLIPSIS OneHotEncoder(categorical_features='all', copy=True, - dtype=, handle_unknown='error', n_values=None, + dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray() array([[ 1., 0., 1., 0., 0., 1.]]) @@ -425,7 +425,7 @@ features, one has to explicitly set ``n_values``. For example, >>> enc.fit([['female', 'from US', 'uses Chrome'], ... ['male', 'from Asia', 'uses Internet Explorer']]) # doctest: +ELLIPSIS OneHotEncoder(categorical_features='all', copy=True, - dtype=, handle_unknown='error', n_values=None, + dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values=[...]) From 7b608e12651a83d7a12b0165bc4c4011d96117ba Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Fri, 2 Sep 2016 08:32:06 -0400 Subject: [PATCH 13/36] flake fixes --- sklearn/preprocessing/data.py | 10 +++++----- sklearn/utils/fixes.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 121d4b8b7e9cd..37478070ae2e8 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1802,8 +1802,9 @@ def fit(self, X, y=None): self.feature_index_range_ = np.zeros((n_features, 2), dtype=np.int) - if (isinstance(self.categorical_features, six.string_types) and - self.categorical_features == "all"): + if isinstance(self.categorical_features, six.string_types) and \ + self.categorical_features == "all": + categorical = np.ones(n_features, dtype=bool) else: categorical = np.zeros(n_features, dtype=bool) @@ -1987,12 +1988,11 @@ def active_features_(self): warnings.warn('The property `active_features_` is deprecated and' ' will be removed in version 0.20') if self.n_values is None: - #TODO: What to do when classes are strings ? 
classes = [le.classes_ for le in self._label_encoders] classes_max = [np.max(cls) + 1 for cls in classes] cum_idx = np.cumsum([0] + classes_max) - active_idx = [self._label_encoders[i].classes_.astype(np.int) - + cum_idx[i] + active_idx = [self._label_encoders[i].classes_.astype(np.int) + + cum_idx[i] for i in range(self._n_features)] return np.concatenate(active_idx, axis=0).astype(np.int) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 9c50b38bee4ef..fe37e9469c720 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -451,4 +451,4 @@ def setdiff1d(ar1, ar2, assume_unique=False): return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)] else: - from numpy import setdiff1d + from numpy import setdiff1d # noqa From 5f305d827a494db76ce7d9ef41e492e8233f4ae6 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Fri, 2 Sep 2016 10:36:36 -0400 Subject: [PATCH 14/36] Add NORMALIZE_WHITESPACE for python3 tests --- sklearn/preprocessing/data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 37478070ae2e8..b0071353b2163 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1746,8 +1746,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import OneHotEncoder >>> enc = OneHotEncoder() - >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) # doctest: +ELLIPSIS - OneHotEncoder(categorical_features='all', copy=True, + >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) \ + # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + OneHotEncoder(categorical_features='all', copy=True, dtype=<... 
'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') >>> enc.transform([['dog', 4]]).toarray() From 1392292679628ce1a2f4302ed9366ace76e745a4 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Fri, 2 Sep 2016 10:53:09 -0400 Subject: [PATCH 15/36] normalize whitespace for rst docs --- doc/modules/preprocessing.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 68edaf934ac18..622489c19ba13 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -423,7 +423,8 @@ features, one has to explicitly set ``n_values``. For example, >>> # Note that for there are missing categorical values for the 2nd and 3rd >>> # feature >>> enc.fit([['female', 'from US', 'uses Chrome'], - ... ['male', 'from Asia', 'uses Internet Explorer']]) # doctest: +ELLIPSIS + ... ['male', 'from Asia', 'uses Internet Explorer']]) \ + ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE OneHotEncoder(categorical_features='all', copy=True, dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, From 50d23607f2f3be389f011456a0280ceaa632d5c0 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Fri, 2 Sep 2016 11:06:42 -0400 Subject: [PATCH 16/36] normalizing whitespace again --- doc/modules/preprocessing.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 622489c19ba13..f1bfba00dde01 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -398,7 +398,8 @@ Continuing the example above:: >>> enc = preprocessing.OneHotEncoder() >>> enc.fit([['female', 'from US', 'uses Chrome'], - ... ['male', 'from Asia', 'uses Firefox']]) # doctest: +ELLIPSIS + ... ['male', 'from Asia', 'uses Firefox']]) \ + ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE OneHotEncoder(categorical_features='all', copy=True, dtype=<... 
'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') From 8f2f1d39b7e50c58d2a2454939a26df2fd8c1cd4 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Tue, 6 Sep 2016 14:03:25 -0400 Subject: [PATCH 17/36] docstring changes and minor optimizations --- sklearn/preprocessing/data.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index b0071353b2163..9f5d4ea205c28 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1694,7 +1694,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - values : 'auto', 'seen', int, list of ints, or list of lists of objects + values : 'auto', int, list of ints, or list of lists of objects - 'auto' : determine set of values from training data. See the documentation of `handle_unknown` for which values are considered acceptable. @@ -1731,11 +1731,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Attributes ---------- feature_index_range_ : array, shape [n_feature, 2] - `feature_index_range_[i]` specifies the range of column indices - occupied by the feature `i` in the one-hot encoded array. + ``feature_index_range_[i]`` specifies the range of column indices + occupied by the input feature `i` in the one-hot encoded array. one_hot_feature_index_ : array, shape [n_features_new] - `one_hot_feature_index_[i]` specifies which feature of the input + ``one_hot_feature_index_[i]`` specifies which feature of the input is encoded by column `i` in the one-hot encoded array. 
Examples @@ -1820,7 +1820,7 @@ def fit(self, X, y=None): le = self._label_encoders[cat_index] end = start + len(le.classes_) self.feature_index_range_[i] = start, end - start += len(le.classes_) + start = end cat_index += 1 indices = np.arange(start, start + n_features - num_cat) @@ -1844,7 +1844,8 @@ def fit(self, X, y=None): def _fit(self, X): "Assumes `X` contains only catergorical features." - X = check_array(X, dtype=np.object) + if not np.issubdtype(X.dtype.type, np.integer): + X = check_array(X, dtype=np.object) n_samples, n_features = X.shape self._n_features = n_features @@ -1854,7 +1855,7 @@ def _fit(self, X): if self.n_values is not None: warnings.warn('The parameter `n_values` is deprecated, use the' - 'parameter `classes_` instead and specify the ' + 'parameter `values` instead and specify the ' 'expected values for each feature') if isinstance(self.n_values, numbers.Integral): From 1c8accfa89881b6c3c4bece0f166115758876a95 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Wed, 28 Dec 2016 13:45:28 +0530 Subject: [PATCH 18/36] Made tests pass by creating arrays with object dtype --- sklearn/preprocessing/tests/test_data.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 9bb183528c963..512866a5475f0 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1642,8 +1642,9 @@ def test_one_hot_encoder_unknown_transform(): oh.transform(y).toarray(), np.array([[0., 0., 0., 0., 1., 0., 0.]])) - X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]]) - y = np.array([['ET', 1, 1]]) + X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]], + dtype=np.object) + y = np.array([['ET', 1, 1]], dtype=np.object) # Test that one hot encoder raises error for unknown features # present during transform. 
@@ -1658,7 +1659,7 @@ def test_one_hot_encoder_unknown_transform(): msg = ('Values [0] for feature 2 are unknown but in range. ' 'This will raise an error in future versions.') assert_warns_message(FutureWarning, msg, oh.transform, - np.array([[0, 0, 0]])) + np.array([['mouse', 0, 0]], dtype=np.object)) # Test the ignore option, ignores unknown features. oh = OneHotEncoder(handle_unknown='ignore') From 6edda8b8bea06b1a71ba691c62849d1214c69194 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Wed, 28 Dec 2016 15:17:57 +0530 Subject: [PATCH 19/36] Assign both values and n_values to self._values and remove redundant checking --- sklearn/preprocessing/data.py | 67 +++++++++++------------- sklearn/preprocessing/tests/test_data.py | 11 ++-- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 9f5d4ea205c28..533ef8e020f92 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1720,8 +1720,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): handle_unknown : str, 'error' or 'ignore' - 'ignore': Ignore all unknown feature values. - - 'error': Raise an error when the value of a feature is unseen during - `fit` and out of range of values seen during `fit`. + - 'error': Raise an error when the value of a feature is more than the + maximum value seen during fit. - 'error-strict': Raise an error when the value of a feature is unseen during`fit`. @@ -1851,29 +1851,17 @@ def _fit(self, X): self._n_features = n_features self._label_encoders = [LabelEncoder() for i in range(n_features)] # Maximum value for each featue - self._max_values = [None for i in range(n_features)] + self._max_values = [None] * n_features if self.n_values is not None: - warnings.warn('The parameter `n_values` is deprecated, use the' + warnings.warn('`n_values` has been renamed to `values`.' 
+ 'The parameter `n_values` is deprecated, use the' 'parameter `values` instead and specify the ' 'expected values for each feature') - if isinstance(self.n_values, numbers.Integral): - if (np.max(X, axis=0) >= self.n_values).any(): - raise ValueError("Feature out of bounds for n_values=%d" - % self.n_values) - self.values = self.n_values - else: - try: - n_values = np.asarray(self.n_values, dtype=int) - except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`." - " Expected 'auto', int or array of ints," - "got %r" % type(X)) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - self.values = list(self.n_values) + self._values = self.n_values + else: + self._values = self.values error_msg = ("`values` should be 'auto', an integer, a list of" " integers or a list of list") @@ -1882,25 +1870,32 @@ def _fit(self, X): le = self._label_encoders[i] self._max_values[i] = np.max(X[:, i]) - if self.values == 'auto': + + if isinstance(self._values, numbers.Integral): + self._values = np.ones(n_features, dtype=np.int) * self._values + + if self._values == 'auto': le.fit(X[:, i]) - elif isinstance(self.values, numbers.Integral): - if (np.max(X, axis=0) >= self.values).any(): - raise ValueError("Feature out of bounds for n_values=%d" - % self.values) - le.fit(np.arange(self.values, dtype=np.int)) - elif isinstance(self.values, list): - if len(self.values) != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is a list," + + elif (isinstance(self._values, list) or + isinstance(self._values, np.ndarray)): + if len(self._values) != X.shape[1]: + raise ValueError("Shape mismatch: if values is a list," " it has to be of length (n_features).") - if isinstance(self.values[i], list): - le.fit(self.values[i]) - elif isinstance(self.values[i], numbers.Integral): - le.fit(np.arange(self.values[i], dtype=np.int)) + if 
isinstance(self._values[i], list): + le.fit(self._values[i]) + elif np.isscalar(self._values[i]): + le.fit(np.arange(self._values[i], dtype=np.int)) + X_feature_max = np.max(X, axis=0) + mask = X_feature_max >= self._values + if mask.any(): + msg = 'Value(s) %s out of bounds for feature(s) %s' + raise ValueError(msg % (X_feature_max[mask], + np.where(mask)[0])) else: raise ValueError(error_msg) else: - raise ValueError(error_msg) + raise TypeError(error_msg) def transform(self, X, y=None): """Encode the selected categorical features using the one-hot scheme. @@ -1943,7 +1938,9 @@ def _transform(self, X): if np.all(diff <= self._max_values[i]): msg = ('Values %s for feature %d are unknown but ' 'in range. This will raise an error in ' - 'future versions.' % (str(diff), i)) + 'future versions where "error-strict" will ' + 'be default for `handle_unknown` parameter' + % (str(diff), i)) warnings.warn(FutureWarning(msg)) X_mask[:, i] = valid_mask le = self._label_encoders[i] diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 512866a5475f0..f451176ce4eeb 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1491,8 +1491,12 @@ def test_one_hot_encoder_sparse(): assert_raises(ValueError, enc.transform, X_too_large) error_msg = re.escape("Unknown feature(s) [2] in column 1") assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) - assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X) - assert_raises(ValueError, OneHotEncoder(values=2).fit_transform, X) + + error_msg = re.escape("Value(s) [2] out of bounds for feature(s) [0]") + assert_raises_regex(ValueError, error_msg, + OneHotEncoder(n_values=2).fit_transform, X) + assert_raises_regex(ValueError, error_msg, + OneHotEncoder(values=2).fit_transform, X) # test that error is raised when wrong number of features assert_raises(ValueError, enc.transform, X[:, :-1]) @@ -1657,7 +1661,8 @@ def 
test_one_hot_encoder_unknown_transform(): oh = OneHotEncoder(handle_unknown='error') oh.fit(X) msg = ('Values [0] for feature 2 are unknown but in range. ' - 'This will raise an error in future versions.') + 'This will raise an error in future versions where "error-strict"' + ' will be default for `handle_unknown` parameter') assert_warns_message(FutureWarning, msg, oh.transform, np.array([['mouse', 0, 0]], dtype=np.object)) From 1d2ca1aea5fb8d9a5b6689739750c55f98366001 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Wed, 28 Dec 2016 16:38:54 +0530 Subject: [PATCH 20/36] removed extra spaces for flake8 compat --- sklearn/preprocessing/tests/test_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index f451176ce4eeb..7752231e66e70 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1671,7 +1671,7 @@ def test_one_hot_encoder_unknown_transform(): oh.fit(X) assert_array_equal( oh.transform(y).toarray(), - np.array([[0., 0., 0., 0., 0., 1., 0., 0.]])) + np.array([[0., 0., 0., 0., 0., 1., 0., 0.]])) # Raise error if handle_unknown is neither ignore nor error. oh = OneHotEncoder(handle_unknown='42') From 93ae49e1a6aefaf955a2919028cfbf31a930218a Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Tue, 25 Apr 2017 13:08:17 -0500 Subject: [PATCH 21/36] REF Refactor OHE and avoid copies Refactor the OneHotEncoder for easier reading. Avoid mandatory copies of input data in both the `fit` and `transform` steps. Add a test that the input data aren't modified after fitting or transforming. 
--- sklearn/preprocessing/data.py | 252 ++++++++++++----------- sklearn/preprocessing/tests/test_data.py | 35 +++- 2 files changed, 160 insertions(+), 127 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 533ef8e020f92..99c7f16025c48 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -19,7 +19,7 @@ from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var -from ..utils.fixes import bincount +from ..utils.fixes import bincount, sparse_min_max from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2) from ..utils.sparsefuncs import (inplace_column_scale, @@ -1642,7 +1642,6 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, ------- X : array or sparse matrix, shape=(n_samples, n_features_new) """ - if copy: X = X.copy() @@ -1653,7 +1652,6 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, return X n_features = X.shape[1] - ind = np.arange(n_features) sel = np.zeros(n_features, dtype=bool) sel[np.asarray(selected)] = True not_sel = np.logical_not(sel) @@ -1666,10 +1664,10 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, # All features selected. return transform(X) else: - X_sel = transform(X[:, ind[sel]]) - X_not_sel = X[:, ind[not_sel]].astype(dtype) + X_sel = transform(X[:, sel]) if return_val: + X_not_sel = X[:, not_sel].astype(dtype) if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): return sparse.hstack((X_sel, X_not_sel)) else: @@ -1717,7 +1715,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): sparse : boolean, default=True Will return sparse matrix if set True else will return an array. - handle_unknown : str, 'error' or 'ignore' + handle_unknown : str, 'error', 'error-strict', or 'ignore' - 'ignore': Ignore all unknown feature values. 
- 'error': Raise an error when the value of a feature is more than the @@ -1725,9 +1723,6 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): - 'error-strict': Raise an error when the value of a feature is unseen during`fit`. - copy : bool, default=True - If unset, `X` maybe modified in space. - Attributes ---------- feature_index_range_ : array, shape [n_feature, 2] @@ -1748,7 +1743,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): >>> enc = OneHotEncoder() >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) \ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - OneHotEncoder(categorical_features='all', copy=True, + OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') >>> enc.transform([['dog', 4]]).toarray() @@ -1768,20 +1763,18 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 and n_classes-1. """ - def __init__(self, values='auto', categorical_features="all", n_values=None, dtype=np.float64, sparse=True, - handle_unknown='error', copy=True): + handle_unknown='error'): self.values = values self.categorical_features = categorical_features self.dtype = dtype self.sparse = sparse self.handle_unknown = handle_unknown self.n_values = n_values - self.copy = copy def fit(self, X, y=None): - """Fit the CategoricalEncoder to X. + """Fit the OneHotEncoder to X. 
Parameters ---------- @@ -1792,13 +1785,11 @@ def fit(self, X, y=None): ------- self """ - - X = check_array(X, dtype=np.object, accept_sparse='csc', - copy=self.copy) + X = check_array(X, dtype=np.object, accept_sparse='csc', copy=False) n_samples, n_features = X.shape _apply_selected(X, self._fit, dtype=self.dtype, - selected=self.categorical_features, copy=True, + selected=self.categorical_features, copy=False, return_val=False) self.feature_index_range_ = np.zeros((n_features, 2), dtype=np.int) @@ -1812,7 +1803,7 @@ def fit(self, X, y=None): categorical[np.asarray(self.categorical_features)] = True num_cat = np.sum(categorical) - start = 0 + start, end = 0, 0 cat_index = 0 for i in range(n_features): @@ -1828,12 +1819,8 @@ def fit(self, X, y=None): indices += 1 self.feature_index_range_[~categorical, 1] = indices - if len(indices) > 0: - output_cols = indices[-1] - else: - output_cols = start - - self.one_hot_feature_index_ = np.empty(output_cols, dtype=np.int) + n_expanded_cols = end + n_features - num_cat + self.one_hot_feature_index_ = np.empty(n_expanded_cols, dtype=np.int) for i in range(n_features): s, e = self.feature_index_range_[i] @@ -1841,61 +1828,110 @@ def fit(self, X, y=None): return self - def _fit(self, X): - "Assumes `X` contains only catergorical features." + def _check_values(self, values, n_features, max_values): + """Verify that the input `values` is valid + + Raises ValueError or TypeError for bad `values`. 
+ """ + error_msg = ("`values` should be 'auto', an integer, a list of" + " integers or a list of list") + if isinstance(values, six.string_types): + # Input "auto": determine values automatically + if values != 'auto': + raise ValueError(error_msg) + elif (isinstance(values, list) or + isinstance(values, np.ndarray)): + if len(values) != n_features: + raise ValueError("Shape mismatch: if values is a list," + " it has to be of length (n_features).") + + # Either all entries are scalars or none are + scalar_vals = [np.isscalar(val) for val in values] + if not (all(scalar_vals) or not any(scalar_vals)): + raise ValueError(error_msg) + elif not np.isscalar(values): + raise TypeError(error_msg) + + # Validate input data against user-supplied categories + if not np.isscalar(values) and np.isscalar(values[0]): + too_big = np.zeros(n_features, dtype=bool) + for i_col in range(n_features): + if not np.isfinite(max_values[i_col]): + # String features; don't bounds-check + continue + if max_values[i_col] >= values[i_col]: + too_big[i_col] = True + + if too_big.any(): + msg = 'Value(s) %s out of bounds for feature(s) %s' + raise ValueError(msg % (max_values[too_big], + np.where(too_big)[0])) + + def _check_features_greater_than_zero(self, X): + """Raise a ValueError if X has numerical values less than 0""" + if sparse.issparse(X): + min_values, _ = sparse_min_max(X, axis=0) + else: + min_values = np.min(X, axis=0) + lt_zero = np.zeros(X.shape[1], dtype=bool) + for i_value, value in enumerate(min_values): + if isinstance(value, six.string_types): + continue + elif value < 0: + lt_zero[i_value] = True + + if np.any(lt_zero): + raise ValueError('Column(s) %s have numerical values less ' + 'than zero.', np.where(lt_zero)[0]) - if not np.issubdtype(X.dtype.type, np.integer): - X = check_array(X, dtype=np.object) + def _fit(self, X): + """Assumes `X` contains only categorical features""" n_samples, n_features = X.shape self._n_features = n_features self._label_encoders = 
[LabelEncoder() for i in range(n_features)] - # Maximum value for each featue - self._max_values = [None] * n_features + self._set_max_values(X) + self._check_features_greater_than_zero(X) + # Set up and check user-input categories. if self.n_values is not None: warnings.warn('`n_values` has been renamed to `values`.' - 'The parameter `n_values` is deprecated, use the' + 'The parameter `n_values` has been deprecated ' + 'and will be removed in version 0.21, use the' 'parameter `values` instead and specify the ' 'expected values for each feature') - self._values = self.n_values else: self._values = self.values + if (not isinstance(self._values, six.string_types) and + np.isscalar(self._values)): + # Expect all categoricals to be integers with max `values` + self._values = np.ones(n_features, dtype=np.int) * self._values + self._check_values(self._values, n_features, self._max_values) - error_msg = ("`values` should be 'auto', an integer, a list of" - " integers or a list of list") - + # Fit on categorical features in the data for i in range(n_features): le = self._label_encoders[i] - self._max_values[i] = np.max(X[:, i]) - - if isinstance(self._values, numbers.Integral): - self._values = np.ones(n_features, dtype=np.int) * self._values - - if self._values == 'auto': + if np.isscalar(self._values) and self._values == 'auto': le.fit(X[:, i]) - - elif (isinstance(self._values, list) or - isinstance(self._values, np.ndarray)): - if len(self._values) != X.shape[1]: - raise ValueError("Shape mismatch: if values is a list," - " it has to be of length (n_features).") + else: if isinstance(self._values[i], list): le.fit(self._values[i]) elif np.isscalar(self._values[i]): le.fit(np.arange(self._values[i], dtype=np.int)) - X_feature_max = np.max(X, axis=0) - mask = X_feature_max >= self._values - if mask.any(): - msg = 'Value(s) %s out of bounds for feature(s) %s' - raise ValueError(msg % (X_feature_max[mask], - np.where(mask)[0])) - else: - raise ValueError(error_msg) - else: 
- raise TypeError(error_msg) + + def _set_max_values(self, X): + """Inspect input data to determine the maximum value in each column""" + if sparse.issparse(X): + min_values, max_values = sparse_min_max(X, axis=0) + else: + max_values = np.max(X, axis=0) + self._max_values = np.zeros(len(max_values)) + np.nan + for i_value, value in enumerate(max_values): + if isinstance(value, six.string_types): + continue + self._max_values[i_value] = value def transform(self, X, y=None): """Encode the selected categorical features using the one-hot scheme. @@ -1910,57 +1946,47 @@ def transform(self, X, y=None): out : array, shape[n_samples, n_features_new] `X` encoded using the one-hot scheme. """ - X = check_array(X, dtype=np.object) + if self.handle_unknown not in ['ignore', 'error', 'error-strict']: + template = ("handle_unknown should be either 'error', " + "'error-strict', or 'ignore', got %s") + raise ValueError(template % self.handle_unknown) - return _apply_selected(X, self._transform, copy=True, + X = check_array(X, accept_sparse='csc', dtype=np.object, copy=False) + + return _apply_selected(X, self._transform, copy=False, selected=self.categorical_features) def _transform(self, X): - "Assumes `X` contains only categorical features." 
- - X = check_array(X, accept_sparse='csc', dtype=np.object) + """Assumes `X` contains only categorical features.""" n_samples, n_features = X.shape X_int = np.zeros_like(X, dtype=np.int) X_mask = np.ones_like(X, dtype=np.bool) for i in range(n_features): - - valid_mask = in1d(X[:, i], self._label_encoders[i].classes_) - + le = self._label_encoders[i] + valid_mask = in1d(X[:, i], le.classes_) if not np.all(valid_mask): if self.handle_unknown in ['error', 'error-strict']: - le = self._label_encoders[i] diff = setdiff1d(X[:, i], le.classes_) - if self.handle_unknown == 'error-strict': + if (self.handle_unknown == 'error-strict' or + np.isfinite(self._max_values[i]) and + np.any(diff >= self._max_values[i]) or + np.any(diff < 0)): msg = 'Unknown feature(s) %s in column %d' % (diff, i) raise ValueError(msg) else: - if np.all(diff <= self._max_values[i]): - msg = ('Values %s for feature %d are unknown but ' - 'in range. This will raise an error in ' - 'future versions where "error-strict" will ' - 'be default for `handle_unknown` parameter' - % (str(diff), i)) - warnings.warn(FutureWarning(msg)) - X_mask[:, i] = valid_mask - le = self._label_encoders[i] - X[:, i][~valid_mask] = le.classes_[0] - else: - msg = ('Unknown feature(s) %s in column %d' % - (diff, i)) - raise ValueError(msg) - elif self.handle_unknown == 'ignore': - # Set the problematic rows to an acceptable value and - # continue. The rows are marked in `X_mask` and will be - # removed later. - X_mask[:, i] = valid_mask - X[:, i][~valid_mask] = self._label_encoders[i].classes_[0] - else: - template = ("handle_unknown should be either 'error' or " - "'ignore', got %s") - raise ValueError(template % self.handle_unknown) - - X_int[:, i] = self._label_encoders[i].transform(X[:, i]) + msg = ('Values %s for feature %d are unknown but ' + 'in range. 
This will raise an error in ' + 'future versions where "error-strict" will ' + 'be default for `handle_unknown` parameter' + % (str(diff), i)) + warnings.warn(FutureWarning(msg)) + + X_mask[:, i] = valid_mask + X_int[valid_mask, i] = (self._label_encoders[i] + .transform(X[valid_mask, i])) + else: + X_int[:, i] = self._label_encoders[i].transform(X[:, i]) mask = X_mask.ravel() n_values = [le.classes_.shape[0] for le in self._label_encoders] @@ -1970,43 +1996,37 @@ def _transform(self, X): column_indices = (X_int + indices[:-1]).ravel()[mask] row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), n_features)[mask] - data = np.ones(n_samples * n_features)[mask] + data = np.ones(np.sum(mask)) out = sparse.coo_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() - if (isinstance(self.n_values, six.string_types) and - self.n_values == 'auto'): - out = out[:, self.active_features_] - return out if self.sparse else out.toarray() @property def active_features_(self): warnings.warn('The property `active_features_` is deprecated and' - ' will be removed in version 0.20') - if self.n_values is None: - classes = [le.classes_ for le in self._label_encoders] - classes_max = [np.max(cls) + 1 for cls in classes] - cum_idx = np.cumsum([0] + classes_max) - active_idx = [self._label_encoders[i].classes_.astype(np.int) + - cum_idx[i] - for i in range(self._n_features)] - - return np.concatenate(active_idx, axis=0).astype(np.int) - else: - raise AttributeError() + ' will be removed in version 0.21') + n_features_out = sum([len(le.classes_) for le in self._label_encoders]) + return np.arange(n_features_out) @property def feature_indices_(self): warnings.warn('The property `feature_indices_` is deprecated and' - ' will be removed in version 0.20') - classes_max = [np.max(le.classes_) + 1 for le in self._label_encoders] - return np.cumsum([0] + classes_max) + ' will be removed in version 0.21') + n_categories = [len(le.classes_) 
for le in self._label_encoders] + return np.cumsum([0] + n_categories) @property def n_values_(self): warnings.warn('The property `n_values_` is deprecated and' - ' will be removed in version 0.20') - return np.array([le.classes_.shape[0] for le in self._label_encoders]) + ' will be removed in version 0.21') + # The effective number of categories is different depending on + # whether or not we're using the old-style behavior + if self.handle_unknown == 'error': + return np.array([np.max(le.classes_) + 1 + for le in self._label_encoders]) + else: + return np.array([le.classes_.shape[0] + for le in self._label_encoders]) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 7752231e66e70..357951b87a370 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1461,9 +1461,8 @@ def test_one_hot_encoder_sparse(): # discover max values automatically X_trans = enc.fit_transform(X).toarray() assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, - np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) - assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + assert_array_equal(enc.active_features_, np.arange(5)) + assert_array_equal(enc.feature_indices_, [0, 2, 4, 5]) # check outcome assert_array_equal(X_trans, @@ -1471,13 +1470,13 @@ def test_one_hot_encoder_sparse(): [1., 0., 1., 0., 1.]]) # max value given as 3 - enc = OneHotEncoder(n_values=4) + enc = OneHotEncoder(values=4) X_trans = enc.fit_transform(X) assert_equal(X_trans.shape, (2, 4 * 3)) assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) # max value given per feature - enc = OneHotEncoder(n_values=[3, 2, 2]) + enc = OneHotEncoder(values=[3, 2, 2]) X = [[1, 0, 1], [0, 1, 1]] X_trans = enc.fit_transform(X) assert_equal(X_trans.shape, (2, 3 + 2 + 2)) @@ -1492,11 +1491,11 @@ def test_one_hot_encoder_sparse(): error_msg = re.escape("Unknown feature(s) [2] in column 1") assert_raises_regex(ValueError, 
error_msg, enc.transform, X_too_large) - error_msg = re.escape("Value(s) [2] out of bounds for feature(s) [0]") + error_msg = re.escape("Value(s) [ 2.] out of bounds for feature(s) [0]") assert_raises_regex(ValueError, error_msg, - OneHotEncoder(n_values=2).fit_transform, X) + OneHotEncoder(n_values=2).fit, X) assert_raises_regex(ValueError, error_msg, - OneHotEncoder(values=2).fit_transform, X) + OneHotEncoder(values=2).fit, X) # test that error is raised when wrong number of features assert_raises(ValueError, enc.transform, X[:, :-1]) @@ -1507,6 +1506,16 @@ def test_one_hot_encoder_sparse(): assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) +def test_one_hot_encoder_error_on_negative(): + # Negative numerical values in inputs should raise an exception + X_bad = [[-1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]] + X_good = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]] + assert_raises(ValueError, OneHotEncoder().fit, X_bad) + + ohe = OneHotEncoder().fit(X_good) + assert_raises(ValueError, ohe.transform, X_bad) + + def test_one_hot_encoder_attr(): X = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]] @@ -1533,9 +1542,8 @@ def test_one_hot_encoder_dense(): # discover max values automatically X_trans = enc.fit_transform(X) assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, - np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) - assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + assert_array_equal(enc.active_features_, np.arange(5)) + assert_array_equal(enc.feature_indices_, [0, 2, 4, 5]) # check outcome assert_array_equal(X_trans, @@ -1632,12 +1640,14 @@ def test_one_hot_encoder_categorical_features(): def test_one_hot_encoder_unknown_transform(): X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) y = np.array([[4, 1, 1]]) + X_orig = X.copy() # Verify X is not modified # Test that one hot encoder raises error for unknown features # present during transform. 
oh = OneHotEncoder(handle_unknown='error-strict') oh.fit(X) assert_raises(ValueError, oh.transform, y) + assert_array_equal(X, X_orig) # Test the ignore option, ignores unknown features. oh = OneHotEncoder(handle_unknown='ignore') @@ -1645,10 +1655,12 @@ def test_one_hot_encoder_unknown_transform(): assert_array_equal( oh.transform(y).toarray(), np.array([[0., 0., 0., 0., 1., 0., 0.]])) + assert_array_equal(X, X_orig) X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]], dtype=np.object) y = np.array([['ET', 1, 1]], dtype=np.object) + X_orig = X.copy() # Verify X is not modified # Test that one hot encoder raises error for unknown features # present during transform. @@ -1672,6 +1684,7 @@ def test_one_hot_encoder_unknown_transform(): assert_array_equal( oh.transform(y).toarray(), np.array([[0., 0., 0., 0., 0., 1., 0., 0.]])) + assert_array_equal(X, X_orig) # Raise error if handle_unknown is neither ignore nor error. oh = OneHotEncoder(handle_unknown='42') From fd11366f3e4e238e7188488c9b376c445b165c45 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 26 Apr 2017 14:16:03 -0500 Subject: [PATCH 22/36] WIP --- sklearn/preprocessing/data.py | 136 +++++++++++++---------- sklearn/preprocessing/tests/test_data.py | 10 +- 2 files changed, 86 insertions(+), 60 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 99c7f16025c48..1785e85fd6214 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1692,7 +1692,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - values : 'auto', int, list of ints, or list of lists of objects + values : 'auto', 'auto-strict', int, List[int], or List[List[objects]] - 'auto' : determine set of values from training data. See the documentation of `handle_unknown` for which values are considered acceptable. 
@@ -1785,43 +1785,39 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, dtype=np.object, accept_sparse='csc', copy=False) + X = check_array(X, dtype=None, accept_sparse='csc', copy=False) n_samples, n_features = X.shape _apply_selected(X, self._fit, dtype=self.dtype, selected=self.categorical_features, copy=False, return_val=False) + # Record which columns of output data + # correspond to each column of input data self.feature_index_range_ = np.zeros((n_features, 2), dtype=np.int) if isinstance(self.categorical_features, six.string_types) and \ self.categorical_features == "all": - categorical = np.ones(n_features, dtype=bool) else: categorical = np.zeros(n_features, dtype=bool) categorical[np.asarray(self.categorical_features)] = True - num_cat = np.sum(categorical) start, end = 0, 0 - cat_index = 0 - - for i in range(n_features): - if categorical[i]: - le = self._label_encoders[cat_index] - end = start + len(le.classes_) - self.feature_index_range_[i] = start, end - start = end - cat_index += 1 - - indices = np.arange(start, start + n_features - num_cat) - self.feature_index_range_[~categorical, 0] = indices - indices += 1 - self.feature_index_range_[~categorical, 1] = indices + for i_cat, i_feat in enumerate(np.where(categorical)[0]): + le = self._label_encoders[i_cat] + end = start + len(le.classes_) + self.feature_index_range_[i_feat] = start, end + start = end + num_cat = np.sum(categorical) + non_cat_indices = np.arange(start, start + n_features - num_cat) + self.feature_index_range_[~categorical, 0] = non_cat_indices + self.feature_index_range_[~categorical, 1] = non_cat_indices + 1 + # Record which column of input data corresponds + # to each column of output data n_expanded_cols = end + n_features - num_cat self.one_hot_feature_index_ = np.empty(n_expanded_cols, dtype=np.int) - for i in range(n_features): s, e = self.feature_index_range_[i] self.one_hot_feature_index_[s:e] = i @@ -1832,12 +1828,14 @@ def _check_values(self, values, 
n_features, max_values): """Verify that the input `values` is valid Raises ValueError or TypeError for bad `values`. + Assume that lists of integers have been converted + to lists of arrays before getting here. """ error_msg = ("`values` should be 'auto', an integer, a list of" " integers or a list of list") if isinstance(values, six.string_types): # Input "auto": determine values automatically - if values != 'auto': + if values not in ['auto', 'auto-strict']: raise ValueError(error_msg) elif (isinstance(values, list) or isinstance(values, np.ndarray)): @@ -1845,30 +1843,34 @@ def _check_values(self, values, n_features, max_values): raise ValueError("Shape mismatch: if values is a list," " it has to be of length (n_features).") - # Either all entries are scalars or none are + # All entries are arrays or lists scalar_vals = [np.isscalar(val) for val in values] - if not (all(scalar_vals) or not any(scalar_vals)): + if any(scalar_vals): raise ValueError(error_msg) elif not np.isscalar(values): raise TypeError(error_msg) - + """ # Validate input data against user-supplied categories - if not np.isscalar(values) and np.isscalar(values[0]): + if not np.isscalar(values): too_big = np.zeros(n_features, dtype=bool) for i_col in range(n_features): if not np.isfinite(max_values[i_col]): # String features; don't bounds-check continue - if max_values[i_col] >= values[i_col]: + if max_values[i_col] > max(values[i_col]): too_big[i_col] = True if too_big.any(): msg = 'Value(s) %s out of bounds for feature(s) %s' raise ValueError(msg % (max_values[too_big], np.where(too_big)[0])) - + """ def _check_features_greater_than_zero(self, X): """Raise a ValueError if X has numerical values less than 0""" + if X.dtype.kind == 'U': + # Don't check string arrays + return + if sparse.issparse(X): min_values, _ = sparse_min_max(X, axis=0) else: @@ -1884,15 +1886,7 @@ def _check_features_greater_than_zero(self, X): raise ValueError('Column(s) %s have numerical values less ' 'than zero.', 
np.where(lt_zero)[0]) - def _fit(self, X): - """Assumes `X` contains only categorical features""" - n_samples, n_features = X.shape - - self._n_features = n_features - self._label_encoders = [LabelEncoder() for i in range(n_features)] - self._set_max_values(X) - self._check_features_greater_than_zero(X) - + def _initialize_values(self): # Set up and check user-input categories. if self.n_values is not None: warnings.warn('`n_values` has been renamed to `values`.' @@ -1900,26 +1894,60 @@ def _fit(self, X): 'and will be removed in version 0.21, use the' 'parameter `values` instead and specify the ' 'expected values for each feature') - self._values = self.n_values + values = self.n_values else: - self._values = self.values - if (not isinstance(self._values, six.string_types) and - np.isscalar(self._values)): - # Expect all categoricals to be integers with max `values` - self._values = np.ones(n_features, dtype=np.int) * self._values + values = self.values + + # Convert `int` and `Sequence[int]` inputs to `List[Array[int]]` + if (not isinstance(values, six.string_types) and + np.isscalar(values)): + values = np.ones(self._n_features, dtype=int) * values + if (not isinstance(values, six.string_types) and + np.isscalar(values[0])): + values = [np.arange(v, dtype=np.int) for v in values] + + return values + + def _fit(self, X): + """Assumes `X` contains only categorical features""" + n_samples, n_features = X.shape + + self._n_features = n_features + self._label_encoders = [LabelEncoder() for i in range(n_features)] + self._set_max_values(X) + self._check_features_greater_than_zero(X) + self._values = self._initialize_values() self._check_values(self._values, n_features, self._max_values) + _auto_int_classes = n_features * [None] + # Fit on categorical features in the data for i in range(n_features): le = self._label_encoders[i] if np.isscalar(self._values) and self._values == 'auto': - le.fit(X[:, i]) + if (not isinstance(X[0, i], six.string_types) and + int(X[0, i]) == 
X[0, i]): + _auto_int_classes[i] = np.unique(X[:, i]) + n_classes = np.max(_auto_int_classes[i]) + 1 + le.fit(np.arange(n_classes)) + else: + le.fit(X[:, i]) else: - if isinstance(self._values[i], list): - le.fit(self._values[i]) - elif np.isscalar(self._values[i]): - le.fit(np.arange(self._values[i], dtype=np.int)) + le.fit(self._values[i]) + + if np.isscalar(self._values) and self._values == 'auto': + active_features = [] + for i_col, int_classes in enumerate(_auto_int_classes): + if int_classes is None: + n_classes = len(self._label_encoders[i_col].classes_) + active_features.append(np.ones(n_classes, dtype=bool)) + else: + n_classes = max(self._label_encoders[i_col].classes_) + 1 + this_col_mask = np.zeros(n_classes, dtype=bool) + this_col_mask[int_classes] = True + active_features.append(this_col_mask) + self.active_features_ = np.where(np.hstack(active_features))[0] def _set_max_values(self, X): """Inspect input data to determine the maximum value in each column""" @@ -1951,10 +1979,10 @@ def transform(self, X, y=None): "'error-strict', or 'ignore', got %s") raise ValueError(template % self.handle_unknown) - X = check_array(X, accept_sparse='csc', dtype=np.object, copy=False) + X = check_array(X, accept_sparse='csc', dtype=None, copy=False) - return _apply_selected(X, self._transform, copy=False, - selected=self.categorical_features) + return _apply_selected(X, self._transform, dtype=self.dtype, + selected=self.categorical_features, copy=False) def _transform(self, X): """Assumes `X` contains only categorical features.""" @@ -2002,14 +2030,10 @@ def _transform(self, X): shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() - return out if self.sparse else out.toarray() + if np.isscalar(self._values) and self._values == 'auto': + out = out[:, self.active_features_] - @property - def active_features_(self): - warnings.warn('The property `active_features_` is deprecated and' - ' will be removed in version 0.21') - n_features_out = sum([len(le.classes_) for 
le in self._label_encoders]) - return np.arange(n_features_out) + return out if self.sparse else out.toarray() @property def feature_indices_(self): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 357951b87a370..90f0d1670441c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1461,8 +1461,9 @@ def test_one_hot_encoder_sparse(): # discover max values automatically X_trans = enc.fit_transform(X).toarray() assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, np.arange(5)) - assert_array_equal(enc.feature_indices_, [0, 2, 4, 5]) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) # check outcome assert_array_equal(X_trans, @@ -1542,8 +1543,9 @@ def test_one_hot_encoder_dense(): # discover max values automatically X_trans = enc.fit_transform(X) assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, np.arange(5)) - assert_array_equal(enc.feature_indices_, [0, 2, 4, 5]) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) # check outcome assert_array_equal(X_trans, From b96a8d2d0a7cf5a25a7f1fc5e106024797fbb894 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 26 Apr 2017 16:07:32 -0500 Subject: [PATCH 23/36] Remove error-strict, add auto-strict Also restore `n_values_` and `active_features_` attributes. 
--- sklearn/preprocessing/data.py | 222 ++++++++++------------- sklearn/preprocessing/tests/test_data.py | 44 ++--- 2 files changed, 120 insertions(+), 146 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 1785e85fd6214..f8b251ea3499f 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1693,9 +1693,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- values : 'auto', 'auto-strict', int, List[int], or List[List[objects]] - - 'auto' : determine set of values from training data. See the - documentation of `handle_unknown` for which values are considered - acceptable. + - 'auto' : Determine set of values from training data. + If values are integers, then allowed values will be between + 0 and the maximum value in the data. + - 'auto-strict' : Determine set of values from the training data. + Only values in the original training data are valid. - int : values are in ``range(values)`` for all features - list of ints : values for feature ``i`` are in ``range(values[i])`` - list of lists : values for feature ``i`` are in ``values[i]`` @@ -1715,13 +1717,10 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): sparse : boolean, default=True Will return sparse matrix if set True else will return an array. - handle_unknown : str, 'error', 'error-strict', or 'ignore' - + handle_unknown : str, 'error' or 'ignore' - 'ignore': Ignore all unknown feature values. - - 'error': Raise an error when the value of a feature is more than the - maximum value seen during fit. - - 'error-strict': Raise an error when the value of a feature is unseen - during`fit`. + - 'error': Raise an error when the value of a feature was not + in the original fit data (or given through ``values``). 
Attributes ---------- @@ -1733,6 +1732,14 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): ``one_hot_feature_index_[i]`` specifies which feature of the input is encoded by column `i` in the one-hot encoded array. + active_features_ : array + Indices for active features, meaning values that actually occur + in the training set. Only available when n_values is ``'auto'``. + + n_values_ : array of shape (n_features,) + Number of categories per feature. Has value `0` for + non-categorical features. + Examples -------- Given a dataset with three features and four samples, we let the encoder @@ -1741,11 +1748,18 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import OneHotEncoder >>> enc = OneHotEncoder() - >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]]) \ + >>> enc.fit(np.array([['cat', 4], ['mouse', 15], ['dog', 17]], dtype='O'))\ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') + >>> enc.n_values_ + array([3, 18]) + >>> enc.feature_index_range_ + array([[ 0, 3], + [ 3, 6]]) + >>> enc.one_hot_feature_index_ + array([0, 0, 0, 1, 1, 1]) >>> enc.transform([['dog', 4]]).toarray() array([[ 0., 1., 0., 1., 0., 0.]]) @@ -1787,10 +1801,10 @@ def fit(self, X, y=None): """ X = check_array(X, dtype=None, accept_sparse='csc', copy=False) n_samples, n_features = X.shape + self.n_features_ = n_features - _apply_selected(X, self._fit, dtype=self.dtype, - selected=self.categorical_features, copy=False, - return_val=False) + _apply_selected(X, self._fit, dtype=self.dtype, return_val=False, + selected=self.categorical_features, copy=False) # Record which columns of output data # correspond to each column of input data @@ -1805,8 +1819,10 @@ def fit(self, X, y=None): start, end = 0, 0 for i_cat, i_feat in enumerate(np.where(categorical)[0]): - le = self._label_encoders[i_cat] - end = start + 
len(le.classes_) + if np.isscalar(self._values) and self._values == 'auto': + end = start + self.n_active_features_[i_cat] + else: + end = start + len(self._label_encoders[i_cat].classes_) self.feature_index_range_[i_feat] = start, end start = end num_cat = np.sum(categorical) @@ -1822,17 +1838,24 @@ def fit(self, X, y=None): s, e = self.feature_index_range_[i] self.one_hot_feature_index_[s:e] = i + # Count categories per feature + n_val = len(non_cat_indices) * [0] + if hasattr(self, '_label_encoders'): + n_val = [len(le.classes_) for le in self._label_encoders] + n_val + self.n_values_ = np.array(n_val) + return self - def _check_values(self, values, n_features, max_values): + def _check_values(self, values, n_features): """Verify that the input `values` is valid Raises ValueError or TypeError for bad `values`. - Assume that lists of integers have been converted - to lists of arrays before getting here. + Assume that integers or lists of integers have been + converted to lists of arrays before getting here. + This should run after `_initialize_values`. 
""" - error_msg = ("`values` should be 'auto', an integer, a list of" - " integers or a list of list") + error_msg = ("`values` should be 'auto', 'auto-strict', an integer, " + "a list of integers or a list of list") if isinstance(values, six.string_types): # Input "auto": determine values automatically if values not in ['auto', 'auto-strict']: @@ -1847,47 +1870,11 @@ def _check_values(self, values, n_features, max_values): scalar_vals = [np.isscalar(val) for val in values] if any(scalar_vals): raise ValueError(error_msg) - elif not np.isscalar(values): - raise TypeError(error_msg) - """ - # Validate input data against user-supplied categories - if not np.isscalar(values): - too_big = np.zeros(n_features, dtype=bool) - for i_col in range(n_features): - if not np.isfinite(max_values[i_col]): - # String features; don't bounds-check - continue - if max_values[i_col] > max(values[i_col]): - too_big[i_col] = True - - if too_big.any(): - msg = 'Value(s) %s out of bounds for feature(s) %s' - raise ValueError(msg % (max_values[too_big], - np.where(too_big)[0])) - """ - def _check_features_greater_than_zero(self, X): - """Raise a ValueError if X has numerical values less than 0""" - if X.dtype.kind == 'U': - # Don't check string arrays - return - - if sparse.issparse(X): - min_values, _ = sparse_min_max(X, axis=0) else: - min_values = np.min(X, axis=0) - lt_zero = np.zeros(X.shape[1], dtype=bool) - for i_value, value in enumerate(min_values): - if isinstance(value, six.string_types): - continue - elif value < 0: - lt_zero[i_value] = True - - if np.any(lt_zero): - raise ValueError('Column(s) %s have numerical values less ' - 'than zero.', np.where(lt_zero)[0]) + raise TypeError(error_msg) def _initialize_values(self): - # Set up and check user-input categories. + """Standardize the `values` input""" if self.n_values is not None: warnings.warn('`n_values` has been renamed to `values`.' 
'The parameter `n_values` has been deprecated ' @@ -1901,7 +1888,7 @@ def _initialize_values(self): # Convert `int` and `Sequence[int]` inputs to `List[Array[int]]` if (not isinstance(values, six.string_types) and np.isscalar(values)): - values = np.ones(self._n_features, dtype=int) * values + values = np.ones(self.n_features_cat_, dtype=int) * values if (not isinstance(values, six.string_types) and np.isscalar(values[0])): values = [np.arange(v, dtype=np.int) for v in values] @@ -1911,32 +1898,41 @@ def _initialize_values(self): def _fit(self, X): """Assumes `X` contains only categorical features""" n_samples, n_features = X.shape - - self._n_features = n_features + self.n_features_cat_ = n_features self._label_encoders = [LabelEncoder() for i in range(n_features)] - self._set_max_values(X) - self._check_features_greater_than_zero(X) - self._values = self._initialize_values() - self._check_values(self._values, n_features, self._max_values) - _auto_int_classes = n_features * [None] + self._values = self._initialize_values() + self._check_values(self._values, n_features) # Fit on categorical features in the data + _auto_int_classes = n_features * [None] for i in range(n_features): le = self._label_encoders[i] if np.isscalar(self._values) and self._values == 'auto': + # For integer features, allow integers between + # 0 and column max. The transform will still only + # return dummy columns for integers present in training data. 
if (not isinstance(X[0, i], six.string_types) and int(X[0, i]) == X[0, i]): - _auto_int_classes[i] = np.unique(X[:, i]) + _auto_int_classes[i] = np.unique(X[:, i]).astype(int) + if np.min(_auto_int_classes[i]) < 0: + msg = ('Column %s has value(s) less than zero; all ' + 'integer columns must have minimum value ' + '0 when value="auto".') + raise ValueError(msg) n_classes = np.max(_auto_int_classes[i]) + 1 le.fit(np.arange(n_classes)) else: le.fit(X[:, i]) + elif np.isscalar(self._values) and self._values == 'auto-strict': + le.fit(X[:, i]) else: le.fit(self._values[i]) if np.isscalar(self._values) and self._values == 'auto': + # Record which integer features were present in training + # data so we can restrict output columns. active_features = [] for i_col, int_classes in enumerate(_auto_int_classes): if int_classes is None: @@ -1947,20 +1943,10 @@ def _fit(self, X): this_col_mask = np.zeros(n_classes, dtype=bool) this_col_mask[int_classes] = True active_features.append(this_col_mask) + self.n_active_features_ = np.array([a.sum() + for a in active_features]) self.active_features_ = np.where(np.hstack(active_features))[0] - def _set_max_values(self, X): - """Inspect input data to determine the maximum value in each column""" - if sparse.issparse(X): - min_values, max_values = sparse_min_max(X, axis=0) - else: - max_values = np.max(X, axis=0) - self._max_values = np.zeros(len(max_values)) + np.nan - for i_value, value in enumerate(max_values): - if isinstance(value, six.string_types): - continue - self._max_values[i_value] = value - def transform(self, X, y=None): """Encode the selected categorical features using the one-hot scheme. @@ -1974,12 +1960,15 @@ def transform(self, X, y=None): out : array, shape[n_samples, n_features_new] `X` encoded using the one-hot scheme. 
""" - if self.handle_unknown not in ['ignore', 'error', 'error-strict']: - template = ("handle_unknown should be either 'error', " - "'error-strict', or 'ignore', got %s") + if self.handle_unknown not in ['ignore', 'error']: + template = ("handle_unknown should be either 'error' " + "or 'ignore', got %s") raise ValueError(template % self.handle_unknown) X = check_array(X, accept_sparse='csc', dtype=None, copy=False) + if X.shape[1] != self.n_features_: + raise ValueError("Input data must have %s " + "features." % self.n_features_) return _apply_selected(X, self._transform, dtype=self.dtype, selected=self.categorical_features, copy=False) @@ -1987,44 +1976,37 @@ def transform(self, X, y=None): def _transform(self, X): """Assumes `X` contains only categorical features.""" n_samples, n_features = X.shape - X_int = np.zeros_like(X, dtype=np.int) - X_mask = np.ones_like(X, dtype=np.bool) + X_int = np.zeros_like(X, dtype=np.int32) - for i in range(n_features): - le = self._label_encoders[i] - valid_mask = in1d(X[:, i], le.classes_) - if not np.all(valid_mask): - if self.handle_unknown in ['error', 'error-strict']: + # Recode all columns of input data as integers + if self.handle_unknown == 'error': + for i, le in enumerate(self._label_encoders): + try: + X_int[:, i] = le.transform(X[:, i]) + except ValueError: diff = setdiff1d(X[:, i], le.classes_) - if (self.handle_unknown == 'error-strict' or - np.isfinite(self._max_values[i]) and - np.any(diff >= self._max_values[i]) or - np.any(diff < 0)): - msg = 'Unknown feature(s) %s in column %d' % (diff, i) - raise ValueError(msg) - else: - msg = ('Values %s for feature %d are unknown but ' - 'in range. 
This will raise an error in ' - 'future versions where "error-strict" will ' - 'be default for `handle_unknown` parameter' - % (str(diff), i)) - warnings.warn(FutureWarning(msg)) - - X_mask[:, i] = valid_mask - X_int[valid_mask, i] = (self._label_encoders[i] - .transform(X[valid_mask, i])) - else: - X_int[:, i] = self._label_encoders[i].transform(X[:, i]) + msg = 'Unknown feature(s) %s in column %d' % (diff, i) + raise ValueError(msg) + mask = slice(None) + else: + X_mask = np.ones_like(X, dtype=np.bool) + for i, le in enumerate(self._label_encoders): + valid_mask = in1d(X[:, i], le.classes_) + if not np.all(valid_mask): + X_mask[:, i] = valid_mask + X_int[valid_mask, i] = le.transform(X[valid_mask, i]) + else: + X_int[:, i] = le.transform(X[:, i]) + mask = X_mask.ravel() - mask = X_mask.ravel() - n_values = [le.classes_.shape[0] for le in self._label_encoders] - n_values = np.hstack([[0], n_values]) + # Convert integer columns to sparse array of binary indicators + n_values = [0] + [le.classes_.shape[0] for le in self._label_encoders] indices = np.cumsum(n_values) column_indices = (X_int + indices[:-1]).ravel()[mask] row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), n_features)[mask] - data = np.ones(np.sum(mask)) + data = np.ones(len(row_indices), dtype=self.dtype) out = sparse.coo_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), @@ -2037,20 +2019,10 @@ def _transform(self, X): @property def feature_indices_(self): + # This is very similar to the current attribute + # `feature_index_range_`, but only applies to the + # subset of categorical features. 
warnings.warn('The property `feature_indices_` is deprecated and' ' will be removed in version 0.21') n_categories = [len(le.classes_) for le in self._label_encoders] return np.cumsum([0] + n_categories) - - @property - def n_values_(self): - warnings.warn('The property `n_values_` is deprecated and' - ' will be removed in version 0.21') - # The effective number of categories is different depending on - # whether or not we're using the old-style behavior - if self.handle_unknown == 'error': - return np.array([np.max(le.classes_) + 1 - for le in self._label_encoders]) - else: - return np.array([le.classes_.shape[0] - for le in self._label_encoders]) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 90f0d1670441c..fb43e519eaaf3 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1492,11 +1492,11 @@ def test_one_hot_encoder_sparse(): error_msg = re.escape("Unknown feature(s) [2] in column 1") assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) - error_msg = re.escape("Value(s) [ 2.] 
out of bounds for feature(s) [0]") + error_msg = re.escape("Unknown feature(s) [2] in column 0") assert_raises_regex(ValueError, error_msg, - OneHotEncoder(n_values=2).fit, X) + OneHotEncoder(n_values=2).fit_transform, X) assert_raises_regex(ValueError, error_msg, - OneHotEncoder(values=2).fit, X) + OneHotEncoder(values=2).fit_transform, X) # test that error is raised when wrong number of features assert_raises(ValueError, enc.transform, X[:, :-1]) @@ -1509,8 +1509,8 @@ def test_one_hot_encoder_sparse(): def test_one_hot_encoder_error_on_negative(): # Negative numerical values in inputs should raise an exception - X_bad = [[-1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]] - X_good = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]] + X_bad = np.array([[-1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object) + X_good = np.array([[1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object) assert_raises(ValueError, OneHotEncoder().fit, X_bad) ohe = OneHotEncoder().fit(X_good) @@ -1518,7 +1518,7 @@ def test_one_hot_encoder_error_on_negative(): def test_one_hot_encoder_attr(): - X = [[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]] + X = np.array([[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]], dtype='O') enc = OneHotEncoder() enc.fit(X) @@ -1639,14 +1639,14 @@ def test_one_hot_encoder_categorical_features(): _check_one_hot(X, X2, cat, 5) -def test_one_hot_encoder_unknown_transform(): +def test_one_hot_encoder_unknown_transform_int(): X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) - y = np.array([[4, 1, 1]]) + y = np.array([[0, 3, 1]]) X_orig = X.copy() # Verify X is not modified # Test that one hot encoder raises error for unknown features # present during transform. 
- oh = OneHotEncoder(handle_unknown='error-strict') + oh = OneHotEncoder(handle_unknown='error') oh.fit(X) assert_raises(ValueError, oh.transform, y) assert_array_equal(X, X_orig) @@ -1656,9 +1656,21 @@ def test_one_hot_encoder_unknown_transform(): oh.fit(X) assert_array_equal( oh.transform(y).toarray(), - np.array([[0., 0., 0., 0., 1., 0., 0.]])) + np.array([[1., 0., 0., 0., 1., 0., 0.]])) assert_array_equal(X, X_orig) + # Test that there's no error for integer features in the auto range + y = [[0, 1, 1]] + assert_array_equal(oh.transform(y).toarray(), + np.array([[1., 0., 0., 0., 1., 0., 0.]])) + + # But we do error when fit with "auto-strict" + oh = OneHotEncoder(values='auto-strict', handle_unknown='error') + oh.fit(X) + assert_raises(ValueError, oh.transform, y) + + +def test_one_hot_encoder_unknown_transform_object(): X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]], dtype=np.object) y = np.array([['ET', 1, 1]], dtype=np.object) @@ -1666,19 +1678,9 @@ def test_one_hot_encoder_unknown_transform(): # Test that one hot encoder raises error for unknown features # present during transform. - oh = OneHotEncoder(handle_unknown='error-strict') - oh.fit(X) - assert_raises(ValueError, oh.transform, y) - - # Test that one hot encoder raises warning for unknown but in range - # features oh = OneHotEncoder(handle_unknown='error') oh.fit(X) - msg = ('Values [0] for feature 2 are unknown but in range. ' - 'This will raise an error in future versions where "error-strict"' - ' will be default for `handle_unknown` parameter') - assert_warns_message(FutureWarning, msg, oh.transform, - np.array([['mouse', 0, 0]], dtype=np.object)) + assert_raises(ValueError, oh.transform, y) # Test the ignore option, ignores unknown features. 
oh = OneHotEncoder(handle_unknown='ignore') From 7902352bcd75613fb15a050d92117ef8ac5eff1e Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 26 Apr 2017 16:30:04 -0500 Subject: [PATCH 24/36] Fixes for test failures --- sklearn/preprocessing/data.py | 8 ++++---- sklearn/utils/fixes.py | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index f8b251ea3499f..c562539d4c9c4 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -19,7 +19,7 @@ from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var -from ..utils.fixes import bincount, sparse_min_max +from ..utils.fixes import bincount from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2) from ..utils.sparsefuncs import (inplace_column_scale, @@ -1754,10 +1754,10 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') >>> enc.n_values_ - array([3, 18]) + array([ 3, 18]) >>> enc.feature_index_range_ - array([[ 0, 3], - [ 3, 6]]) + array([[0, 3], + [3, 6]]) >>> enc.one_hot_feature_index_ array([0, 0, 0, 1, 1, 1]) >>> enc.transform([['dog', 4]]).toarray() diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index fe37e9469c720..d44555503eaa2 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -240,7 +240,8 @@ def in1d(ar1, ar2, assume_unique=False, invert=False): ar1 = np.asarray(ar1).ravel() ar2 = np.asarray(ar2).ravel() - if ar1.dtype == object or ar2.dtype == object: + if (ar1.dtype == object or ar2.dtype == object or + ar1.dtype.kind == 'U' or ar2.dtype.kind == 'U'): return _in1d_object(ar1, ar2, invert) # This code is significantly faster when the condition is satisfied. 
From 4206d797128f39902639f528e23ac2fadb991a2b Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 26 Apr 2017 17:54:21 -0500 Subject: [PATCH 25/36] ENH Handle object and string types in LabelEncoder.transform Numpy v1.6 doesn't handle `setdiff1d` for string types; use a backported version in `utils.fixes`. --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f2f7d9afad347..3957cb4a63e56 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -18,7 +18,7 @@ from ..utils.fixes import np_version from ..utils.fixes import sparse_min_max from ..utils.fixes import astype -from ..utils.fixes import in1d +from ..utils.fixes import in1d, setdiff1d from ..utils import column_or_1d from ..utils.validation import check_array from ..utils.validation import check_is_fitted @@ -149,7 +149,7 @@ def transform(self, y): classes = np.unique(y) _check_numpy_unicode_bug(classes) if len(np.intersect1d(classes, self.classes_)) < len(classes): - diff = np.setdiff1d(classes, self.classes_) + diff = setdiff1d(classes, self.classes_) raise ValueError("y contains new labels: %s" % str(diff)) return np.searchsorted(self.classes_, y) From d96fbc63da3c106c75c72d831dead43eaa6b74d7 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 26 Apr 2017 17:56:06 -0500 Subject: [PATCH 26/36] Fix tests --- doc/modules/preprocessing.rst | 2 +- sklearn/preprocessing/data.py | 12 +++++++----- sklearn/preprocessing/tests/test_data.py | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index f1bfba00dde01..d9d6209f123d6 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -400,7 +400,7 @@ Continuing the example above:: >>> enc.fit([['female', 'from US', 'uses Chrome'], ... ['male', 'from Asia', 'uses Firefox']]) \ ... 
# doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - OneHotEncoder(categorical_features='all', copy=True, + OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray() diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index c562539d4c9c4..cf6f3b214a01a 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -7,7 +7,6 @@ # License: BSD 3 clause from itertools import chain, combinations -import numbers import warnings from itertools import combinations_with_replacement as combinations_w_r @@ -27,7 +26,7 @@ min_max_axis) from ..utils.validation import check_is_fitted, FLOAT_DTYPES from .label import LabelEncoder -from ..utils.fixes import in1d, setdiff1d +from ..utils.fixes import in1d zip = six.moves.zip @@ -1983,9 +1982,12 @@ def _transform(self, X): for i, le in enumerate(self._label_encoders): try: X_int[:, i] = le.transform(X[:, i]) - except ValueError: - diff = setdiff1d(X[:, i], le.classes_) - msg = 'Unknown feature(s) %s in column %d' % (diff, i) + except ValueError as err: + orig_msg = str(err) + if not orig_msg.startswith('y contains'): + raise + else: + msg = 'Column %d %s' % (i, orig_msg[2:]) raise ValueError(msg) mask = slice(None) else: diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index fb43e519eaaf3..34077f2694388 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1489,10 +1489,10 @@ def test_one_hot_encoder_sparse(): # test that an error is raised when out of bounds: X_too_large = [[0, 2, 1], [0, 1, 1]] assert_raises(ValueError, enc.transform, X_too_large) - error_msg = re.escape("Unknown feature(s) [2] in column 1") + error_msg = re.escape("Column 1 contains new labels: [2]") assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) - error_msg = 
re.escape("Unknown feature(s) [2] in column 0") + error_msg = re.escape("Column 0 contains new labels: [2]") assert_raises_regex(ValueError, error_msg, OneHotEncoder(n_values=2).fit_transform, X) assert_raises_regex(ValueError, error_msg, From 0807604f59b4a769c3ce3dfff4a26a5475497370 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 26 Apr 2017 19:31:59 -0500 Subject: [PATCH 27/36] Fix for doc test and scipy 0.11 sparse behavior --- doc/modules/preprocessing.rst | 2 +- sklearn/preprocessing/data.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index d9d6209f123d6..cf5c312eb5e06 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -426,7 +426,7 @@ features, one has to explicitly set ``n_values``. For example, >>> enc.fit([['female', 'from US', 'uses Chrome'], ... ['male', 'from Asia', 'uses Internet Explorer']]) \ ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - OneHotEncoder(categorical_features='all', copy=True, + OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values=[...]) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index cf6f3b214a01a..1696d6846b419 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1663,10 +1663,11 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, # All features selected. 
return transform(X) else: - X_sel = transform(X[:, sel]) + ind = np.arange(n_features) + X_sel = transform(X[:, ind[sel]]) if return_val: - X_not_sel = X[:, not_sel].astype(dtype) + X_not_sel = X[:, ind[not_sel]].astype(dtype) if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): return sparse.hstack((X_sel, X_not_sel)) else: From b6d198ad291e6574527fd50b8e036435be76778e Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 26 Apr 2017 19:51:10 -0500 Subject: [PATCH 28/36] ENH Enforce dtypes in _apply_selected --- sklearn/preprocessing/data.py | 11 +++++++---- sklearn/preprocessing/tests/test_data.py | 9 +++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 1696d6846b419..ac705501f3180 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1645,10 +1645,11 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, X = X.copy() if isinstance(selected, six.string_types) and selected == "all": - return transform(X) + X_trans = transform(X) + return X_trans.astype(dtype) if return_val else None if len(selected) == 0: - return X + return X.astype(dtype) if return_val else None n_features = X.shape[1] sel = np.zeros(n_features, dtype=bool) @@ -1658,15 +1659,17 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, if n_selected == 0: # No features selected. - return X + return X.astype(dtype) if return_val else None elif n_selected == n_features: # All features selected. 
- return transform(X) + X_trans = transform(X) + return X_trans.astype(dtype) if return_val else None else: ind = np.arange(n_features) X_sel = transform(X[:, ind[sel]]) if return_val: + X_sel = X_sel.astype(dtype) X_not_sel = X[:, ind[not_sel]].astype(dtype) if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): return sparse.hstack((X_sel, X_not_sel)) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 34077f2694388..770b5ff2e4af8 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1553,10 +1553,11 @@ def test_one_hot_encoder_dense(): [1., 0., 1., 0., 1.]])) -def _check_apply_selected(X, X_expected, sel): +def _check_apply_selected(X, X_expected, sel, dtype=np.float): for M in (X, sparse.csr_matrix(X)): - Xtr = _apply_selected(M, Binarizer().transform, sel) + Xtr = _apply_selected(M, Binarizer().transform, sel, dtype=dtype) assert_array_equal(toarray(Xtr), X_expected) + assert_equal(toarray(Xtr).dtype, dtype) def test_transform_selected(): @@ -1565,14 +1566,18 @@ def test_transform_selected(): X_expected = [[1, 2, 1], [0, 1, 1]] _check_apply_selected(X, X_expected, [0]) _check_apply_selected(X, X_expected, [True, False, False]) + _check_apply_selected(X, X_expected, [True, False, False], dtype=np.int) X_expected = [[1, 1, 1], [0, 1, 1]] _check_apply_selected(X, X_expected, [0, 1, 2]) + _check_apply_selected(X, X_expected, [0, 1, 2], dtype=np.int) _check_apply_selected(X, X_expected, [True, True, True]) _check_apply_selected(X, X_expected, "all") + _check_apply_selected(X, X_expected, "all", dtype=np.int) _check_apply_selected(X, X, []) _check_apply_selected(X, X, [False, False, False]) + _check_apply_selected(X, X, [False, False, False], dtype=np.int) def test_transform_selected_copy_arg(): From 7db5cedb62f055835876692dd3481cbe22990af4 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 26 Apr 2017 20:30:03 -0500 Subject: [PATCH 29/36] TST More tests for 
OneHotEncoder --- sklearn/preprocessing/tests/test_data.py | 81 ++++++++++++++++-------- 1 file changed, 56 insertions(+), 25 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 770b5ff2e4af8..3d361d0d8af91 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1524,16 +1524,19 @@ def test_one_hot_encoder_attr(): enc.fit(X) assert_array_equal(enc.feature_index_range_, [[0, 3], [3, 5], [5, 7]]) assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 1, 1, 2, 2]) + assert_array_equal(enc.n_values_, [11, 16, 2]) - enc = OneHotEncoder(categorical_features=[True, False, True]) - enc.fit(X) - assert_array_equal(enc.feature_index_range_, [[0, 3], [5, 6], [3, 5]]) - assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 2, 2, 1]) + oh = OneHotEncoder('auto-strict', categorical_features=[True, False, True]) + oh.fit(X) + assert_array_equal(oh.feature_index_range_, [[0, 3], [5, 6], [3, 5]]) + assert_array_equal(oh.one_hot_feature_index_, [0, 0, 0, 2, 2, 1]) + assert_array_equal(oh.n_values_, [3, 2, 0]) enc = OneHotEncoder(categorical_features=[False, False, True]) enc.fit(X) assert_array_equal(enc.feature_index_range_, [[2, 3], [3, 4], [0, 2]]) assert_array_equal(enc.one_hot_feature_index_, [2, 2, 0, 1]) + assert_array_equal(enc.n_values_, [2, 0, 0]) def test_one_hot_encoder_dense(): @@ -1604,7 +1607,7 @@ def _run_one_hot(X, X2, cat): return Xtr, X2tr -def _check_one_hot(X, X2, cat, n_features): +def _check_one_hot(X, X2, cat, n_features, X_exp, X2_exp): ind = np.where(cat)[0] # With mask A, B = _run_one_hot(X, X2, cat) @@ -1619,6 +1622,9 @@ def _check_one_hot(X, X2, cat, n_features): assert_array_equal(toarray(A), toarray(C)) assert_array_equal(toarray(B), toarray(D)) + assert_array_equal(toarray(A), X_exp) + assert_array_equal(toarray(B), X2_exp) + def test_one_hot_encoder_string(): X = [['cat', 'domestic'], ['wolf', 'wild']] @@ -1633,15 +1639,30 @@ def 
test_one_hot_encoder_categorical_features(): X2 = np.array([[1, 1, 1]]) cat = [True, False, False] - _check_one_hot(X, X2, cat, 4) + X_exp = [[0, 1, 2, 1], [1, 0, 1, 1]] + X2_exp = [[0, 0, 1, 1]] + _check_one_hot(X, X2, cat, 4, X_exp, X2_exp) # Edge case: all non-categorical cat = [False, False, False] - _check_one_hot(X, X2, cat, 3) + _check_one_hot(X, X2, cat, 3, X, X2) # Edge case: all categorical + X_exp = [[0, 1, 0, 1, 1], [1, 0, 1, 0, 1]] + X2_exp = [[0, 0, 1, 0, 1]] cat = [True, True, True] - _check_one_hot(X, X2, cat, 5) + _check_one_hot(X, X2, cat, 5, X_exp, X2_exp) + + +def test_one_hot_encoder_dtypes(): + # Verify that we can control the output dtype of the transform + X = np.array([['cat', 2.1, 1], ['dog', 1, 3], ['mouse', 1, 2]], dtype='O') + + for dtype in [np.int8, np.float, np.bool]: + for sparse in [True, False]: + oh = OneHotEncoder('auto-strict', dtype=dtype, sparse=sparse) + X_tr = oh.fit_transform(X) + assert_equal(X_tr.dtype, dtype) def test_one_hot_encoder_unknown_transform_int(): @@ -1656,29 +1677,26 @@ def test_one_hot_encoder_unknown_transform_int(): assert_raises(ValueError, oh.transform, y) assert_array_equal(X, X_orig) - # Test the ignore option, ignores unknown features. - oh = OneHotEncoder(handle_unknown='ignore') - oh.fit(X) - assert_array_equal( - oh.transform(y).toarray(), - np.array([[1., 0., 0., 0., 1., 0., 0.]])) - assert_array_equal(X, X_orig) - # Test that there's no error for integer features in the auto range y = [[0, 1, 1]] - assert_array_equal(oh.transform(y).toarray(), - np.array([[1., 0., 0., 0., 1., 0., 0.]])) + assert_array_equal(toarray(oh.transform(y)), [[1, 0, 0, 0, 1, 0, 0]]) # But we do error when fit with "auto-strict" oh = OneHotEncoder(values='auto-strict', handle_unknown='error') oh.fit(X) assert_raises(ValueError, oh.transform, y) + # Test the ignore option, ignores unknown features. 
+ oh = OneHotEncoder(handle_unknown='ignore') + oh.fit(X) + assert_array_equal(toarray(oh.transform(y)), [[1, 0, 0, 0, 1, 0, 0]]) + assert_array_equal(X, X_orig) + def test_one_hot_encoder_unknown_transform_object(): - X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]], + X = np.array([['cat', 2.1, 1], ['dog', 1.1, 3], ['mouse', 1.1, 2]], dtype=np.object) - y = np.array([['ET', 1, 1]], dtype=np.object) + y = np.array([['ET', 2.1, 1]], dtype=np.object) X_orig = X.copy() # Verify X is not modified # Test that one hot encoder raises error for unknown features @@ -1686,18 +1704,31 @@ def test_one_hot_encoder_unknown_transform_object(): oh = OneHotEncoder(handle_unknown='error') oh.fit(X) assert_raises(ValueError, oh.transform, y) + assert_array_equal(X, X_orig) # Test the ignore option, ignores unknown features. oh = OneHotEncoder(handle_unknown='ignore') oh.fit(X) - assert_array_equal( - oh.transform(y).toarray(), - np.array([[0., 0., 0., 0., 0., 1., 0., 0.]])) + assert_array_equal(oh.transform(y).toarray(), [[0, 0, 0, 0, 1, 1, 0, 0]]) assert_array_equal(X, X_orig) # Raise error if handle_unknown is neither ignore nor error. - oh = OneHotEncoder(handle_unknown='42') - oh.fit(X) + oh = OneHotEncoder(handle_unknown='42').fit(X) + assert_raises(ValueError, oh.transform, y) + assert_array_equal(X, X_orig) + + # Check that in-range integer features are okay in object arrays + y = np.array([['cat', 2.1, 0]], dtype=np.object) + oh = OneHotEncoder(handle_unknown='error').fit(X) + assert_array_equal(oh.transform(y).toarray(), [[1, 0, 0, 0, 1, 0, 0, 0]]) + + # "in-range" but not in-training-data float features will error + y = np.array([['cat', 1.8, 1]], dtype=np.object) + oh = OneHotEncoder(handle_unknown='error').fit(X) + assert_raises(ValueError, oh.transform, y) + + # A transform on in-range integers errors in 'auto-strict' mode. 
+ oh = OneHotEncoder(values='auto-strict', handle_unknown='error').fit(X) assert_raises(ValueError, oh.transform, y) From ac9e455c88763f66835e0967fe19df76d484e2b3 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 26 Apr 2017 20:54:21 -0500 Subject: [PATCH 30/36] DOC Add What's new and polish docstring for OHE --- doc/whats_new.rst | 12 ++++++++- sklearn/preprocessing/data.py | 33 +++++++++++++----------- sklearn/preprocessing/tests/test_data.py | 4 +-- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 9a092310f4924..d86f5fa0cc7ed 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -171,6 +171,16 @@ Enhancements removed by setting it to `None`. :issue:`7674` by:user:`Yichuan Liu `. + - :class:`preprocessing.OneHotEncoder` now fits and transforms inputs of + any numerical or string type instead of only integer arrays. + It has addtional fitted attributes ``feature_index_range_`` and + ``one_hot_feature_index_``. The ``feature_indices_`` has been deprecated. + The ``n_values`` parameter is deprecated in favor of ``values``. + In addition to previous allowed values, ``values`` accepts "auto-strict" + to fit to only observed categories as well as lists of lists of categories. + :issue:`7327` and :issue:`8793` by :user:`Vighnesh Birodkar ` + and :user:`Stephen Hoover `. + Bug fixes ......... - Fixed a bug where :class:`sklearn.ensemble.IsolationForest` uses an @@ -5070,4 +5080,4 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Anish Shah: https://github.com/AnishShah .. _Neeraj Gangwar: http://neerajgangwar.in -.. _Arthur Mensch: https://amensch.fr \ No newline at end of file +.. 
_Arthur Mensch: https://amensch.fr diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index ac705501f3180..1d37521d4e786 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1629,6 +1629,8 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, Dense array or sparse matrix. transform : callable A callable transform(X) -> X_transformed + dtype : dtype + Cast outputs to this data type copy : boolean, optional Copy X even if it could be avoided. selected: "all" or array of indices or mask @@ -1678,7 +1680,7 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True, class OneHotEncoder(BaseEstimator, TransformerMixin): - """Encode categorical integer features using a one-hot aka one-of-K scheme. + """Encode categorical features using a one-hot aka one-of-K scheme. The input to this transformer should be a matrix of integers or strings, denoting the values taken on by categorical (discrete) features. The @@ -1696,7 +1698,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- values : 'auto', 'auto-strict', int, List[int], or List[List[objects]] - - 'auto' : Determine set of values from training data. + - 'auto' (default) : Determine set of values from training data. If values are integers, then allowed values will be between 0 and the maximum value in the data. - 'auto-strict' : Determine set of values from the training data. @@ -1714,16 +1716,15 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Non-categorical features are always stacked to the right of the matrix. - dtype : number type, default=np.float + dtype : number type, default=np.float64 Desired dtype of output. sparse : boolean, default=True Will return sparse matrix if set True else will return an array. handle_unknown : str, 'error' or 'ignore' - - 'ignore': Ignore all unknown feature values. 
- - 'error': Raise an error when the value of a feature was not - in the original fit data (or given through ``values``). + Whether to raise an error or ignore if an unknown categorical + feature is present during transform. Attributes ---------- @@ -1745,8 +1746,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Examples -------- - Given a dataset with three features and four samples, we let the encoder - find the maximum value per feature and transform the data to a binary + Given a dataset with two features and three samples, we let the encoder + find the categories in each feature and transform the data to a binary one-hot encoding. >>> from sklearn.preprocessing import OneHotEncoder @@ -1863,21 +1864,22 @@ def _check_values(self, values, n_features): # Input "auto": determine values automatically if values not in ['auto', 'auto-strict']: raise ValueError(error_msg) - elif (isinstance(values, list) or - isinstance(values, np.ndarray)): + elif isinstance(values, list) or isinstance(values, np.ndarray): if len(values) != n_features: raise ValueError("Shape mismatch: if values is a list," " it has to be of length (n_features).") - # All entries are arrays or lists - scalar_vals = [np.isscalar(val) for val in values] - if any(scalar_vals): + # All entries must be either arrays or lists here + if any([np.isscalar(val) for val in values]): raise ValueError(error_msg) else: raise TypeError(error_msg) def _initialize_values(self): - """Standardize the `values` input""" + """Standardize the `values` input + + Output is either a string or a list of arrays. + """ if self.n_values is not None: warnings.warn('`n_values` has been renamed to `values`.' 'The parameter `n_values` has been deprecated ' @@ -1961,7 +1963,8 @@ def transform(self, X, y=None): Returns ------- out : array, shape[n_samples, n_features_new] - `X` encoded using the one-hot scheme. + `X` encoded using the one-hot scheme. Will be a CSR sparse + array if `self.sparse` is True. 
""" if self.handle_unknown not in ['ignore', 'error']: template = ("handle_unknown should be either 'error' " diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 3d361d0d8af91..f857fb943898e 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1659,8 +1659,8 @@ def test_one_hot_encoder_dtypes(): X = np.array([['cat', 2.1, 1], ['dog', 1, 3], ['mouse', 1, 2]], dtype='O') for dtype in [np.int8, np.float, np.bool]: - for sparse in [True, False]: - oh = OneHotEncoder('auto-strict', dtype=dtype, sparse=sparse) + for sp in [True, False]: + oh = OneHotEncoder('auto-strict', dtype=dtype, sparse=sp) X_tr = oh.fit_transform(X) assert_equal(X_tr.dtype, dtype) From 25250197650faa3e3ac2294eed4fdf7841f0adb7 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 3 May 2017 19:42:53 -0500 Subject: [PATCH 31/36] Deprecate active_features_ --- sklearn/preprocessing/data.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 1d37521d4e786..56a1ff3ada579 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1736,10 +1736,6 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): ``one_hot_feature_index_[i]`` specifies which feature of the input is encoded by column `i` in the one-hot encoded array. - active_features_ : array - Indices for active features, meaning values that actually occur - in the training set. Only available when n_values is ``'auto'``. - n_values_ : array of shape (n_features,) Number of categories per feature. Has value `0` for non-categorical features. 
@@ -1824,7 +1820,7 @@ def fit(self, X, y=None): start, end = 0, 0 for i_cat, i_feat in enumerate(np.where(categorical)[0]): if np.isscalar(self._values) and self._values == 'auto': - end = start + self.n_active_features_[i_cat] + end = start + self._n_active_features_[i_cat] else: end = start + len(self._label_encoders[i_cat].classes_) self.feature_index_range_[i_feat] = start, end @@ -1948,9 +1944,9 @@ def _fit(self, X): this_col_mask = np.zeros(n_classes, dtype=bool) this_col_mask[int_classes] = True active_features.append(this_col_mask) - self.n_active_features_ = np.array([a.sum() - for a in active_features]) - self.active_features_ = np.where(np.hstack(active_features))[0] + self._n_active_features_ = np.array([a.sum() + for a in active_features]) + self._active_features_ = np.where(np.hstack(active_features))[0] def transform(self, X, y=None): """Encode the selected categorical features using the one-hot scheme. @@ -2022,10 +2018,19 @@ def _transform(self, X): dtype=self.dtype).tocsr() if np.isscalar(self._values) and self._values == 'auto': - out = out[:, self.active_features_] + out = out[:, self._active_features_] return out if self.sparse else out.toarray() + @property + def active_features_(self): + warnings.warn('The property `active_features_` is deprecated and' + ' will be removed in version 0.21') + if not hasattr(self, '_active_features_'): + raise AttributeError("'OneHotEncoder' object has no attribute " + "'active_features_'.") + return self._active_features_ + @property def feature_indices_(self): # This is very similar to the current attribute From 7a53fe843ca776a23796279aafac768fc56e0191 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 3 May 2017 19:56:56 -0500 Subject: [PATCH 32/36] Switch from auto-strict to error-strict --- sklearn/preprocessing/data.py | 50 +++++++++++++----------- sklearn/preprocessing/tests/test_data.py | 18 +++++---- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/sklearn/preprocessing/data.py 
b/sklearn/preprocessing/data.py index 56a1ff3ada579..3a40fdb5c4dd5 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1697,12 +1697,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - values : 'auto', 'auto-strict', int, List[int], or List[List[objects]] + values : 'auto', int, List[int], or List[List[objects]] - 'auto' (default) : Determine set of values from training data. - If values are integers, then allowed values will be between - 0 and the maximum value in the data. - - 'auto-strict' : Determine set of values from the training data. - Only values in the original training data are valid. - int : values are in ``range(values)`` for all features - list of ints : values for feature ``i`` are in ``range(values[i])`` - list of lists : values for feature ``i`` are in ``values[i]`` @@ -1722,9 +1718,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): sparse : boolean, default=True Will return sparse matrix if set True else will return an array. - handle_unknown : str, 'error' or 'ignore' - Whether to raise an error or ignore if an unknown categorical - feature is present during transform. + handle_unknown : {'error', 'error-strict', 'ignore'} + - 'ignore': Ignore all unknown feature values. + - 'error': Raise an error when the value of an integer feature is more + than the maximum value seen during fit or less than zero, or when + the value of a non-integer feature was unseen during ``fit``. + - 'error-strict': Raise an error when the value of a feature is unseen + during ``fit``. 
Attributes ---------- @@ -1799,6 +1799,15 @@ def fit(self, X, y=None): ------- self """ + if self.handle_unknown not in ['ignore', 'error', 'error-strict']: + template = ("handle_unknown should be either 'error', " + "'error-strict', or 'ignore', got %s") + raise ValueError(template % self.handle_unknown) + elif self.handle_unknown == 'error': + warnings.warn('The behavior of handle_unknown="error" is ' + 'deprecated and will be changed to be the same ' + 'as "error-strict" in version 0.21') + X = check_array(X, dtype=None, accept_sparse='csc', copy=False) n_samples, n_features = X.shape self.n_features_ = n_features @@ -1819,7 +1828,7 @@ def fit(self, X, y=None): start, end = 0, 0 for i_cat, i_feat in enumerate(np.where(categorical)[0]): - if np.isscalar(self._values) and self._values == 'auto': + if np.isscalar(self._values) and self.handle_unknown == 'error': end = start + self._n_active_features_[i_cat] else: end = start + len(self._label_encoders[i_cat].classes_) @@ -1854,11 +1863,10 @@ def _check_values(self, values, n_features): converted to lists of arrays before getting here. This should run after `_initialize_values`. """ - error_msg = ("`values` should be 'auto', 'auto-strict', an integer, " + error_msg = ("`values` should be 'auto', an integer, " "a list of integers or a list of list") if isinstance(values, six.string_types): - # Input "auto": determine values automatically - if values not in ['auto', 'auto-strict']: + if values != 'auto': raise ValueError(error_msg) elif isinstance(values, list) or isinstance(values, np.ndarray): if len(values) != n_features: @@ -1910,7 +1918,7 @@ def _fit(self, X): for i in range(n_features): le = self._label_encoders[i] - if np.isscalar(self._values) and self._values == 'auto': + if np.isscalar(self._values) and self.handle_unknown == 'error': # For integer features, allow integers between # 0 and column max. The transform will still only # return dummy columns for integers present in training data. 
@@ -1920,18 +1928,19 @@ def _fit(self, X): if np.min(_auto_int_classes[i]) < 0: msg = ('Column %s has value(s) less than zero; all ' 'integer columns must have minimum value ' - '0 when value="auto".') + '0 when value="auto" and ' + 'handle_unknown="error".') raise ValueError(msg) n_classes = np.max(_auto_int_classes[i]) + 1 le.fit(np.arange(n_classes)) else: le.fit(X[:, i]) - elif np.isscalar(self._values) and self._values == 'auto-strict': + elif np.isscalar(self._values): le.fit(X[:, i]) else: le.fit(self._values[i]) - if np.isscalar(self._values) and self._values == 'auto': + if np.isscalar(self._values) and self.handle_unknown == 'error': # Record which integer features were present in training # data so we can restrict output columns. active_features = [] @@ -1962,11 +1971,6 @@ def transform(self, X, y=None): `X` encoded using the one-hot scheme. Will be a CSR sparse array if `self.sparse` is True. """ - if self.handle_unknown not in ['ignore', 'error']: - template = ("handle_unknown should be either 'error' " - "or 'ignore', got %s") - raise ValueError(template % self.handle_unknown) - X = check_array(X, accept_sparse='csc', dtype=None, copy=False) if X.shape[1] != self.n_features_: raise ValueError("Input data must have %s " @@ -1981,7 +1985,7 @@ def _transform(self, X): X_int = np.zeros_like(X, dtype=np.int32) # Recode all columns of input data as integers - if self.handle_unknown == 'error': + if self.handle_unknown in ['error', 'error-strict']: for i, le in enumerate(self._label_encoders): try: X_int[:, i] = le.transform(X[:, i]) @@ -2017,7 +2021,7 @@ def _transform(self, X): shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() - if np.isscalar(self._values) and self._values == 'auto': + if np.isscalar(self._values) and self.handle_unknown == 'error': out = out[:, self._active_features_] return out if self.sparse else out.toarray() diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 
f857fb943898e..628db2d4f0d08 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1526,7 +1526,8 @@ def test_one_hot_encoder_attr(): assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 1, 1, 2, 2]) assert_array_equal(enc.n_values_, [11, 16, 2]) - oh = OneHotEncoder('auto-strict', categorical_features=[True, False, True]) + oh = OneHotEncoder('auto', handle_unknown='error-strict', + categorical_features=[True, False, True]) oh.fit(X) assert_array_equal(oh.feature_index_range_, [[0, 3], [5, 6], [3, 5]]) assert_array_equal(oh.one_hot_feature_index_, [0, 0, 0, 2, 2, 1]) @@ -1660,7 +1661,8 @@ def test_one_hot_encoder_dtypes(): for dtype in [np.int8, np.float, np.bool]: for sp in [True, False]: - oh = OneHotEncoder('auto-strict', dtype=dtype, sparse=sp) + oh = OneHotEncoder('auto', handle_unknown='error-strict', + dtype=dtype, sparse=sp) X_tr = oh.fit_transform(X) assert_equal(X_tr.dtype, dtype) @@ -1681,8 +1683,8 @@ def test_one_hot_encoder_unknown_transform_int(): y = [[0, 1, 1]] assert_array_equal(toarray(oh.transform(y)), [[1, 0, 0, 0, 1, 0, 0]]) - # But we do error when fit with "auto-strict" - oh = OneHotEncoder(values='auto-strict', handle_unknown='error') + # But we do error when set to "error-strict" + oh = OneHotEncoder(values='auto', handle_unknown='error-strict') oh.fit(X) assert_raises(ValueError, oh.transform, y) @@ -1713,8 +1715,8 @@ def test_one_hot_encoder_unknown_transform_object(): assert_array_equal(X, X_orig) # Raise error if handle_unknown is neither ignore nor error. 
- oh = OneHotEncoder(handle_unknown='42').fit(X) - assert_raises(ValueError, oh.transform, y) + oh = OneHotEncoder(handle_unknown='42') + assert_raises(ValueError, oh.fit, X) assert_array_equal(X, X_orig) # Check that in-range integer features are okay in object arrays @@ -1727,8 +1729,8 @@ def test_one_hot_encoder_unknown_transform_object(): oh = OneHotEncoder(handle_unknown='error').fit(X) assert_raises(ValueError, oh.transform, y) - # A transform on in-range integers errors in 'auto-strict' mode. - oh = OneHotEncoder(values='auto-strict', handle_unknown='error').fit(X) + # A transform on in-range integers errors in 'error-strict' mode. + oh = OneHotEncoder(values='auto', handle_unknown='error-strict').fit(X) assert_raises(ValueError, oh.transform, y) From 05af448450ed3a494493d9b67a0d3e27d8b98de5 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 3 May 2017 20:04:48 -0500 Subject: [PATCH 33/36] Deprecate integer and list of integer inputs to `values` --- sklearn/preprocessing/data.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 3a40fdb5c4dd5..c120be65bb73f 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1697,10 +1697,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - values : 'auto', int, List[int], or List[List[objects]] + values : 'auto' or List[List[objects]] - 'auto' (default) : Determine set of values from training data. - - int : values are in ``range(values)`` for all features - - list of ints : values for feature ``i`` are in ``range(values[i])`` - list of lists : values for feature ``i`` are in ``values[i]`` categorical_features : "all" or array of indices or mask @@ -1887,9 +1885,9 @@ def _initialize_values(self): if self.n_values is not None: warnings.warn('`n_values` has been renamed to `values`.' 
'The parameter `n_values` has been deprecated ' - 'and will be removed in version 0.21, use the' + 'and will be removed in version 0.21; use the ' 'parameter `values` instead and specify the ' - 'expected values for each feature') + 'expected values for each feature.') values = self.n_values else: values = self.values @@ -1897,9 +1895,15 @@ def _initialize_values(self): # Convert `int` and `Sequence[int]` inputs to `List[Array[int]]` if (not isinstance(values, six.string_types) and np.isscalar(values)): + warnings.warn('Integer input to `values` is deprecated and' + ' will be removed in version 0.21. Specify a ' + 'list of allowed values for each feature instead.') values = np.ones(self.n_features_cat_, dtype=int) * values if (not isinstance(values, six.string_types) and np.isscalar(values[0])): + warnings.warn('List of integer input to `values` is deprecated and' + ' will be removed in version 0.21. Specify a ' + 'list of allowed values for each feature instead.') values = [np.arange(v, dtype=np.int) for v in values] return values From d9d77aed69a4c58b640f4aa500bc189101c2275a Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 3 May 2017 21:58:25 -0500 Subject: [PATCH 34/36] Address CR --- doc/modules/preprocessing.rst | 24 ++++----- doc/whats_new.rst | 23 ++++++--- sklearn/preprocessing/data.py | 62 +++++++++++++++--------- sklearn/preprocessing/tests/test_data.py | 58 ++++++++++++++++++---- 4 files changed, 116 insertions(+), 51 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index cf5c312eb5e06..bf76d499b464b 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -378,10 +378,10 @@ Encoding categorical features Often features are not given as continuous values but categorical. For example a person could have features ``["male", "female"]``, ``["from Europe", "from US", "from Asia"]``, -``["uses Firefox", "uses Chrome", "uses Safari", "uses Internet Explorer"]``. 
+``["Firefox", "Chrome", "Safari", "Internet Explorer"]``. Such features can be efficiently coded as integers, for instance -``["male", "from US", "uses Internet Explorer"]`` could be expressed as -``[0, 1, 3]`` while ``["female", "from Asia", "uses Chrome"]`` would be +``["male", "from US", "Internet Explorer"]`` could be expressed as +``[0, 1, 3]`` while ``["female", "from Asia", "Chrome"]`` would be ``[1, 2, 1]``. Such integer representation can not be used directly with scikit-learn estimators, as these @@ -397,13 +397,13 @@ only one active. Continuing the example above:: >>> enc = preprocessing.OneHotEncoder() - >>> enc.fit([['female', 'from US', 'uses Chrome'], - ... ['male', 'from Asia', 'uses Firefox']]) \ + >>> enc.fit([['female', 'from US', 'Chrome'], + ... ['male', 'from Asia', 'Firefox']]) \ ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') - >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray() + >>> enc.transform([['female', 'from Asia', 'Firefox']]).toarray() array([[ 1., 0., 1., 0., 0., 1.]]) By default, how many values each feature can take is inferred automatically from the dataset. @@ -417,21 +417,21 @@ values are the continents and the last values are web browsers. Note that, if there is a possibilty that the training data might have missing categorical features, one has to explicitly set ``n_values``. For example, - >>> browsers = ['uses Internet Explorer', 'uses Chrome' , 'uses Safari', 'uses Firefox'] + >>> browsers = ['Internet Explorer', 'Chrome' , 'Safari', 'Firefox'] >>> genders = ['male', 'female'] >>> locations = ['from Europe', 'from Asia', 'from US'] >>> enc = preprocessing.OneHotEncoder(values=[genders, locations, browsers]) - >>> # Note that for there are missing categorical values for the 2nd and 3rd - >>> # feature - >>> enc.fit([['female', 'from US', 'uses Chrome'], - ... 
['male', 'from Asia', 'uses Internet Explorer']]) \ + >>> # Note that there are missing categorical values for the + >>> # 2nd and 3rd feature + >>> enc.fit([['female', 'from US', 'Chrome'], + ... ['male', 'from Asia', 'Internet Explorer']]) \ ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values=[...]) - >>> enc.transform([['male', 'from Europe', 'uses Safari']]).toarray() + >>> enc.transform([['male', 'from Europe', 'Safari']]).toarray() array([[ 0., 1., 0., 1., 0., 0., 0., 0., 1.]]) See :ref:`dict_feature_extraction` for categorical features that are represented diff --git a/doc/whats_new.rst b/doc/whats_new.rst index d86f5fa0cc7ed..bd3a5def36675 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -173,13 +173,13 @@ Enhancements - :class:`preprocessing.OneHotEncoder` now fits and transforms inputs of any numerical or string type instead of only integer arrays. - It has addtional fitted attributes ``feature_index_range_`` and - ``one_hot_feature_index_``. The ``feature_indices_`` has been deprecated. - The ``n_values`` parameter is deprecated in favor of ``values``. - In addition to previous allowed values, ``values`` accepts "auto-strict" - to fit to only observed categories as well as lists of lists of categories. - :issue:`7327` and :issue:`8793` by :user:`Vighnesh Birodkar ` - and :user:`Stephen Hoover `. + It has additional fitted attributes ``feature_index_range_``, + ``one_hot_feature_index_``, and ``categories_``. + In addition to previous allowed values, ``handle_unknown`` accepts "error-strict" + to raise an error if any unknown values are seen during transformation. + :issue:`7327` and :issue:`8793` by + :user:`Vighnesh Birodkar ` and + :user:`Stephen Hoover `. Bug fixes ......... @@ -339,6 +339,15 @@ API changes summary the weighted impurity decrease from splitting is no longer alteast ``min_impurity_decrease``.
:issue:`8449` by `Raghav RV_` + - In :class:`preprocessing.OneHotEncoder`, deprecate the + ``feature_indices_`` and ``active_features_`` attributes. + Deprecate integer and list of integer inputs to ``values`` + in favor of lists of lists of categories. + The present behavior of ``handle_unknown="error"`` will + change to be the same as ``handle_unknown="error-strict"`` in v0.21. + :issue:`7327` and :issue:`8793` by + :user:`Vighnesh Birodkar ` and + :user:`Stephen Hoover `. .. _changes_0_18_1: diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index c120be65bb73f..d02b9015408f7 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1698,7 +1698,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Parameters ---------- values : 'auto' or List[List[objects]] - - 'auto' (default) : Determine set of values from training data. + - 'auto' (default) : Encoded values are those found in training data. - list of lists : values for feature ``i`` are in ``values[i]`` categorical_features : "all" or array of indices or mask @@ -1726,16 +1726,20 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Attributes ---------- - feature_index_range_ : array, shape [n_feature, 2] + feature_index_range_ : array, shape (n_feature, 2) ``feature_index_range_[i]`` specifies the range of column indices occupied by the input feature `i` in the one-hot encoded array. - one_hot_feature_index_ : array, shape [n_features_new] + one_hot_feature_index_ : array, shape (n_features_new,) ``one_hot_feature_index_[i]`` specifies which feature of the input - is encoded by column `i` in the one-hot encoded array. + is encoded by column ``i`` in the one-hot encoded array. + + categories_ : array, shape (n_features_new,) + np.object array containing the category encoded in each feature + of the output (or None for non-categorical features) n_values_ : array of shape (n_features,) - Number of categories per feature. 
Has value `0` for + Number of encoded categories per feature. Has value `0` for non-categorical features. Examples @@ -1752,12 +1756,14 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') >>> enc.n_values_ - array([ 3, 18]) + array([ 3, 3]) >>> enc.feature_index_range_ array([[0, 3], [3, 6]]) >>> enc.one_hot_feature_index_ array([0, 0, 0, 1, 1, 1]) + >>> enc.categories_ + array(['cat', 'dog', 'mouse', 4, 15, 17], dtype=object) >>> enc.transform([['dog', 4]]).toarray() array([[ 0., 1., 0., 1., 0., 0.]]) @@ -1804,7 +1810,7 @@ def fit(self, X, y=None): elif self.handle_unknown == 'error': warnings.warn('The behavior of handle_unknown="error" is ' 'deprecated and will be changed to be the same ' - 'as "error-strict" in version 0.21') + 'as "error-strict" in version 0.21', FutureWarning) X = check_array(X, dtype=None, accept_sparse='csc', copy=False) n_samples, n_features = X.shape @@ -1832,24 +1838,29 @@ def fit(self, X, y=None): end = start + len(self._label_encoders[i_cat].classes_) self.feature_index_range_[i_feat] = start, end start = end - num_cat = np.sum(categorical) - non_cat_indices = np.arange(start, start + n_features - num_cat) + num_cat_cols = np.sum(categorical) + non_cat_indices = np.arange(start, start + n_features - num_cat_cols) self.feature_index_range_[~categorical, 0] = non_cat_indices self.feature_index_range_[~categorical, 1] = non_cat_indices + 1 # Record which column of input data corresponds # to each column of output data - n_expanded_cols = end + n_features - num_cat - self.one_hot_feature_index_ = np.empty(n_expanded_cols, dtype=np.int) - for i in range(n_features): - s, e = self.feature_index_range_[i] - self.one_hot_feature_index_[s:e] = i + n_cats = np.diff(self.feature_index_range_, axis=1).ravel() + inp_order = np.argsort(self.feature_index_range_[:, 0]) + self.one_hot_feature_index_ = np.repeat(inp_order, n_cats[inp_order]) # Count 
categories per feature - n_val = len(non_cat_indices) * [0] - if hasattr(self, '_label_encoders'): - n_val = [len(le.classes_) for le in self._label_encoders] + n_val - self.n_values_ = np.array(n_val) + self.n_values_ = n_cats.copy() + self.n_values_[~categorical] = 0 + + # Store categories for each output feature + if num_cat_cols == 0: + cats = [] + else: + cats = np.concatenate([le.classes_ for le in self._label_encoders]) + if hasattr(self, '_active_features_'): + cats = cats[self._active_features_] + self.categories_ = np.hstack([cats, len(non_cat_indices) * [None]]) return self @@ -1887,7 +1898,7 @@ def _initialize_values(self): 'The parameter `n_values` has been deprecated ' 'and will be removed in version 0.21; use the ' 'parameter `values` instead and specify the ' - 'expected values for each feature.') + 'expected values for each feature.', FutureWarning) values = self.n_values else: values = self.values @@ -1897,13 +1908,15 @@ def _initialize_values(self): np.isscalar(values)): warnings.warn('Integer input to `values` is deprecated and' ' will be removed in version 0.21. Specify a ' - 'list of allowed values for each feature instead.') + 'list of allowed values for each feature instead.', + FutureWarning) values = np.ones(self.n_features_cat_, dtype=int) * values if (not isinstance(values, six.string_types) and np.isscalar(values[0])): warnings.warn('List of integer input to `values` is deprecated and' ' will be removed in version 0.21. 
Specify a ' - 'list of allowed values for each feature instead.') + 'list of allowed values for each feature instead.', + FutureWarning) values = [np.arange(v, dtype=np.int) for v in values] return values @@ -2033,7 +2046,7 @@ def _transform(self, X): @property def active_features_(self): warnings.warn('The property `active_features_` is deprecated and' - ' will be removed in version 0.21') + ' will be removed in version 0.21', FutureWarning) if not hasattr(self, '_active_features_'): raise AttributeError("'OneHotEncoder' object has no attribute " "'active_features_'.") @@ -2045,6 +2058,9 @@ def feature_indices_(self): # `feature_index_range_`, but only applies to the # subset of categorical features. warnings.warn('The property `feature_indices_` is deprecated and' - ' will be removed in version 0.21') + ' will be removed in version 0.21', FutureWarning) + if not hasattr(self, '_label_encoders'): + raise AttributeError("'OneHotEncoder' object has no attribute " + "'feature_indices_'.") n_categories = [len(le.classes_) for le in self._label_encoders] return np.cumsum([0] + n_categories) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 628db2d4f0d08..d526b842c961d 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -26,6 +26,7 @@ from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false +from sklearn.utils.testing import assert_warns from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import assert_no_warnings from sklearn.utils.testing import assert_allclose @@ -1507,7 +1508,7 @@ def test_one_hot_encoder_sparse(): assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) -def test_one_hot_encoder_error_on_negative(): +def test_one_hot_encoder_with_negative_integers(): # Negative numerical values in inputs should raise an exception X_bad = 
np.array([[-1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object) X_good = np.array([[1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object) @@ -1516,6 +1517,9 @@ def test_one_hot_encoder_error_on_negative(): ohe = OneHotEncoder().fit(X_good) assert_raises(ValueError, ohe.transform, X_bad) + # Negative values are okay with "error-strict" + OneHotEncoder(handle_unknown='error-strict').fit_transform(X_bad) + def test_one_hot_encoder_attr(): X = np.array([[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]], dtype='O') @@ -1524,20 +1528,56 @@ def test_one_hot_encoder_attr(): enc.fit(X) assert_array_equal(enc.feature_index_range_, [[0, 3], [3, 5], [5, 7]]) assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 1, 1, 2, 2]) - assert_array_equal(enc.n_values_, [11, 16, 2]) + assert_array_equal(enc.n_values_, [3, 2, 2]) + assert_array_equal(enc.categories_, + np.array([1, 5, 10, 7, 15, 'cat', 'mouse'], dtype='O')) - oh = OneHotEncoder('auto', handle_unknown='error-strict', - categorical_features=[True, False, True]) - oh.fit(X) - assert_array_equal(oh.feature_index_range_, [[0, 3], [5, 6], [3, 5]]) - assert_array_equal(oh.one_hot_feature_index_, [0, 0, 0, 2, 2, 1]) - assert_array_equal(oh.n_values_, [3, 2, 0]) + enc = OneHotEncoder('auto', handle_unknown='error-strict', + categorical_features=[True, False, True]) + enc.fit(X) + assert_array_equal(enc.feature_index_range_, [[0, 3], [5, 6], [3, 5]]) + assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 2, 2, 1]) + assert_array_equal(enc.n_values_, [3, 0, 2]) + assert_array_equal(enc.categories_, + np.array([1, 5, 10, 'cat', 'mouse', None], dtype='O')) enc = OneHotEncoder(categorical_features=[False, False, True]) enc.fit(X) assert_array_equal(enc.feature_index_range_, [[2, 3], [3, 4], [0, 2]]) assert_array_equal(enc.one_hot_feature_index_, [2, 2, 0, 1]) - assert_array_equal(enc.n_values_, [2, 0, 0]) + assert_array_equal(enc.n_values_, [0, 0, 2]) + assert_array_equal(enc.categories_, + np.array(['cat', 'mouse', None, 
None], dtype='O')) + + +def test_one_hot_encoder_deprecations(): + # Check that deprecated features raise warnings + X = [[3, 2, 1], [0, 1, 1]] + + # `handle_unknown`="error" will change in v0.21 + ohe = OneHotEncoder(handle_unknown='error') + assert_warns(FutureWarning, ohe.fit, X) + + # `n_values` is deprecated + ohe = OneHotEncoder(n_values='auto', handle_unknown='ignore') + assert_warns(FutureWarning, ohe.fit, X) + + # Integer input for `values` is deprecated + ohe = OneHotEncoder(values=5, handle_unknown='ignore') + assert_warns(FutureWarning, ohe.fit, X) + + # List of integer input for `values` is deprecated + ohe = OneHotEncoder(values=[5, 5, 5], handle_unknown='ignore') + assert_warns(FutureWarning, ohe.fit, X) + + # `active_features_` is deprecated (and is only available + # when `handle_unknown`="error") + ohe = OneHotEncoder(handle_unknown='error').fit(X) + assert_warns(FutureWarning, getattr, ohe, 'active_features_') + + # `feature_indices_` is deprecated + ohe = OneHotEncoder(handle_unknown='ignore').fit(X) + assert_warns(FutureWarning, getattr, ohe, 'feature_indices_') def test_one_hot_encoder_dense(): From 840382e49d280fe1db46bb48b6a66bc305159eb8 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 3 May 2017 22:21:15 -0500 Subject: [PATCH 35/36] Fix whitespace in doc test --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index d02b9015408f7..bbe8f54c260b1 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1756,7 +1756,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): dtype=<... 
'numpy.float64'>, handle_unknown='error', n_values=None, sparse=True, values='auto') >>> enc.n_values_ - array([ 3, 3]) + array([3, 3]) >>> enc.feature_index_range_ array([[0, 3], [3, 6]]) From ff4b30bfc27b931c219ed13b466afab66b4f8aff Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 3 May 2017 22:56:50 -0500 Subject: [PATCH 36/36] Fix doctest for Python 2.7 --- sklearn/preprocessing/data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bbe8f54c260b1..ee21c6726e620 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1762,8 +1762,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): [3, 6]]) >>> enc.one_hot_feature_index_ array([0, 0, 0, 1, 1, 1]) - >>> enc.categories_ - array(['cat', 'dog', 'mouse', 4, 15, 17], dtype=object) + >>> (enc.categories_ == + ... np.array(['cat', 'dog', 'mouse', 4, 15, 17], dtype='O')).all() + True >>> enc.transform([['dog', 4]]).toarray() array([[ 0., 1., 0., 1., 0., 0.]])