diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 22e4a930bb25b..18e0d8a58a281 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -397,31 +397,37 @@ only one active. Continuing the example above::

     >>> enc = preprocessing.OneHotEncoder()
-    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  # doctest: +ELLIPSIS
+    >>> enc.fit([['female', 'from US', 'uses Chrome'],
+    ...          ['male', 'from Asia', 'uses Firefox']])  # doctest: +ELLIPSIS
     OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values='auto', sparse=True)
-    >>> enc.transform([[0, 1, 3]]).toarray()
-    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
+           handle_unknown='error', n_values=None, sparse=True, values='auto')
+    >>> enc.transform([['female', 'from Asia', 'uses Firefox']]).toarray()
+    array([[ 1.,  0.,  1.,  0.,  0.,  1.]])

 By default, how many values each feature can take is inferred automatically
 from the dataset.
-It is possible to specify this explicitly using the parameter ``n_values``.
+It is possible to specify this explicitly using the parameter ``values``.
 There are two genders, three possible continents and four web browsers in our
 dataset.
 Then we fit the estimator, and transform a data point.
-In the result, the first two numbers encode the gender, the next set of three
-numbers the continent and the last four the web browser.
+In the result, the first two values encode the gender, the next two the
+continent and the last two the web browser: by default, only the values
+actually seen during ``fit`` are encoded.

-Note that, if there is a possibilty that the training data might have missing
-categorical features, one has to explicitly set ``n_values``. For example,
+Note that, if the training data might not contain all possible values of a
+categorical feature, one has to explicitly set ``values``. For example,

-    >>> enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])
+    >>> browsers = ['uses Internet Explorer', 'uses Chrome', 'uses Safari', 'uses Firefox']
+    >>> genders = ['male', 'female']
+    >>> locations = ['from Europe', 'from Asia', 'from US']
+    >>> enc = preprocessing.OneHotEncoder(values=[genders, locations, browsers])
     >>> # Note that there are missing categorical values for the 2nd and 3rd
     >>> # features
-    >>> enc.fit([[1, 2, 3], [0, 2, 0]])  # doctest: +ELLIPSIS
+    >>> enc.fit([['female', 'from US', 'uses Chrome'],
+    ...          ['male', 'from Asia', 'uses Internet Explorer']])  # doctest: +ELLIPSIS
     OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values=[2, 3, 4], sparse=True)
-    >>> enc.transform([[1, 0, 0]]).toarray()
-    array([[ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.]])
+           handle_unknown='error', n_values=None, sparse=True,
+           values=[...])
+    >>> enc.transform([['male', 'from Europe', 'uses Safari']]).toarray()
+    array([[ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])

 See :ref:`dict_feature_extraction` for categorical features that are
 represented as a dict, not as integers.
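To see the documented behaviour end to end, here is a minimal sketch of the
example above, assuming the ``values`` parameter and string support that this
patch introduces (neither is available in released scikit-learn at this
point)::

    from sklearn.preprocessing import OneHotEncoder

    genders = ['male', 'female']
    locations = ['from Europe', 'from Asia', 'from US']
    browsers = ['uses Internet Explorer', 'uses Chrome', 'uses Safari',
                'uses Firefox']

    # Declaring the full set of values per feature gives categories that are
    # missing from the training data their own output column.
    enc = OneHotEncoder(values=[genders, locations, browsers])
    enc.fit([['female', 'from US', 'uses Chrome'],
             ['male', 'from Asia', 'uses Internet Explorer']])

    # 2 + 3 + 4 = 9 columns, one per declared value (sorted within a feature).
    print(enc.transform([['male', 'from Europe', 'uses Safari']]).toarray())
    # [[ 0.  1.  0.  1.  0.  0.  0.  0.  1.]]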
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 1c3d8db580272..208419f5d354c 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -27,6 +27,8 @@
     mean_variance_axis, incr_mean_variance_axis, min_max_axis)
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
+from .label import LabelEncoder
+from ..utils.fixes import in1d, setdiff1d

 zip = six.moves.zip

@@ -1673,28 +1675,29 @@ def add_dummy_feature(X, value=1.0):
         return np.hstack((np.ones((n_samples, 1)) * value, X))


-def _transform_selected(X, transform, selected="all", copy=True):
-    """Apply a transform function to portion of selected features
-
+def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
+                    return_val=True):
+    """Apply a function to the selected features of X.
+
     Parameters
     ----------
-    X : {array-like, sparse matrix}, shape [n_samples, n_features]
+    X : {array, sparse matrix}, shape [n_samples, n_features]
         Dense array or sparse matrix.
-
     transform : callable
         A callable transform(X) -> X_transformed
-
     copy : boolean, optional
         Copy X even if it could be avoided.
-
     selected: "all" or array of indices or mask
         Specify which features to apply the transform to.
+    return_val : boolean, optional
+        Whether to return the transformed matrix. If False, ``None`` is
+        returned.
+
     Returns
     -------
-    X : array or sparse matrix, shape=(n_samples, n_features_new)
+    X : array or sparse matrix, shape=(n_samples, n_features_new)
     """
-    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)
+
+    if copy:
+        X = X.copy()

     if isinstance(selected, six.string_types) and selected == "all":
         return transform(X)

@@ -1717,22 +1720,22 @@ def _transform_selected(X, transform, selected="all", copy=True):
         return transform(X)
     else:
         X_sel = transform(X[:, ind[sel]])
-        X_not_sel = X[:, ind[not_sel]]
+        X_not_sel = X[:, ind[not_sel]].astype(dtype)

-        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
-            return sparse.hstack((X_sel, X_not_sel))
-        else:
-            return np.hstack((X_sel, X_not_sel))
+        if return_val:
+            if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
+                return sparse.hstack((X_sel, X_not_sel))
+            else:
+                return np.hstack((X_sel, X_not_sel))


 class OneHotEncoder(BaseEstimator, TransformerMixin):
-    """Encode categorical integer features using a one-hot aka one-of-K scheme.
+    """Encode categorical features using a one-hot aka one-of-K scheme.

-    The input to this transformer should be a matrix of integers, denoting
-    the values taken on by categorical (discrete) features. The output will be
-    a sparse matrix where each column corresponds to one possible value of one
-    feature. It is assumed that input features take on values in the range
-    [0, n_values).
+    The input to this transformer should be a matrix of integers or strings,
+    denoting the values taken on by categorical (discrete) features. The
+    output will be a sparse matrix where each column corresponds to one
+    possible value of one feature.

     This encoding is needed for feeding categorical data to many scikit-learn
     estimators, notably linear models and SVMs with the standard kernels.

@@ -1741,15 +1744,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):

     Parameters
     ----------
-    n_values : 'auto', int or array of ints
-        Number of values per feature.
-
-        - 'auto' : determine value range from training data.
-        - int : number of categorical values per feature.
-                Each feature value should be in ``range(n_values)``
-        - array : ``n_values[i]`` is the number of categorical values in
-                  ``X[:, i]``. Each feature value should be
-                  in ``range(n_values[i])``
+    values : 'auto', int, list of ints, or list of lists of objects
+        - 'auto' : determine the set of values from the training data. See
+          the documentation of ``handle_unknown`` for which values are
+          considered acceptable.
+        - int : values are in ``range(values)`` for all features
+        - list of ints : values for feature ``i`` are in ``range(values[i])``
+        - list of lists : values for feature ``i`` are in ``values[i]``

     categorical_features: "all" or array of indices or mask
         Specify what features are treated as categorical.
@@ -1767,23 +1768,23 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         Will return sparse matrix if set True else will return an array.

-    handle_unknown : str, 'error' or 'ignore'
-        Whether to raise an error or ignore if a unknown categorical feature is
-        present during transform.
+    handle_unknown : str, 'error', 'error-strict' or 'ignore'

-    Attributes
-    ----------
-    active_features_ : array
-        Indices for active features, meaning values that actually occur
-        in the training set. Only available when n_values is ``'auto'``.
+        - 'ignore': Ignore all unknown feature values.
+        - 'error': Raise an error when the value of a feature is both unseen
+          during ``fit`` and out of the range of values seen during ``fit``.
+        - 'error-strict': Raise an error when the value of a feature is unseen
+          during ``fit``.

-    feature_indices_ : array of shape (n_features,)
-        Indices to feature ranges.
-        Feature ``i`` in the original data is mapped to features
-        from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
-        (and then potentially masked by `active_features_` afterwards)
+    copy : bool, default=True
+        If False, ``X`` may be modified in place.

-    n_values_ : array of shape (n_features,)
-        Maximum number of values per feature.
+    Attributes
+    ----------
+    label_encoders_ : list of size n_features
+        The :class:`sklearn.preprocessing.LabelEncoder` objects used to encode
+        the features. ``self.label_encoders_[i]`` is the LabelEncoder object
+        used to encode the ith column. The unique values found in column
+        ``i`` can be accessed using ``self.label_encoders_[i].classes_``.

     Examples
     --------
@@ -1793,16 +1794,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     >>> from sklearn.preprocessing import OneHotEncoder
     >>> enc = OneHotEncoder()
-    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
-[1, 0, 2]])  # doctest: +ELLIPSIS
+    >>> enc.fit([['cat', 4], ['mouse', 15], ['dog', 17]])  # doctest: +ELLIPSIS
     OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values='auto', sparse=True)
-    >>> enc.n_values_
-    array([2, 3, 4])
-    >>> enc.feature_indices_
-    array([0, 2, 5, 9])
-    >>> enc.transform([[0, 1, 1]]).toarray()
-    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])
+           handle_unknown='error', n_values=None, sparse=True, values='auto')
+    >>> list(enc.label_encoders_[0].classes_)
+    ['cat', 'dog', 'mouse']
+    >>> enc.transform([['dog', 4]]).toarray()
+    array([[ 0.,  1.,  0.,  1.,  0.,  0.]])

     See also
     --------
@@ -1811,138 +1809,208 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
       encoding of dictionary items or strings.
""" - def __init__(self, n_values="auto", categorical_features="all", - dtype=np.float64, sparse=True, handle_unknown='error'): - self.n_values = n_values + + def __init__(self, values='auto', categorical_features="all", + n_values=None, dtype=np.float64, sparse=True, + handle_unknown='error', copy=True): + self.values = values self.categorical_features = categorical_features self.dtype = dtype self.sparse = sparse self.handle_unknown = handle_unknown + self.n_values = n_values + self.copy = copy def fit(self, X, y=None): - """Fit OneHotEncoder to X. + """Fit the CategoricalEncoder to X. Parameters ---------- X : array-like, shape [n_samples, n_feature] - Input array of type int. + Array of ints or strings or both. Returns ------- self """ - self.fit_transform(X) - return self - def _fit_transform(self, X): - """Assumes X contains only categorical features.""" - X = check_array(X, dtype=np.int) - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") + X = check_array(X, dtype=np.object, accept_sparse='csc', + copy=self.copy) n_samples, n_features = X.shape - if (isinstance(self.n_values, six.string_types) and - self.n_values == 'auto'): - n_values = np.max(X, axis=0) + 1 - elif isinstance(self.n_values, numbers.Integral): - if (np.max(X, axis=0) >= self.n_values).any(): - raise ValueError("Feature out of bounds for n_values=%d" - % self.n_values) - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self.n_values) - else: - try: - n_values = np.asarray(self.n_values, dtype=int) - except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`. Expected" - " 'auto', int or array of ints, got %r" - % type(X)) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - - self.n_values_ = n_values - n_values = np.hstack([[0], n_values]) - indices = np.cumsum(n_values) - self.feature_indices_ = indices - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() + _apply_selected(X, self._fit, dtype=self.dtype, + selected=self.categorical_features, copy=True, + return_val=False) + return self - if (isinstance(self.n_values, six.string_types) and - self.n_values == 'auto'): - mask = np.array(out.sum(axis=0)).ravel() != 0 - active_features = np.where(mask)[0] - out = out[:, active_features] - self.active_features_ = active_features + def _fit(self, X): + "Assumes `X` contains only catergorical features." 
-        return out if self.sparse else out.toarray()
+
+        X = check_array(X, dtype=np.object)
+        n_samples, n_features = X.shape
+
+        self._n_features = n_features
+        self.label_encoders_ = [LabelEncoder() for i in range(n_features)]
+        # Maximum value seen for each feature
+        self._max_values = [None for i in range(n_features)]
+
+        if self.n_values is not None:
+            warnings.warn('The parameter `n_values` is deprecated, use the '
+                          'parameter `values` instead and specify the '
+                          'expected values for each feature')
+
+            if isinstance(self.n_values, numbers.Integral):
+                if (np.max(X, axis=0) >= self.n_values).any():
+                    raise ValueError("Feature out of bounds for n_values=%d"
+                                     % self.n_values)
+                self.values = self.n_values
+            else:
+                try:
+                    n_values = np.asarray(self.n_values, dtype=int)
+                except (ValueError, TypeError):
+                    raise TypeError("Wrong type for parameter `n_values`."
+                                    " Expected 'auto', int or array of ints,"
+                                    " got %r" % type(X))
+                if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
+                    raise ValueError("Shape mismatch: if n_values is an"
+                                     " array, it has to be of shape"
+                                     " (n_features,).")
+                self.values = list(self.n_values)
+
+        error_msg = ("`values` should be 'auto', an integer, a list of"
+                     " integers or a list of lists")
+
+        for i in range(n_features):
+            le = self.label_encoders_[i]
+
+            self._max_values[i] = np.max(X[:, i])
+            if self.values == 'auto':
+                le.fit(X[:, i])
+            elif isinstance(self.values, numbers.Integral):
+                if (np.max(X, axis=0) >= self.values).any():
+                    raise ValueError("Feature out of bounds for values=%d"
+                                     % self.values)
+                le.fit(np.arange(self.values, dtype=np.int))
+            elif isinstance(self.values, list):
+                if len(self.values) != X.shape[1]:
+                    raise ValueError("Shape mismatch: if values is a list,"
+                                     " it has to be of length n_features.")
+                if isinstance(self.values[i], list):
+                    le.fit(self.values[i])
+                elif isinstance(self.values[i], numbers.Integral):
+                    le.fit(np.arange(self.values[i], dtype=np.int))
+                else:
+                    raise ValueError(error_msg)
+            else:
+                raise ValueError(error_msg)
+
+    def transform(self, X, y=None):
+        """Encode the selected categorical features using the one-hot scheme.

-    def fit_transform(self, X, y=None):
-        """Fit OneHotEncoder to X, then transform X.
+        Parameters
+        ----------
+        X : array-like, shape [n_samples, n_features]
+            Array of ints or strings or both.

-        Equivalent to self.fit(X).transform(X), but more convenient and more
-        efficient. See fit for the parameters, transform for the return value.
+        Returns
+        -------
+        out : array, shape [n_samples, n_features_new]
+            `X` encoded using the one-hot scheme.
         """
-        return _transform_selected(X, self._fit_transform,
-                                   self.categorical_features, copy=True)
+        X = check_array(X, dtype=np.object)
+
+        return _apply_selected(X, self._transform, copy=True,
+                               selected=self.categorical_features)

     def _transform(self, X):
-        """Assumes X contains only categorical features."""
-        X = check_array(X, dtype=np.int)
-        if np.any(X < 0):
-            raise ValueError("X needs to contain only non-negative integers.")
+        "Assumes `X` contains only categorical features."
+
+        X = check_array(X, accept_sparse='csc', dtype=np.object)
         n_samples, n_features = X.shape
+        X_int = np.zeros_like(X, dtype=np.int)
+        X_mask = np.ones_like(X, dtype=np.bool)
+
+        for i in range(n_features):
+
+            valid_mask = in1d(X[:, i], self.label_encoders_[i].classes_)
+
+            if not np.all(valid_mask):
+                if self.handle_unknown in ['error', 'error-strict']:
+                    diff = setdiff1d(X[:, i],
+                                     self.label_encoders_[i].classes_)
+                    if self.handle_unknown == 'error-strict':
+                        msg = 'Unknown feature(s) %s in column %d' % (diff, i)
+                        raise ValueError(msg)
+                    else:
+                        if np.all(diff <= self._max_values[i]):
+                            msg = ('Values %s for feature %d are unknown but '
+                                   'in range. This will raise an error in '
+                                   'future versions.' % (str(diff), i))
+                            warnings.warn(FutureWarning(msg))
+                            X_mask[:, i] = valid_mask
+                            le = self.label_encoders_[i]
+                            X[:, i][~valid_mask] = le.classes_[0]
+                        else:
+                            msg = ('Unknown feature(s) %s in column %d' %
+                                   (diff, i))
+                            raise ValueError(msg)
+                elif self.handle_unknown == 'ignore':
+                    # Set the problematic rows to an acceptable value and
+                    # continue. The rows are marked in `X_mask` and will be
+                    # removed later.
+                    X_mask[:, i] = valid_mask
+                    X[:, i][~valid_mask] = self.label_encoders_[i].classes_[0]
+                else:
+                    template = ("handle_unknown should be either 'error', "
+                                "'error-strict' or 'ignore', got %s")
+                    raise ValueError(template % self.handle_unknown)
+
+            X_int[:, i] = self.label_encoders_[i].transform(X[:, i])

-        indices = self.feature_indices_
-        if n_features != indices.shape[0] - 1:
-            raise ValueError("X has different shape than during fitting."
-                             " Expected %d, got %d."
-                             % (indices.shape[0] - 1, n_features))
-
-        # We use only those categorical features of X that are known using fit.
-        # i.e lesser than n_values_ using mask.
-        # This means, if self.handle_unknown is "ignore", the row_indices and
-        # col_indices corresponding to the unknown categorical feature are
-        # ignored.
-        mask = (X < self.n_values_).ravel()
-        if np.any(~mask):
-            if self.handle_unknown not in ['error', 'ignore']:
-                raise ValueError("handle_unknown should be either error or "
-                                 "unknown got %s" % self.handle_unknown)
-            if self.handle_unknown == 'error':
-                raise ValueError("unknown categorical feature present %s "
-                                 "during transform." % X.ravel()[~mask])
-
-        column_indices = (X + indices[:-1]).ravel()[mask]
+        mask = X_mask.ravel()
+        n_values = [le.classes_.shape[0] for le in self.label_encoders_]
+        n_values = np.hstack([[0], n_values])
+        indices = np.cumsum(n_values)
+
+        column_indices = (X_int + indices[:-1]).ravel()[mask]
         row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                 n_features)[mask]
-        data = np.ones(np.sum(mask))
+        data = np.ones(n_samples * n_features)[mask]
+
         out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                 shape=(n_samples, indices[-1]),
                                 dtype=self.dtype).tocsr()
+
         if (isinstance(self.n_values, six.string_types) and
                 self.n_values == 'auto'):
             out = out[:, self.active_features_]

         return out if self.sparse else out.toarray()

-    def transform(self, X):
-        """Transform X using one-hot encoding.
+    @property
+    def active_features_(self):
+        warnings.warn('The property `active_features_` is deprecated and'
+                      ' will be removed in version 0.20')
+        if self.n_values is None:
+            # TODO: what to do when classes are strings?
+            classes = [le.classes_ for le in self.label_encoders_]
+            classes_max = [np.max(cls) + 1 for cls in classes]
+            cum_idx = np.cumsum([0] + classes_max)
+            active_idx = [self.label_encoders_[i].classes_.astype(np.int) +
+                          cum_idx[i]
+                          for i in range(self._n_features)]
+
+            return np.concatenate(active_idx, axis=0).astype(np.int)
+        else:
+            raise AttributeError()

-        Parameters
-        ----------
-        X : array-like, shape [n_samples, n_features]
-            Input array of type int.
+    @property
+    def feature_indices_(self):
+        warnings.warn('The property `feature_indices_` is deprecated and'
+                      ' will be removed in version 0.20')
+        classes_max = [np.max(le.classes_) + 1 for le in self.label_encoders_]
+        return np.cumsum([0] + classes_max)

-        Returns
-        -------
-        X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
-            Transformed input.
-        """
-        return _transform_selected(X, self._transform,
-                                   self.categorical_features, copy=True)
+    @property
+    def n_values_(self):
+        warnings.warn('The property `n_values_` is deprecated and'
+                      ' will be removed in version 0.20')
+        return np.array([le.classes_.shape[0] for le in self.label_encoders_])
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 39360ebdd779f..5944e4ab1f748 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -6,6 +6,7 @@
 # License: BSD 3 clause

 import warnings
+import re
 import numpy as np
 import numpy.linalg as la
 from scipy import sparse
@@ -32,7 +33,7 @@
 from sklearn.utils.testing import skip_if_32bit

 from sklearn.utils.sparsefuncs import mean_variance_axis
-from sklearn.preprocessing.data import _transform_selected
+from sklearn.preprocessing.data import _apply_selected
 from sklearn.preprocessing.data import _handle_zeros_in_scale
 from sklearn.preprocessing.data import Binarizer
 from sklearn.preprocessing.data import KernelCenterer
@@ -1501,9 +1502,10 @@ def test_one_hot_encoder_sparse():
     # test that an error is raised when out of bounds:
     X_too_large = [[0, 2, 1], [0, 1, 1]]
     assert_raises(ValueError, enc.transform, X_too_large)
-    error_msg = "unknown categorical feature present \[2\] during transform."
+    error_msg = re.escape("Unknown feature(s) [2] in column 1")
     assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large)
     assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)
+    assert_raises(ValueError, OneHotEncoder(values=2).fit_transform, X)

     # test that error is raised when wrong number of features
     assert_raises(ValueError, enc.transform, X[:, :-1])
@@ -1513,14 +1515,6 @@ def test_one_hot_encoder_sparse():
     # test exception on wrong init param
     assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)

-    enc = OneHotEncoder()
-    # test negative input to fit
-    assert_raises(ValueError, enc.fit, [[0], [-1]])
-
-    # test negative input to transform
-    enc.fit([[0], [1]])
-    assert_raises(ValueError, enc.transform, [[0], [-1]])
-

 def test_one_hot_encoder_dense():
     # check for sparse=False
@@ -1539,26 +1533,26 @@ def test_one_hot_encoder_dense():
                           [1., 0., 1., 0., 1.]]))


-def _check_transform_selected(X, X_expected, sel):
+def _check_apply_selected(X, X_expected, sel):
     for M in (X, sparse.csr_matrix(X)):
-        Xtr = _transform_selected(M, Binarizer().transform, sel)
+        Xtr = _apply_selected(M, Binarizer().transform, sel)
         assert_array_equal(toarray(Xtr), X_expected)


 def test_transform_selected():
-    X = [[3, 2, 1], [0, 1, 1]]
+    X = np.array([[3, 2, 1], [0, 1, 1]])

     X_expected = [[1, 2, 1], [0, 1, 1]]
-    _check_transform_selected(X, X_expected, [0])
-    _check_transform_selected(X, X_expected, [True, False, False])
+    _check_apply_selected(X, X_expected, [0])
+    _check_apply_selected(X, X_expected, [True, False, False])

     X_expected = [[1, 1, 1], [0, 1, 1]]
-    _check_transform_selected(X, X_expected, [0, 1, 2])
-    _check_transform_selected(X, X_expected, [True, True, True])
-    _check_transform_selected(X, X_expected, "all")
+    _check_apply_selected(X, X_expected, [0, 1, 2])
+    _check_apply_selected(X, X_expected, [True, True, True])
+    _check_apply_selected(X, X_expected, "all")

-    _check_transform_selected(X, X, [])
-    _check_transform_selected(X, X, [False, False, False])
+    _check_apply_selected(X, X, [])
+    _check_apply_selected(X, X, [False, False, False])


 def test_transform_selected_copy_arg():
@@ -1571,8 +1565,8 @@ def _mutating_transformer(X):
     expected_Xtr = [[2, 2], [3, 4]]

     X = original_X.copy()
-    Xtr = _transform_selected(X, _mutating_transformer, copy=True,
-                              selected='all')
+    Xtr = _apply_selected(X, _mutating_transformer, copy=True,
+                          selected='all')

     assert_array_equal(toarray(X), toarray(original_X))
     assert_array_equal(toarray(Xtr), expected_Xtr)
@@ -1601,6 +1595,14 @@ def _check_one_hot(X, X2, cat, n_features):
     assert_array_equal(toarray(B), toarray(D))


+def test_one_hot_encoder_string():
+    X = [['cat', 'domestic'], ['wolf', 'wild']]
+    enc = OneHotEncoder()
+    enc.fit(X)
+    Xtr = enc.transform([['cat', 'wild']])
+    assert_array_equal(toarray(Xtr), [[1, 0, 0, 1]])
+
+
 def test_one_hot_encoder_categorical_features():
     X = np.array([[3, 2, 1], [0, 1, 1]])
     X2 = np.array([[1, 1, 1]])
@@ -1623,10 +1625,19 @@ def test_one_hot_encoder_unknown_transform():
     # Test that one hot encoder raises error for unknown features
     # present during transform.
-    oh = OneHotEncoder(handle_unknown='error')
+    oh = OneHotEncoder(handle_unknown='error-strict')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)

+    # Test that one hot encoder raises a warning for unknown but in-range
+    # feature values
+    oh = OneHotEncoder(handle_unknown='error')
+    oh.fit(X)
+    msg = ('Values [0] for feature 2 are unknown but in range. '
+           'This will raise an error in future versions.')
+    assert_warns_message(FutureWarning, msg, oh.transform,
+                         np.array([[0, 0, 0]]))
+
     # Test the ignore option, ignores unknown features.
     oh = OneHotEncoder(handle_unknown='ignore')
     oh.fit(X)
@@ -1634,7 +1645,23 @@ def test_one_hot_encoder_unknown_transform():
         oh.transform(y).toarray(),
         np.array([[0., 0., 0., 0., 1., 0., 0.]]))

-    # Raise error if handle_unknown is neither ignore or error.
+    X = np.array([['cat', 2, 1], ['dog', 0, 3], ['mouse', 0, 2]])
+    y = np.array([['ET', 1, 1]])
+
+    # Test that one hot encoder raises error for unknown features
+    # present during transform.
+    oh = OneHotEncoder(handle_unknown='error-strict')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+    # Test the ignore option, ignores unknown features.
+    oh = OneHotEncoder(handle_unknown='ignore')
+    oh.fit(X)
+    assert_array_equal(
+        oh.transform(y).toarray(),
+        np.array([[0., 0., 0., 0., 0., 1., 0., 0.]]))
+
+    # Raise error if handle_unknown is neither ignore nor error.
     oh = OneHotEncoder(handle_unknown='42')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index a10afa6d4d1e3..b97140c10cf86 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -281,13 +281,32 @@ def frombuffer_empty(buf, dtype):
     frombuffer_empty = np.frombuffer


+def _in1d_object(ar1, ar2, invert=False):
+    # np.argsort(kind='mergesort') is only supported for object arrays after
+    # version 1.8. Hence in1d for object arrays needs to be handled
+    # differently, with plain Python sets.
+    values1 = set(ar1)
+    values2 = set(ar2)
+    absent_values = values1 - values2
+
+    present = np.ones_like(ar1, dtype=np.bool)
+
+    for value in absent_values:
+        present[ar1 == value] = False
+
+    return ~present if invert else present
+
+
 if np_version < (1, 8):
     def in1d(ar1, ar2, assume_unique=False, invert=False):
         # Backport of numpy function in1d 1.8.1 to support numpy 1.6.2
         # Ravel both arrays, behavior for the first array could be different
+        ar1 = np.asarray(ar1).ravel()
         ar2 = np.asarray(ar2).ravel()

+        if ar1.dtype == object or ar2.dtype == object:
+            return _in1d_object(ar1, ar2, invert)
+
         # This code is significantly faster when the condition is satisfied.
         if len(ar2) < 10 * len(ar1) ** 0.145:
             if invert:
@@ -443,3 +462,28 @@ def rankdata(a, method='average'):
             return .5 * (count[dense] + count[dense - 1] + 1)
 else:
     from scipy.stats import rankdata


+if np_version < (1, 8):
+    # Backport of setdiff1d function as it relies on in1d
+    def setdiff1d(ar1, ar2, assume_unique=False):
+        # Copy-paste from numpy, except for the object-dtype clauses below.
+        # Convert up front so that list inputs also work.
+        ar1 = np.asarray(ar1).ravel()
+        ar2 = np.asarray(ar2).ravel()
+        if not assume_unique:
+            # Unique is not supported for object arrays till np version 1.8
+            # due to mergesort
+            if ar1.dtype == object:
+                ar1 = np.array(sorted(set(ar1)))
+            else:
+                ar1 = np.unique(ar1)
+
+            if ar2.dtype == object:
+                ar2 = np.array(sorted(set(ar2)))
+            else:
+                ar2 = np.unique(ar2)
+
+        return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
+
+else:
+    from numpy import setdiff1d
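For context on why the object-dtype branches above exist, here is a quick
sanity check of the backported helpers. It assumes a tree with this patch
applied (``setdiff1d`` is only exported from ``sklearn.utils.fixes`` by this
change; on NumPy >= 1.8 both names simply resolve to the stock NumPy
functions)::

    import numpy as np
    from sklearn.utils.fixes import in1d, setdiff1d

    # Object arrays cannot be mergesorted on NumPy < 1.8, so the backports
    # fall back to plain Python sets; the results match the stock functions.
    ar1 = np.array(['cat', 'dog', 'ET'], dtype=object)
    ar2 = np.array(['cat', 'dog', 'mouse'], dtype=object)

    print(in1d(ar1, ar2))       # [ True  True False]
    print(setdiff1d(ar1, ar2))  # ['ET']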