diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index e09ca0422d8a7..19d70b7e66646 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1254,6 +1254,7 @@ Model validation
    preprocessing.Normalizer
    preprocessing.OneHotEncoder
    preprocessing.OrdinalEncoder
+   preprocessing.UnaryEncoder
    preprocessing.PolynomialFeatures
    preprocessing.PowerTransformer
    preprocessing.QuantileTransformer
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 54210f2453cb0..d228ddb1cf58d 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -451,6 +451,10 @@ The normalizer instance can then be used on sample vectors as any transformer::
 Encoding categorical features
 =============================
+
+Ordinal encoding
+----------------
+
 Often features are not given as continuous values but categorical.
 For example a person could have features ``["male", "female"]``,
 ``["from Europe", "from US", "from Asia"]``,
@@ -471,11 +475,22 @@ new feature of integers (0 to n_categories - 1)::
     >>> enc.transform([['female', 'from US', 'uses Safari']])
     array([[0., 1., 1.]])
 
+You can specify the order of the categories by passing the ``categories``
+parameter::
+
+    >>> enc = preprocessing.OrdinalEncoder(categories=[['big', 'small'],
+    ...                                                ['short', 'tall']])
+    >>> X = [['big', 'tall']]
+    >>> enc.fit_transform(X)  # doctest: +ELLIPSIS
+    array([[0., 1.]])
+
 Such integer representation can, however, not be used directly with all
 scikit-learn estimators, as these expect continuous input, and would interpret
 the categories as being ordered, which is often not desired (i.e. the set of
 browsers was ordered arbitrarily).
 
+One-hot encoding
+----------------
+
 Another possibility to convert categorical features to features that can be used
 with scikit-learn estimators is to use a one-of-K, also known as one-hot or
 dummy encoding.
@@ -539,9 +554,73 @@ columns for this feature will be all zeros
     >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
     array([[1., 0., 0., 0., 0., 0.]])
 
+See :ref:`dict_feature_extraction` for categorical features that are
+represented as a dict, not as scalars.
+
+.. _unary_encoding:
+
+Unary encoding
+--------------
+
+For some ordinal features, it does not necessarily make sense to use
+:class:`OrdinalEncoder` if the differences between the ordered categories
+are uneven, for example for a feature taking the values "very short",
+"short" and "tall".
+
+For such features, it is possible to use a unary encoding, which is
+implemented in :class:`UnaryEncoder`. This encoder transforms each ordinal
+feature with ``m`` possible values into ``m - 1`` binary features, where the
+``i``-th feature is active if ``x > i``. For example::
+
+    >>> enc = preprocessing.UnaryEncoder()
+    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  # doctest: +ELLIPSIS
+    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn',
+           max_value='auto', sparse=False)
+    >>> enc.transform([[0, 1, 3]])
+    array([[0., 1., 0., 1., 1., 1.]])
+
+Here the first feature with 2 categories is transformed into 1 column, the
+second feature with 3 categories is transformed into 2 columns, and the
+third feature with 4 categories is transformed into 3 columns.
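+
+The transformation can be reversed with ``inverse_transform``, which
+recovers each original feature value by counting the non-zero columns
+(a small illustration, continuing the example above)::
+
+    >>> enc.inverse_transform(enc.transform([[0, 1, 3]]))
+    array([[0., 1., 3.]])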
+
+By default, the number of categories in a feature is inferred automatically
+from the training data by looking for the maximum value. It is possible to
+specify this explicitly using the parameter ``max_value``. In particular, if
+the training data might have missing categorical values, one has to
+explicitly set ``max_value``. For example::
+
+    >>> enc = preprocessing.UnaryEncoder(max_value=[2, 3, 4])
+    >>> # Note that there are missing categorical values for the 2nd and 3rd
+    >>> # features
+    >>> enc.fit([[1, 2, 3], [0, 2, 0]])  # doctest: +ELLIPSIS
+    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn',
+           max_value=[2, 3, 4], sparse=False)
+    >>> enc.transform([[1, 1, 2]])
+    array([[1., 1., 0., 1., 1., 0.]])
+
+.. note::
+
+    This encoding is likely to help when used with linear models and
+    kernel-based models like SVMs with the standard kernels. On the other
+    hand, this transformation is unlikely to help when using tree-based
+    models, since those already work on the basis of a particular feature
+    value being smaller or larger than a threshold.
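+
+Values greater than or equal to ``max_value`` may still show up at transform
+time. The ``handle_greater`` parameter controls whether such values raise an
+error (``'error'``), are silently clipped to ``max_value - 1`` (``'clip'``),
+or are clipped with a warning (``'warn'``, the default). A minimal sketch of
+the ``'clip'`` option::
+
+    >>> enc = preprocessing.UnaryEncoder(max_value=3, handle_greater='clip')
+    >>> enc.fit([[0], [1], [2]])  # doctest: +ELLIPSIS
+    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='clip',
+           max_value=3, sparse=False)
+    >>> enc.transform([[5]])  # 5 is clipped to 2
+    array([[1., 1.]])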
+
+In case the input features are not already represented as integers from 0 to
+``max_value - 1``, it is possible to combine :class:`UnaryEncoder` and
+:class:`OrdinalEncoder` into a :class:`Pipeline <sklearn.pipeline.Pipeline>`
+like so::
+
+    >>> from sklearn.pipeline import make_pipeline
+    >>> from sklearn.preprocessing import OrdinalEncoder, UnaryEncoder
+    >>> categories = [['small', 'medium', 'huge']]
+    >>> pipeline = make_pipeline(OrdinalEncoder(categories), UnaryEncoder())
+    >>> X = [['small'], ['medium'], ['huge']]
+    >>> pipeline.fit_transform(X)
+    array([[0., 0.],
+           [1., 0.],
+           [1., 1.]])
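+
+:class:`UnaryEncoder` encodes all of the columns it is given. To encode only
+a subset of the features, it can be combined with :class:`ColumnTransformer
+<sklearn.compose.ColumnTransformer>`. The following is a minimal sketch in
+which only the first column is unary encoded and the second one is passed
+through unchanged::
+
+    >>> import numpy as np
+    >>> from sklearn.compose import ColumnTransformer
+    >>> X = np.array([[0, 7], [1, 8], [2, 9]])
+    >>> ct = ColumnTransformer([('unary', preprocessing.UnaryEncoder(), [0])],
+    ...                        remainder='passthrough')
+    >>> ct.fit_transform(X)
+    array([[0., 0., 7.],
+           [1., 0., 8.],
+           [1., 1., 9.]])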
 
-See :ref:`dict_feature_extraction` for categorical features that are represented
-as a dict, not as scalars.
 
 .. _preprocessing_discretization:
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 3b4e6e439c934..95cb39dc01b1e 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -183,6 +183,11 @@ Support for Python 3.4 and below has been officially dropped.
   in the dense case. Also added a new parameter ``order`` which controls output
   order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_.
 
+- |Feature| Added a new encoder :class:`preprocessing.UnaryEncoder`, useful
+  for ordinal features with uneven differences between categories.
+  :issue:`12893` by :user:`Ruxandra Burtica `, :user:`Arjun
+  Jauhari ` and :user:`Nicolas Hug `.
+
 :mod:`sklearn.tree`
 ...................
 
 - |Feature| Decision Trees can now be plotted with matplotlib using
diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py
index d1d69bde6f4a8..44df9d4a970dc 100644
--- a/sklearn/preprocessing/__init__.py
+++ b/sklearn/preprocessing/__init__.py
@@ -27,6 +27,7 @@
 
 from ._encoders import OneHotEncoder
 from ._encoders import OrdinalEncoder
+from ._encoders import UnaryEncoder
 
 from .label import label_binarize
 from .label import LabelBinarizer
@@ -53,6 +54,7 @@
     'Normalizer',
     'OneHotEncoder',
     'OrdinalEncoder',
+    'UnaryEncoder',
     'PowerTransformer',
     'RobustScaler',
     'StandardScaler',
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 3dabd0b3e0cda..694fa7b0d6e8b 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -263,6 +263,8 @@ class OneHotEncoder(_BaseEncoder):
     --------
     sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer)
       encoding of the categorical features.
+    sklearn.preprocessing.UnaryEncoder : performs a unary encoding of ordinal
+      data.
     sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding
       of dictionary items (also handles string-valued features).
     sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
@@ -766,6 +768,8 @@ class OrdinalEncoder(_BaseEncoder):
     --------
     sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
       categorical features.
+    sklearn.preprocessing.UnaryEncoder : performs a unary encoding of ordinal
+      data.
     sklearn.preprocessing.LabelEncoder : encodes target labels with values
       between 0 and n_classes-1.
     """
@@ -846,3 +850,245 @@ def inverse_transform(self, X):
             X_tr[:, i] = self.categories_[i][labels]
 
         return X_tr
+
+
+class UnaryEncoder(BaseEstimator, TransformerMixin):
+    """Encode ordinal integer features using a unary scheme.
+
+    This encoder transforms each ordinal feature with ``m`` possible values
+    into ``m - 1`` binary features, where the ``i``-th feature is active if
+    ``x > i``. The input to this transformer should be a matrix of
+    non-negative integers, denoting the values taken on by the ordinal
+    features.
+
+    This encoding may be needed for feeding ordinal features to many
+    scikit-learn estimators, notably linear models and kernel-based models
+    like SVMs with the standard kernels.
+    This transformation is unlikely to help when used with tree-based models,
+    since those already work on the basis of a particular feature value being
+    smaller or greater than a threshold, unlike linear and kernel-based
+    models.
+
+    This encoder encodes all of the features. To only encode a subset of the
+    features, use :class:`ColumnTransformer
+    <sklearn.compose.ColumnTransformer>`.
+
+    Read more in the :ref:`User Guide <unary_encoding>`.
+
+    Parameters
+    ----------
+    max_value : 'auto', int or array of ints, optional (default='auto')
+        Number of categories per feature.
+
+        - 'auto' : determine value range from training data by looking for
+          the maximum.
+        - int : number of ordinal values per feature.
+          Each feature value should be in ``[0, max_value)``.
+        - array : ``max_value[i]`` is the number of ordinal values in
+          ``X[:, i]``. Each feature value should be in ``[0, max_value[i])``.
+
+    dtype : number type, optional (default=np.float64)
+        Desired dtype of output.
+
+    sparse : boolean, optional (default=False)
+        If set to True, a sparse matrix will be returned, otherwise an array.
+
+    handle_greater : str, 'warn', 'error' or 'clip', optional (default='warn')
+        Whether to raise an error, clip or warn if an ordinal feature
+        >= max_value is passed in.
+
+        - 'error': raise an error if a feature >= max_value is passed in.
+        - 'clip': all the feature values >= max_value are clipped to
+          (max_value - 1) during transform.
+        - 'warn': same as clip but with a warning.
+
+    Attributes
+    ----------
+    feature_indices_ : array of shape (n_features + 1,)
+        Feature ``i`` in the original data is mapped to columns
+        from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
+        of the transformed output.
+
+    max_value_ : array of shape (n_features,)
+        Number of ordinal values per feature.
+
+    Examples
+    --------
+    Given a dataset with three features and four samples, we let the encoder
+    find the maximum value per feature and transform the data to a unary
+    encoding.
+
+    >>> from sklearn.preprocessing import UnaryEncoder
+    >>> enc = UnaryEncoder()
+    >>> enc.fit([[0, 0, 3],
+    ...          [1, 1, 0],
+    ...          [0, 2, 1],
+    ...          [1, 0, 2]])  # doctest: +ELLIPSIS
+    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn',
+           max_value='auto', sparse=False)
+    >>> enc.max_value_
+    array([2, 3, 4])
+    >>> enc.feature_indices_
+    array([0, 1, 3, 6])
+    >>> enc.transform([[0, 1, 2]])
+    array([[0., 1., 0., 1., 1., 0.]])
+
+    See also
+    --------
+    sklearn.preprocessing.OneHotEncoder : encodes categorical integer features
+      using a one-hot aka one-of-K scheme.
+    sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer)
+      encoding of the categorical features.
+    """
+    def __init__(self, max_value="auto", dtype=np.float64, sparse=False,
+                 handle_greater='warn'):
+        self.max_value = max_value
+        self.dtype = dtype
+        self.sparse = sparse
+        self.handle_greater = handle_greater
+
+    def fit(self, X, y=None):
+        """Fit UnaryEncoder to X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input array of type int. All feature values should be
+            non-negative, otherwise a ValueError will be raised.
+
+        Returns
+        -------
+        self
+        """
+        X = check_array(X, dtype=np.int)
+        if self.handle_greater not in ['warn', 'error', 'clip']:
+            raise ValueError("handle_greater should be either 'warn', 'error'"
+                             " or 'clip', got %s." % self.handle_greater)
+        if np.any(X < 0):
+            raise ValueError("X needs to contain only non-negative integers.")
+        _, n_features = X.shape
+
+        if isinstance(self.max_value, str) and self.max_value == 'auto':
+            max_value = np.max(X, axis=0) + 1
+        elif isinstance(self.max_value, numbers.Integral):
+            max_value = np.empty(n_features, dtype=np.int)
+            max_value.fill(self.max_value)
+        else:
+            try:
+                max_value = np.asarray(self.max_value, dtype=int)
+            except (ValueError, TypeError):
+                raise TypeError(
+                    "Wrong type for parameter `max_value`. Expected"
+                    " 'auto', int or array of ints, got %r" % self.max_value
+                )
+            if max_value.ndim < 1 or max_value.shape[0] != X.shape[1]:
+                raise ValueError("Shape mismatch: if max_value is an array,"
+                                 " it has to be of shape (n_features,).")
+
+        self.max_value_ = max_value
+        # Each feature with m possible values contributes m - 1 output
+        # columns; feature_indices_[i] is the first output column of
+        # feature i.
+        max_value = np.hstack([[0], max_value - 1])
+        indices = np.cumsum(max_value)
+        self.feature_indices_ = indices
+
+        if self.max_value != 'auto' and self.handle_greater == 'error':
+            mask = (X >= self.max_value_).ravel()
+            if np.any(mask):
+                raise ValueError("handle_greater='error' but found %d feature"
+                                 " values which exceed max_value."
+                                 % np.count_nonzero(mask))
+        return self
+
+    def transform(self, X):
+        """Transform X using unary encoding.
+
+        Parameters
+        ----------
+        X : array-like, of shape (n_samples, n_features)
+            Input array of type int.
+            All feature values should be non-negative, otherwise a ValueError
+            will be raised.
+
+        Returns
+        -------
+        X_tr : array-like or sparse matrix, of shape \
+                (n_samples, n_encoded_features)
+            Transformed input.
+        """
+        check_is_fitted(self, 'max_value_')
+        X = check_array(X, dtype=np.int)
+        if np.any(X < 0):
+            raise ValueError("X needs to contain only non-negative integers.")
+        n_samples, n_features = X.shape
+
+        indices = self.feature_indices_
+        if n_features != indices.shape[0] - 1:
+            raise ValueError("X has different shape than during fitting."
+                             " Expected %d, got %d."
+                             % (indices.shape[0] - 1, n_features))
+
+        # If self.handle_greater is "clip" (or "warn"), the ordinal features
+        # of X that are greater than or equal to max_value_ are clipped to
+        # max_value_ - 1, i.e. all of the output columns corresponding to
+        # such a feature are filled with ones.
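+        # The transformed matrix is then assembled in CSR form: for a sample
+        # with (possibly clipped) value x in feature i, the x columns
+        # feature_indices_[i] .. feature_indices_[i] + x - 1 are set to one,
+        # and the remaining columns of that feature stay zero.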
+        mask = (X >= self.max_value_).ravel()
+        if np.any(mask):
+            if self.handle_greater == 'warn':
+                warnings.warn("Found %d feature values which exceed "
+                              "max_value during transform, clipping them."
+                              % np.count_nonzero(mask))
+            elif self.handle_greater == 'error':
+                raise ValueError("handle_greater='error' but found %d feature"
+                                 " values which exceed max_value during "
+                                 "transform." % np.count_nonzero(mask))
+
+        X_ceil = np.where(mask.reshape(X.shape), self.max_value_ - 1, X)
+        column_start = np.tile(indices[:-1], n_samples)
+        column_end = (indices[:-1] + X_ceil).ravel()
+        column_indices = np.hstack([np.arange(s, e) for s, e
+                                    in zip(column_start, column_end)])
+        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
+                                X_ceil.sum(axis=1))
+        data = np.ones(X_ceil.ravel().sum())
+        out = sparse.csr_matrix((data, (row_indices, column_indices)),
+                                shape=(n_samples, indices[-1]),
+                                dtype=self.dtype)
+
+        return out if self.sparse else out.toarray()
+
+    def inverse_transform(self, X):
+        """Convert the data back to the original representation.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix of shape \
+                (n_samples, n_encoded_features)
+            The transformed data.
+
+        Returns
+        -------
+        X_tr : array-like of shape (n_samples, n_features)
+            Inverse transformed array.
+        """
+
+        check_is_fitted(self, 'max_value_')
+        X = check_array(X, accept_sparse='csr', ensure_min_features=0)
+
+        n_samples, _ = X.shape
+        n_features = len(self.max_value_)
+        n_encoded_features = self.feature_indices_[-1]
+
+        # validate shape of passed X
+        msg = ("Shape of the passed X data is not correct. Expected {0} "
+               "columns, got {1}.")
+        if X.shape[1] != n_encoded_features:
+            raise ValueError(msg.format(n_encoded_features, X.shape[1]))
+
+        # return float dtype, even though it will contain int values
+        X_tr = np.zeros((n_samples, n_features), dtype=np.float)
+
+        for feature_idx, (start, stop) in enumerate(zip(
+                self.feature_indices_,
+                self.feature_indices_[1:])):
+
+            # sub = portion of the transformed matrix that corresponds to the
+            # current feature
+            sub = X[:, start:stop]
+
+            # the original category is the number of non-zero columns
+            categories = (sub != 0).sum(axis=1).ravel()
+            X_tr[:, feature_idx] = categories
+
+        return X_tr
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 792de88aa37de..59fd1e47a80e5 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -20,6 +20,7 @@
 
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing import UnaryEncoder
 
 
 def toarray(a):
@@ -676,3 +677,285 @@ def test_one_hot_encoder_warning():
     enc = OneHotEncoder()
     X = [['Male', 1], ['Female', 3]]
     np.testing.assert_no_warnings(enc.fit_transform, X)
+
+
+def _generate_random_features_matrix(n_samples=10, n_features=3,
+                                     n_categories_max=3):
+    rng = np.random.RandomState(6)
+    X = rng.randint(n_categories_max, size=(n_samples, n_features))
+    return X
+
+
+def test_unary_encoder():
+    X = np.arange(5).reshape(-1, 1)
+    enc = UnaryEncoder(5)
+    Xt = enc.fit_transform(X)
+    assert_array_equal(Xt, [[0, 0, 0, 0],   # 0
+                            [1, 0, 0, 0],   # 1
+                            [1, 1, 0, 0],   # 2
+                            [1, 1, 1, 0],   # 3
+                            [1, 1, 1, 1]])  # 4
+    Xt2 = enc.transform(X)
+    assert_array_equal(Xt2, Xt)
+
+
+def test_unary_encoder_stack():
+    # multiple input features stack to same output
+    rng = np.random.RandomState(6)
+    categories = rng.randint(2, 10)
+    size = rng.randint(1, 10)
+    n_features = rng.randint(2, 10)
+
+    encoder = UnaryEncoder(categories, sparse=False)
+    X_multi = _generate_random_features_matrix(size, n_features, categories)
+    X_multi_t = encoder.fit_transform(X_multi)
+    assert_equal(X_multi_t.shape, (size, n_features * (categories - 1)))
+
+    expected = np.hstack([encoder.fit_transform(X_multi[:, i:(i + 1)])
+                          for i in range(X_multi.shape[1])])
+    assert_array_equal(expected, X_multi_t)
+
+
+def test_unary_encoder_dense_sparse():
+    # test dense output in comparison to sparse results.
+    rng = np.random.RandomState(6)
+    categories = rng.randint(1, 10)
+    size = rng.randint(1, 10)
+    n_features = rng.randint(2, 10)
+
+    sparse_encoder = UnaryEncoder(categories, sparse=True)
+    dense_encoder = UnaryEncoder(categories)
+
+    X = _generate_random_features_matrix(size, n_features, categories)
+    X_trans_sparse = sparse_encoder.fit_transform(X)
+    X_trans_dense = dense_encoder.fit_transform(X)
+
+    assert_array_equal(X_trans_sparse.toarray(), X_trans_dense)
+
+
+def test_unary_encoder_handle_greater():
+    X = np.array([[0, 2, 1], [1, 0, 3], [2, 0, 2]])
+    y = np.array([[4, 1, 1]])
+
+    # Test that encoder raises error for greater features during transform.
+    encoder = UnaryEncoder(handle_greater='error')
+    encoder.fit(X)
+    assert_raises(ValueError, encoder.transform, y)
+
+    encoder = UnaryEncoder(handle_greater='error')
+    assert_array_equal(encoder.fit_transform(y),
+                       np.array([[1., 1., 1., 1., 1., 1.]]))
+
+    # Test that encoder raises error for greater features during fit when
+    # max_value is explicitly set.
+    encoder = UnaryEncoder(handle_greater='error', max_value=[2, 3, 4])
+    assert_raises(ValueError, encoder.fit, X)
+
+    encoder = UnaryEncoder(handle_greater='error', max_value=[2, 3, 4])
+    assert_raises(ValueError, encoder.fit_transform, X)
+
+    encoder = UnaryEncoder(handle_greater='error', max_value=[5, 2, 2])
+    encoder.fit(y)
+    assert_array_equal(encoder.transform(y),
+                       np.array([[1., 1., 1., 1., 1., 1.]]))
+
+    encoder = UnaryEncoder(handle_greater='error', max_value=[5, 2, 2])
+    assert_array_equal(encoder.fit_transform(y),
+                       np.array([[1., 1., 1., 1., 1., 1.]]))
+
+    # Test the clip option.
+    encoder = UnaryEncoder(handle_greater='clip')
+    encoder.fit(X)
+    assert_array_equal(
+        encoder.transform(y),
+        np.array([[1., 1., 1., 0., 1., 0., 0.]]))
+
+    encoder = UnaryEncoder(handle_greater='clip', max_value=[3, 2, 2])
+    assert_array_equal(
+        encoder.fit_transform(y),
+        np.array([[1., 1., 1., 1.]]))
+
+    # Test the warn option.
+    encoder = UnaryEncoder()
+    encoder.fit(X)
+    w = ('Found 1 feature values which exceed max_value during transform, '
+         'clipping them.')
+    y_transformed = assert_warns_message(UserWarning, w, encoder.transform, y)
+    assert_array_equal(
+        y_transformed,
+        np.array([[1., 1., 1., 0., 1., 0., 0.]]))
+
+    encoder = UnaryEncoder(max_value=[3, 2, 2])
+    y_transformed = assert_warns_message(UserWarning, w,
+                                         encoder.fit_transform, y)
+    assert_array_equal(
+        y_transformed,
+        np.array([[1., 1., 1., 1.]]))
+
+    encoder = UnaryEncoder(max_value=[5, 2, 2])
+    assert_array_equal(
+        encoder.fit_transform(y),
+        np.array([[1., 1., 1., 1., 1., 1.]]))
+
+    # Raise error if handle_greater is not one of 'warn', 'error' or 'clip'.
+    encoder = UnaryEncoder(handle_greater='42')
+    assert_raises(ValueError, encoder.fit, y)
+
+
+def test_unary_encoder_errors():
+    rng = np.random.RandomState(6)
+    max_value = rng.randint(2, 10)
+    size = rng.randint(1, 10)
+    n_features = rng.randint(2, 10)
+    delta = rng.randint(1, 10)
+
+    encoder = UnaryEncoder(max_value)
+    X = _generate_random_features_matrix(size, n_features, max_value)
+    encoder.fit(X)
+
+    # test that an error is raised when X has a different shape than during
+    # fitting
+    larger_n_features = n_features + delta
+    X_too_large = _generate_random_features_matrix(size, larger_n_features,
+                                                   max_value)
+    assert_raises(ValueError, encoder.transform, X_too_large)
+    error_msg = ("X has different shape than during fitting."
+                 " Expected {}, got {}.".format(n_features, larger_n_features))
+    assert_raises_regex(ValueError, error_msg, encoder.transform, X_too_large)
+
+    # test that an error is raised when a feature value is out of bounds
+    encoder = UnaryEncoder(max_value, handle_greater='error')
+    X = _generate_random_features_matrix(size, n_features, max_value)
+    encoder.fit(X)
+    X[0][0] = max_value + delta
+    X_out_of_bounds = X
+    assert_raises(ValueError, encoder.transform, X_out_of_bounds)
+    error_msg = ("handle_greater='error' but found 1 feature values which "
+                 "exceed max_value during transform.")
+    assert_raises_regex(ValueError, error_msg, encoder.transform,
+                        X_out_of_bounds)
+
+    # test exception on wrong init param
+    assert_raises(TypeError, UnaryEncoder(max_value=np.int).fit, X)
+
+    # test negative input to fit
+    encoder = UnaryEncoder()
+    assert_raises(ValueError, encoder.fit, [[0], [-1]])
+
+    # test negative input to transform
+    encoder.fit([[0], [1]])
+    assert_raises(ValueError, encoder.transform, [[0], [-1]])
+
+
+def test_unary_encoder_edge_cases():
+    EDGE_CASES = [
+        (
+            [[0], [1], [2]],
+            np.array([[0, 0], [1, 0], [1, 1]]),
+        ),
+        (
+            [[0], [0], [1]],
+            np.array([[0], [0], [1]]),
+        ),
+        (
+            [[0, 0], [0, 0], [0, 1]],
+            np.array([[0], [0], [1]]),
+        ),
+    ]
+
+    for input_matrix, expected_matrix in EDGE_CASES:
+        transformed = UnaryEncoder().fit_transform(input_matrix)
+        assert_array_equal(transformed, expected_matrix)
+
+
+def test_unary_encoder_max_value_int():
+    # Test UnaryEncoder's max_value parameter when set as an int.
+    rng = np.random.RandomState(6)
+    max_value = rng.randint(2, 10)
+    size = rng.randint(1, 10)
+    n_features = rng.randint(2, 10)
+    delta = rng.randint(1, 10)
+
+    encoder_max_value = max_value + delta
+    unary_max_value = encoder_max_value - 1
+    enc = UnaryEncoder(max_value=encoder_max_value)
+
+    X = _generate_random_features_matrix(size, n_features, max_value)
+    X_trans = enc.fit_transform(X)
+    assert_equal(X_trans.shape, (size, unary_max_value * n_features))
+    assert_array_equal(
+        enc.feature_indices_,
+        np.arange(0, (unary_max_value * n_features) + 1, unary_max_value)
+    )
+    assert_array_equal(
+        enc.max_value_,
+        np.array([encoder_max_value] * n_features)
+    )
+
+
+def test_unary_encoder_max_value_array():
+    # Test UnaryEncoder's max_value parameter when set as an array.
+    rng = np.random.RandomState(6)
+    max_value = rng.randint(2, 10)
+    size = rng.randint(1, 10)
+    n_features = rng.randint(2, 10)
+    delta = rng.randint(1, 10)
+
+    # Test that the ideal case is working fine
+    X = _generate_random_features_matrix(size, n_features, max_value)
+    max_value_array = list(np.max(X, axis=0) + 1)
+    enc = UnaryEncoder(max_value=max_value_array)
+    X_trans = enc.fit_transform(X)
+    assert_equal(X_trans.shape, (size, sum(max_value_array) - n_features))
+    assert_array_equal(
+        enc.feature_indices_,
+        np.cumsum(np.array([1] + max_value_array) - 1)
+    )
+    assert_array_equal(
+        enc.max_value_,
+        np.array(max_value_array)
+    )
+
+    # Test that fit_transform raises an error when
+    # len(max_value) != n_features
+    max_value_array = rng.randint(2, 10, n_features + delta)
+    enc = UnaryEncoder(max_value=max_value_array)
+    X = _generate_random_features_matrix(size, n_features, max_value)
+    assert_raises(ValueError, enc.fit_transform, X)
+
+    # Test that fit_transform also raises an error for an empty max_value
+    enc = UnaryEncoder(max_value=[])
+    X = _generate_random_features_matrix(size, n_features, max_value)
+    assert_raises(ValueError, enc.fit_transform, X)
+
+
+@pytest.mark.parametrize('sparse_', (True, False))
+@pytest.mark.parametrize('X', (
+    [[0], [0]],  # only one category (transformed into [])
+    [[1], [1]],  # only one category but implicitly 2
+    [[1, 0], [1, 1], [0, 1], [0, 2]]  # multiple categories
+))
+def test_unary_encoder_inverse_transform(sparse_, X):
+    enc = UnaryEncoder(sparse=sparse_)
+    assert_array_equal(X, enc.inverse_transform(enc.fit_transform(X)))
+
+
+def test_unary_encoder_inverse_transform_input():
+    X = [[1, 0],  # will be transformed into 1 + 2 = 3 columns
+         [1, 1],
+         [0, 1],
+         [0, 2]]
+    enc = UnaryEncoder().fit(X)
+    bad_X_tr = [[1, 1, 1, 0]]  # 4 columns
+    assert_raises_regex(
+        ValueError,
+        "Shape of the passed X data is not correct. Expected 3 columns, got 4",
+        enc.inverse_transform, bad_X_tr
+    )
+
+    # Also check that inverse_transform still works on non-binary matrices.
+    # Non-zero values are treated as ones.
+    X_inv = enc.inverse_transform([[4, 100, 0]])  # treated as [1, 1, 0]
+    assert_array_equal(X_inv, [[1, 1]])
+    X_inv = enc.inverse_transform([[4, 100, 123]])  # treated as [1, 1, 1]
+    assert_array_equal(X_inv, [[1, 2]])
+    X_inv = enc.inverse_transform([[0, 1, 123]])  # treated as [0, 1, 1]
+    assert_array_equal(X_inv, [[0, 2]])
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index 699026b9e47ee..1772aa87714f9 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -595,7 +595,7 @@ def uninstall_mldata_mock():
                'TfidfVectorizer', 'IsotonicRegression',
                'OneHotEncoder', 'RandomTreesEmbedding', 'OrdinalEncoder',
                'FeatureHasher', 'DummyClassifier', 'DummyRegressor',
-               'TruncatedSVD', 'PolynomialFeatures',
+               'TruncatedSVD', 'PolynomialFeatures', 'UnaryEncoder',
                'GaussianRandomProjectionHash', 'HashingVectorizer',
                'CheckingClassifier', 'PatchExtractor', 'CountVectorizer',
                # GradientBoosting base estimators, maybe should