diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 8e43f23af5d63..89896065d19bf 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1201,6 +1201,7 @@ Model validation
    preprocessing.Normalizer
    preprocessing.OneHotEncoder
    preprocessing.CategoricalEncoder
+   preprocessing.UnaryEncoder
    preprocessing.PolynomialFeatures
    preprocessing.PowerTransformer
    preprocessing.QuantileTransformer
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 9aac20dca9664..688eb0158e475 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -589,6 +589,66 @@ columns for this feature will be all zeros
 See :ref:`dict_feature_extraction` for categorical features that are
 represented as a dict, not as scalars.
 
+.. _preprocessing_ordinal_features:
+
+Encoding ordinal features
+=========================
+
+Often categorical features have a clear ordering. For example, a person could
+have features
+
+* ``["short", "tall"]``
+* ``["low income", "medium income", "high income"]``
+* ``["elementary school graduate", "high school graduate", "some college",
+  "college graduate"]``
+
+Even though these features can be ordered, we shouldn't necessarily assign
+scores to them, as the difference between categories one and two is not the
+same as the difference between categories two and three.
+
+One possibility to convert these ordinal features to features that can be
+used with scikit-learn estimators is to use a unary encoding, which is
+implemented in :class:`UnaryEncoder`. This estimator transforms each ordinal
+feature with ``m`` possible values into ``m - 1`` binary features, where the
+``i``-th feature is active if ``x > i`` (for ``i = 0, ..., m - 2``).
+
+.. note::
+
+    This encoding is likely to help when used with linear models and
+    kernel-based models like SVMs with the standard kernels. On the other
+    hand, this transformation is unlikely to help when used with tree-based
+    models, since those already work on the basis of a particular feature
+    value being < or > than a threshold, unlike linear and kernel-based
+    models.
+
+Continuing the example above::
+
+    >>> enc = preprocessing.UnaryEncoder()
+    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  # doctest: +ELLIPSIS
+    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn',
+           n_values='auto', ordinal_features='all', sparse=False)
+    >>> enc.transform([[0, 1, 1]])
+    array([[ 0.,  1.,  0.,  1.,  0.,  0.]])
+
+By default, how many values each feature can take is inferred automatically
+from the dataset. It is possible to specify this explicitly using the
+parameter ``n_values``. In the example above:
+
+* There are two heights, three income levels and four education levels in
+  our dataset.
+* Then we fit the estimator, and transform a data point.
+* In the result, the first number encodes the height, the next two numbers
+  the income level, and the next set of three numbers the education level.
+
+Note that, if there is a possibility that the training data might have
+missing ordinal values, one has to explicitly set ``n_values``. For
+example::
+
+    >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4])
+    >>> # Note that there are missing ordinal values for the 2nd and 3rd
+    >>> # features
+    >>> enc.fit([[1, 2, 3], [0, 2, 0]])  # doctest: +ELLIPSIS
+    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn',
+           n_values=[2, 3, 4], ordinal_features='all', sparse=False)
+    >>> enc.transform([[1, 1, 2]])
+    array([[ 1.,  1.,  0.,  1.,  1.,  0.]])
+
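+For intuition, the same encoding can be reproduced with plain NumPy. The
+following is only an illustrative sketch of the scheme (one threshold per
+output column, at ``0, ..., m - 2`` for each feature), not part of the
+estimator API; it reproduces the last ``enc.transform`` output above::
+
+    >>> import numpy as np
+    >>> thresholds = [np.arange(n - 1) for n in [2, 3, 4]]
+    >>> np.hstack([(x > t).astype(np.float64)
+    ...            for x, t in zip([1, 1, 2], thresholds)])
+    array([ 1.,  1.,  0.,  1.,  1.,  0.])
+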
 .. _imputation:
 
 Imputation of missing values
diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py
index 0563dd018881f..41cec92b4b493 100644
--- a/sklearn/preprocessing/__init__.py
+++ b/sklearn/preprocessing/__init__.py
@@ -26,6 +26,7 @@
 from .data import PowerTransformer
 from .data import CategoricalEncoder
 from .data import PolynomialFeatures
+from .data import UnaryEncoder
 
 from .label import label_binarize
 from .label import LabelBinarizer
@@ -65,4 +66,5 @@
     'label_binarize',
     'quantile_transform',
     'power_transform',
+    'UnaryEncoder',
 ]
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index e90ceaf7509b6..7d539f798816f 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -62,6 +62,7 @@
     'minmax_scale',
     'quantile_transform',
     'power_transform',
+    'UnaryEncoder',
 ]
 
 
@@ -1957,6 +1958,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         matrix indicating the presence of a class label.
     sklearn.preprocessing.LabelEncoder : encodes labels with values between
         0 and n_classes-1.
+    sklearn.preprocessing.UnaryEncoder : encodes ordinal integer features
+        using a unary scheme.
     """
     def __init__(self, n_values="auto", categorical_features="all",
                  dtype=np.float64, sparse=True, handle_unknown='error'):
@@ -2064,8 +2067,8 @@ def _transform(self, X):
         mask = (X < self.n_values_).ravel()
         if np.any(~mask):
             if self.handle_unknown not in ['error', 'ignore']:
-                raise ValueError("handle_unknown should be either error or "
-                                 "unknown got %s" % self.handle_unknown)
+                raise ValueError("handle_unknown should be either 'error' or "
+                                 "'ignore', got %s" % self.handle_unknown)
             if self.handle_unknown == 'error':
                 raise ValueError("unknown categorical feature present %s "
                                  "during transform." % X.ravel()[~mask])
@@ -3147,3 +3150,216 @@ def inverse_transform(self, X):
             X_tr[mask, idx] = None
 
         return X_tr
+
+
+class UnaryEncoder(BaseEstimator, TransformerMixin):
+    """Encode ordinal integer features using a unary scheme.
+
+    The input to this transformer should be a matrix of non-negative
+    integers, denoting the values taken on by ordinal (discrete) features.
+    The output is a matrix in which each ordinal feature with ``m`` possible
+    values is encoded as ``m - 1`` binary columns, where column ``i`` is
+    active if the feature value is greater than ``i``. It is assumed that
+    input features take on values in the range 0 to (n_values - 1).
+
+    This encoding is needed for feeding ordinal features to many scikit-learn
+    estimators, notably linear models and kernel-based models like SVMs with
+    the standard kernels.
+    This transformation is unlikely to help when used with tree-based models,
+    since those already work on the basis of a particular feature value being
+    < or > than a threshold, unlike linear and kernel-based models.
+
+    Read more in the :ref:`User Guide <preprocessing_ordinal_features>`.
+
+    Parameters
+    ----------
+    n_values : 'auto', int or array of ints
+        Number of values per feature.
+
+        - 'auto' : determine value range from training data.
+        - int : number of ordinal values per feature.
+          Each feature value should be in ``range(n_values)``.
+        - array : ``n_values[i]`` is the number of ordinal values in
+          ``X[:, i]``. Each feature value should be in
+          ``range(n_values[i])``.
+
+    ordinal_features : 'all' or array of indices or mask
+        Specify what features are treated as ordinal.
+
+        - 'all' (default): All features are treated as ordinal.
+        - array of indices: Array of ordinal feature indices.
+        - mask: Array of length n_features and with dtype=bool.
+
+        Non-ordinal features are always stacked to the right of the matrix.
+
+    dtype : number type, default=np.float64
+        Desired dtype of output.
+
+    sparse : boolean, default=False
+        Will return a sparse matrix if set True, else will return an array.
+
+    handle_greater : str, 'warn', 'error' or 'clip'
+        Whether to warn, raise an error or clip if an ordinal feature value
+        >= n_values is passed in.
+
+        - 'warn' (default): same as 'clip' but with a warning.
+        - 'error': raise an error if a feature value >= n_values is passed
+          in.
+        - 'clip': all feature values >= n_values are clipped to
+          (n_values - 1) during transform.
+
+    Attributes
+    ----------
+    feature_indices_ : array of shape (n_features + 1,)
+        Indices to feature ranges.
+        Feature ``i`` in the original data is mapped to columns
+        from ``feature_indices_[i]`` to ``feature_indices_[i + 1]``.
+
+    n_values_ : array of shape (n_features,)
+        Maximum number of values per feature.
+
+    Examples
+    --------
+    Given a dataset with three features and four samples, we let the encoder
+    find the maximum value per feature and transform the data to a unary
+    encoding.
+
+    >>> from sklearn.preprocessing import UnaryEncoder
+    >>> enc = UnaryEncoder()
+    >>> enc.fit([[0, 0, 3],
+    ...          [1, 1, 0],
+    ...          [0, 2, 1],
+    ...          [1, 0, 2]])  # doctest: +ELLIPSIS
+    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn',
+           n_values='auto', ordinal_features='all', sparse=False)
+    >>> enc.n_values_
+    array([2, 3, 4])
+    >>> enc.feature_indices_
+    array([0, 1, 3, 6])
+    >>> enc.transform([[0, 1, 2]])
+    array([[ 0.,  1.,  0.,  1.,  1.,  0.]])
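+
+    A feature value greater than or equal to the fitted ``n_values_`` is
+    clipped during ``transform`` (with a warning under the default
+    ``handle_greater='warn'``). For instance, feature 0 above was fit with
+    ``n_values_[0] == 2``, so the value 5 encodes like the value 1:
+
+    >>> enc.transform([[5, 0, 0]])  # doctest: +SKIP
+    array([[ 1.,  0.,  0.,  0.,  0.,  0.]])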
+
+    See also
+    --------
+    sklearn.preprocessing.OneHotEncoder : encodes categorical integer
+        features using a one-hot aka one-of-K scheme.
+    """
+    def __init__(self, n_values="auto", ordinal_features="all",
+                 dtype=np.float64, sparse=False, handle_greater='warn'):
+        self.n_values = n_values
+        self.ordinal_features = ordinal_features
+        self.dtype = dtype
+        self.sparse = sparse
+        self.handle_greater = handle_greater
+
+    def fit(self, X, y=None):
+        """Fit UnaryEncoder to X.
+
+        Parameters
+        ----------
+        X : array-like, shape [n_samples, n_features]
+            Input array of type int.
+            All feature values should be non-negative, otherwise a
+            ValueError will be raised.
+
+        Returns
+        -------
+        self
+        """
+        _transform_selected(X, self._fit, self.ordinal_features, copy=True)
+        return self
+
+    def _fit(self, X):
+        """Assumes X contains only ordinal features."""
+        X = check_array(X, dtype=np.int)
+        if self.handle_greater not in ['warn', 'error', 'clip']:
+            raise ValueError("handle_greater should be either 'warn', "
+                             "'error' or 'clip', got %s"
+                             % self.handle_greater)
+        if np.any(X < 0):
+            raise ValueError("X needs to contain only non-negative integers.")
+        n_samples, n_features = X.shape
+
+        if (isinstance(self.n_values, six.string_types) and
+                self.n_values == 'auto'):
+            n_values = np.max(X, axis=0) + 1
+        elif isinstance(self.n_values, numbers.Integral):
+            n_values = np.empty(n_features, dtype=np.int)
+            n_values.fill(self.n_values)
+        else:
+            try:
+                n_values = np.asarray(self.n_values, dtype=int)
+            except (ValueError, TypeError):
+                raise TypeError("Wrong type for parameter `n_values`. "
+                                "Expected 'auto', int or array of ints, "
+                                "got %r" % self.n_values)
+            if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
+                raise ValueError("Shape mismatch: if n_values is an array,"
+                                 " it has to be of shape (n_features,).")
+
+        self.n_values_ = n_values
+        # Each feature spans (n_values - 1) unary columns; the cumulative
+        # sum gives the column range of every feature.
+        n_values = np.hstack([[0], n_values - 1])
+        indices = np.cumsum(n_values)
+        self.feature_indices_ = indices
+
+        mask = (X >= self.n_values_).ravel()
+        if np.any(mask):
+            if self.handle_greater == 'error':
+                raise ValueError("handle_greater='error' but found %d feature"
+                                 " values which exceed n_values."
+                                 % np.count_nonzero(mask))
+
+        return X
+
+    def _transform(self, X):
+        """Assumes X contains only ordinal features."""
+        X = check_array(X, dtype=np.int)
+        if np.any(X < 0):
+            raise ValueError("X needs to contain only non-negative integers.")
+        n_samples, n_features = X.shape
+
+        indices = self.feature_indices_
+        if n_features != indices.shape[0] - 1:
+            raise ValueError("X has different shape than during fitting."
+                             " Expected %d, got %d."
+                             % (indices.shape[0] - 1, n_features))
+
+        # We clip those ordinal features of X that are greater than n_values_
+        # using mask if self.handle_greater is "clip".
+        # This means, the row_indices and col_indices corresponding to the
+        # greater ordinal feature are all filled with ones.
+        mask = (X >= self.n_values_).ravel()
+        if np.any(mask):
+            if self.handle_greater == 'warn':
+                warnings.warn("Found %d feature values which exceed "
+                              "n_values during transform, clipping them."
+                              % np.count_nonzero(mask))
+            elif self.handle_greater == 'error':
+                raise ValueError("handle_greater='error' but found %d feature"
+                                 " values which exceed n_values during "
+                                 "transform." % np.count_nonzero(mask))
+
+        X_ceil = np.where(mask.reshape(X.shape), self.n_values_ - 1, X)
+        # Build the output in COO format: sample i gets ones in the first
+        # X_ceil[i, j] columns of feature j's column range.
+        column_start = np.tile(indices[:-1], n_samples)
+        column_end = (indices[:-1] + X_ceil).ravel()
+        column_indices = np.hstack([np.arange(s, e) for s, e
+                                    in zip(column_start, column_end)])
+        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
+                                X_ceil.sum(axis=1))
+        data = np.ones(X_ceil.ravel().sum())
+        out = sparse.coo_matrix((data, (row_indices, column_indices)),
+                                shape=(n_samples, indices[-1]),
+                                dtype=self.dtype).tocsr()
+
+        return out if self.sparse else out.toarray()
+
+    def transform(self, X):
+        """Transform X using unary encoding.
+
+        Parameters
+        ----------
+        X : array-like, shape [n_samples, n_features]
+            Input array of type int.
+            All feature values should be non-negative, otherwise a
+            ValueError will be raised.
+
+        Returns
+        -------
+        X_out : sparse matrix if sparse=True else a 2-d array
+            Transformed input, cast to ``self.dtype``.
+        """
+        return _transform_selected(X, self._transform,
+                                   self.ordinal_features, copy=True)
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index abb17142efc77..905165f4ffd36 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -42,6 +42,7 @@
 from sklearn.preprocessing.data import normalize
 from sklearn.preprocessing.data import OneHotEncoder
 from sklearn.preprocessing.data import CategoricalEncoder
+from sklearn.preprocessing.data import UnaryEncoder
 from sklearn.preprocessing.data import StandardScaler
 from sklearn.preprocessing.data import scale
 from sklearn.preprocessing.data import MinMaxScaler
@@ -2228,6 +2229,244 @@ def test_quantile_transform_valid_axis():
                         ". Got axis=2", quantile_transform, X.T, axis=2)
 
 
+def _generate_random_features_matrix(n_samples=10, n_features=3,
+                                     n_values_max=3):
+    rng = np.random.RandomState(6)
+    X = rng.randint(n_values_max, size=(n_samples, n_features))
+    return X
+
+
+def test_unary_encoder():
+    X = np.arange(5).reshape(-1, 1)
+    enc = UnaryEncoder(5)
+    Xt = enc.fit_transform(X)
+    assert_array_equal(Xt, [[0, 0, 0, 0],   # 0
+                            [1, 0, 0, 0],   # 1
+                            [1, 1, 0, 0],   # 2
+                            [1, 1, 1, 0],   # 3
+                            [1, 1, 1, 1]])  # 4
+    Xt2 = enc.transform(X)
+    assert_array_equal(Xt2, Xt)
+
+
+def test_unary_encoder_stack():
+    # multiple input features stack to same output
+    n_values = rng.randint(2, 10)
+    size = rng.randint(1, 10)
+
+    encoder = UnaryEncoder(n_values, sparse=False)
+    X_multi = _generate_random_features_matrix(size, n_features, n_values)
+    X_multi_t = encoder.fit_transform(X_multi)
+    assert_equal(X_multi_t.shape, (size, n_features * (n_values - 1)))
+
+    expected = np.hstack([encoder.fit_transform(X_multi[:, i:(i + 1)])
+                          for i in range(X_multi.shape[1])])
+    assert_array_equal(expected, X_multi_t)
+
+
+def test_unary_encoder_dense_sparse():
+    # test dense output in comparison to sparse results.
+    n_values = rng.randint(1, 10)
+    size = rng.randint(1, 10)
+
+    sparse_encoder = UnaryEncoder(n_values, sparse=True)
+    dense_encoder = UnaryEncoder(n_values)
+
+    X = _generate_random_features_matrix(size, n_features, n_values)
+    X_trans_sparse = sparse_encoder.fit_transform(X)
+    X_trans_dense = dense_encoder.fit_transform(X)
+
+    assert_array_equal(X_trans_sparse.toarray(), X_trans_dense)
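+
+
+def test_unary_encoder_zero_row():
+    # Sanity check of the scheme itself: the all-zeros sample encodes to all
+    # zeros regardless of n_values, since no value exceeds any threshold.
+    enc = UnaryEncoder(n_values=[2, 3, 4])
+    enc.fit([[1, 2, 3]])
+    assert_array_equal(enc.transform([[0, 0, 0]]), np.zeros((1, 6)))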
+
+
+def test_unary_encoder_handle_greater():
+    X = np.array([[0, 2, 1], [1, 0, 3], [2, 0, 2]])
+    y = np.array([[4, 1, 1]])
+
+    # Test that encoder raises error for greater feature values during
+    # transform.
+    encoder = UnaryEncoder(handle_greater='error')
+    encoder.fit(X)
+    assert_raises(ValueError, encoder.transform, y)
+
+    encoder = UnaryEncoder(handle_greater='error')
+    assert_array_equal(encoder.fit_transform(y),
+                       np.array([[1., 1., 1., 1., 1., 1.]]))
+
+    # Test that encoder raises error for greater feature values during fit
+    # when n_values is explicitly set.
+    encoder = UnaryEncoder(handle_greater='error', n_values=[2, 3, 4])
+    assert_raises(ValueError, encoder.fit, X)
+
+    encoder = UnaryEncoder(handle_greater='error', n_values=[2, 3, 4])
+    assert_raises(ValueError, encoder.fit_transform, X)
+
+    encoder = UnaryEncoder(handle_greater='error', n_values=[5, 2, 2])
+    encoder.fit(y)
+    assert_array_equal(encoder.transform(y),
+                       np.array([[1., 1., 1., 1., 1., 1.]]))
+
+    encoder = UnaryEncoder(handle_greater='error', n_values=[5, 2, 2])
+    assert_array_equal(encoder.fit_transform(y),
+                       np.array([[1., 1., 1., 1., 1., 1.]]))
+
+    # Test the clip option.
+    encoder = UnaryEncoder(handle_greater='clip')
+    encoder.fit(X)
+    assert_array_equal(
+        encoder.transform(y),
+        np.array([[1., 1., 1., 0., 1., 0., 0.]]))
+
+    encoder = UnaryEncoder(handle_greater='clip', n_values=[3, 2, 2])
+    assert_array_equal(
+        encoder.fit_transform(y),
+        np.array([[1., 1., 1., 1.]]))
+
+    # Test the warn option.
+    encoder = UnaryEncoder()
+    encoder.fit(X)
+    w = ('Found 1 feature values which exceed n_values during transform, '
+         'clipping them.')
+    y_transformed = assert_warns_message(UserWarning, w, encoder.transform, y)
+    assert_array_equal(
+        y_transformed,
+        np.array([[1., 1., 1., 0., 1., 0., 0.]]))
+
+    encoder = UnaryEncoder(n_values=[3, 2, 2])
+    y_transformed = assert_warns_message(UserWarning, w,
+                                         encoder.fit_transform, y)
+    assert_array_equal(
+        y_transformed,
+        np.array([[1., 1., 1., 1.]]))
+
+    encoder = UnaryEncoder(n_values=[5, 2, 2])
+    assert_array_equal(
+        encoder.fit_transform(y),
+        np.array([[1., 1., 1., 1., 1., 1.]]))
+
+    # Raise error if handle_greater is not one of 'warn', 'error' or 'clip'.
+    encoder = UnaryEncoder(handle_greater='42')
+    assert_raises(ValueError, encoder.fit, y)
+
+
+def test_unary_encoder_errors():
+    n_values = rng.randint(2, 10)
+    size = rng.randint(1, 10)
+    delta = rng.randint(1, 10)
+
+    encoder = UnaryEncoder(n_values)
+    X = _generate_random_features_matrix(size, n_features, n_values)
+    encoder.fit(X)
+
+    # test that an error is raised when X has a different shape than during
+    # fitting
+    larger_n_features = n_features + delta
+    X_too_large = _generate_random_features_matrix(size, larger_n_features,
+                                                   n_values)
+    assert_raises(ValueError, encoder.transform, X_too_large)
+    error_msg = ("X has different shape than during fitting."
+                 " Expected {}, got {}.".format(n_features,
+                                                larger_n_features))
+    assert_raises_regex(ValueError, error_msg, encoder.transform,
+                        X_too_large)
+
+    # test that an error is raised when a feature value is out of bounds
+    encoder = UnaryEncoder(n_values, handle_greater='error')
+    X = _generate_random_features_matrix(size, n_features, n_values)
+    encoder.fit(X)
+    X[0][0] = n_values + delta
+    X_out_of_bounds = X
+    assert_raises(ValueError, encoder.transform, X_out_of_bounds)
+    error_msg = ("handle_greater='error' but found 1 feature values which "
+                 "exceed n_values during transform.")
+    assert_raises_regex(ValueError, error_msg, encoder.transform,
+                        X_out_of_bounds)
+
+    # test exception on wrong init param
+    assert_raises(TypeError, UnaryEncoder(n_values=np.int).fit, X)
+
+    # test negative input to fit
+    encoder = UnaryEncoder()
+    assert_raises(ValueError, encoder.fit, [[0], [-1]])
+
+    # test negative input to transform
+    encoder.fit([[0], [1]])
+    assert_raises(ValueError, encoder.transform, [[0], [-1]])
+
+
+def test_unary_encoder_edge_cases():
+    EDGE_CASES = [
+        (
+            [[0], [1], [2]],
+            np.array([[0, 0], [1, 0], [1, 1]]),
+        ),
+        (
+            [[0], [0], [1]],
+            np.array([[0], [0], [1]]),
+        ),
+        (
+            [[0, 0], [0, 0], [0, 1]],
+            np.array([[0], [0], [1]]),
+        ),
+    ]
+
+    for input_matrix, expected_matrix in EDGE_CASES:
+        transformed = UnaryEncoder().fit_transform(input_matrix)
+        assert_array_equal(transformed, expected_matrix)
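+
+
+def test_unary_encoder_monotone_blocks():
+    # Unary ("thermometer") codes are monotone within each feature's block
+    # of columns: a 1 never appears to the right of a 0, so sorting each row
+    # of a block in descending order must leave it unchanged.
+    X = _generate_random_features_matrix(n_samples=25, n_values_max=5)
+    enc = UnaryEncoder()
+    Xt = enc.fit_transform(X)
+    indices = enc.feature_indices_
+    for start, end in zip(indices[:-1], indices[1:]):
+        block = Xt[:, start:end]
+        assert_array_equal(block, -np.sort(-block, axis=1))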
+
+
+def test_unary_encoder_n_values_int():
+    # Test UnaryEncoder's n_values parameter when set as an int.
+    n_values = rng.randint(2, 10)
+    size = rng.randint(1, 10)
+    delta = rng.randint(1, 10)
+
+    encoder_n_values = n_values + delta
+    unary_n_values = encoder_n_values - 1
+    enc = UnaryEncoder(n_values=encoder_n_values)
+
+    X = _generate_random_features_matrix(size, n_features, n_values)
+    X_trans = enc.fit_transform(X)
+    assert_equal(X_trans.shape, (size, unary_n_values * n_features))
+    assert_array_equal(
+        enc.feature_indices_,
+        np.arange(0, (unary_n_values * n_features) + 1, unary_n_values)
+    )
+    assert_array_equal(
+        enc.n_values_,
+        np.array([encoder_n_values] * n_features)
+    )
+
+
+def test_unary_encoder_n_values_array():
+    # Test UnaryEncoder's n_values parameter when set as an array.
+    n_values = rng.randint(2, 10)
+    size = rng.randint(1, 10)
+    delta = rng.randint(1, 10)
+
+    # Test that the encoder works as expected when n_values matches the data
+    X = _generate_random_features_matrix(size, n_features, n_values)
+    n_values_array = list(np.max(X, axis=0) + 1)
+    enc = UnaryEncoder(n_values=n_values_array)
+    X_trans = enc.fit_transform(X)
+    assert_equal(X_trans.shape, (size, sum(n_values_array) - n_features))
+    assert_array_equal(
+        enc.feature_indices_,
+        np.cumsum(np.array([1] + n_values_array) - 1)
+    )
+    assert_array_equal(
+        enc.n_values_,
+        np.array(n_values_array)
+    )
+
+    # Test that fit_transform raises an error when
+    # len(n_values) != n_features
+    n_values_array = rng.randint(2, 10, n_features + delta)
+    enc = UnaryEncoder(n_values=n_values_array)
+    X = _generate_random_features_matrix(size, n_features, n_values)
+    assert_raises(ValueError, enc.fit_transform, X)
+
+    # Likewise when n_values is an empty list
+    enc = UnaryEncoder(n_values=[])
+    X = _generate_random_features_matrix(size, n_features, n_values)
+    assert_raises(ValueError, enc.fit_transform, X)
+
+
 def test_power_transformer_notfitted():
     pt = PowerTransformer(method='box-cox')
     X = np.abs(X_1col)
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index 6e2d9d5902add..c25bc6f59269c 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -532,7 +532,7 @@ def uninstall_mldata_mock():
             'TfidfVectorizer', 'IsotonicRegression',
             'OneHotEncoder', 'RandomTreesEmbedding', 'CategoricalEncoder',
             'FeatureHasher', 'DummyClassifier', 'DummyRegressor',
-            'TruncatedSVD', 'PolynomialFeatures',
+            'TruncatedSVD', 'PolynomialFeatures', 'UnaryEncoder',
            'GaussianRandomProjectionHash', 'HashingVectorizer',
            'CheckingClassifier', 'PatchExtractor', 'CountVectorizer',
            # GradientBoosting base estimators, maybe should