diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 84f8097cbbe9d..9dad582f08000 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1415,6 +1415,7 @@ details.
    preprocessing.QuantileTransformer
    preprocessing.RobustScaler
    preprocessing.StandardScaler
+   preprocessing.TargetRegressorEncoder

 .. autosummary::
    :toctree: generated/
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 801d9a98ed1f4..af31bb845c44a 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -466,6 +466,9 @@ Such features can be efficiently coded as integers, for instance
 ``[0, 1, 3]`` while ``["female", "from Asia", "uses Chrome"]`` would be
 ``[1, 2, 1]``.

+OrdinalEncoder
+--------------
+
 To convert categorical features to such integer codes, we can use the
 :class:`OrdinalEncoder`. This estimator transforms each categorical feature
 to one new feature of integers (0 to n_categories - 1)::
@@ -482,6 +485,9 @@ scikit-learn estimators, as these expect continuous input, and would interpret
 the categories as being ordered, which is often not desired (i.e. the set of
 browsers was ordered arbitrarily).

+OneHotEncoder
+-------------
+
 Another possibility to convert categorical features to features that can be
 used with scikit-learn estimators is to use a one-of-K, also known as one-hot
 or dummy encoding.
@@ -611,6 +617,68 @@ separate categories::

 See :ref:`dict_feature_extraction` for categorical features that are
 represented as a dict, not as scalars.

+.. _target_regressor_encoder:
+
+Target Regressor Encoder
+------------------------
+
+The :class:`~sklearn.preprocessing.TargetRegressorEncoder` uses target
+statistics conditioned on the categorical feature for encoding [FLO]_ [MIC]_.
+This encoding scheme is useful for categorical features with high cardinality,
+where one-hot encoding would inflate the feature space, making it more
+expensive for a downstream model to process. A classic example of
+high-cardinality categorical features is location data, such as zip code or
+region. The :class:`~sklearn.preprocessing.TargetRegressorEncoder`
+implementation uses partial-pooling estimates from a multilevel model to
+encode categorical features. Specifically, we use a multilevel generalized
+linear model (GLM) approach for estimating the partial-pooling values, as
+shown in Chapter 12.2 of [GEL]_:
+
+.. math::
+
+    \alpha_c \approx \frac{\frac{n_c}{\sigma_c^2}\mu_c +
+    \frac{1}{\sigma^2}\mu}{\frac{n_c}{\sigma_c^2} + \frac{1}{\sigma^2}}
+
+where :math:`\alpha_c` is the encoding for category :math:`c`, :math:`n_c` is
+the category count, :math:`\mu_c` is the target mean conditioned on :math:`c`,
+:math:`\sigma_c^2` is the target variance conditioned on :math:`c`,
+:math:`\mu` is the overall target mean, and :math:`\sigma^2` is the overall
+target variance. The averages of categories with smaller counts carry less
+information, so their multilevel estimates are pulled closer to the overall
+mean. For categories with larger counts, the multilevel estimates are pulled
+closer to the mean conditioned on the category.
+
+For example, the following toy dataset contains two categories, where the
+`'cat'` category appears 3 times and the `'dog'` category appears 50 times.
+The encoding for `'cat'` is pulled toward the overall mean of about `59`
+compared to `'dog'`, because the `'cat'` category appears less frequently::
+
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import TargetRegressorEncoder
+    >>> X = np.array([['cat'] * 3 + ['dog'] * 50]).T
+    >>> y = np.array([0, 30, 60] + [60, 62] * 25)
+    >>> enc = TargetRegressorEncoder().fit(X, y)
+    >>> enc.transform([['cat'], ['dog']])
+    array([[50...],
+           [60...]])
+    >>> enc.encoding_mean_
+    59...
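+
+To make the shrinkage explicit, we can reproduce the encoding of `'cat'` by
+evaluating the partial-pooling formula above directly. The following is a
+minimal sketch using plain NumPy (the variable names are only illustrative
+and not part of the scikit-learn API)::
+
+    import numpy as np
+
+    y = np.array([0, 30, 60] + [60, 62] * 25)
+    n_c, mu_c, var_c = 3, y[:3].mean(), y[:3].var()  # statistics for 'cat'
+    mu, var = y.mean(), y.var()                      # overall statistics
+    alpha_cat = (n_c / var_c * mu_c + mu / var) / (n_c / var_c + 1 / var)
+    # alpha_cat evaluates to roughly 50.4, matching the transform output
+    # above: the estimate lies between mu_c (30) and the overall mean (~59.2)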
The +encoding for `'cat'` is pulled toward the overall mean of `53` when compared to +`'dog'` because the `'cat'` category appears less frequently:: + + >>> from sklearn.preprocessing import TargetRegressorEncoder + >>> X = np.array([['cat'] * 3 + ['dog'] * 50]).T + >>> y = np.array([0, 30, 60] + [60, 62] * 25) + >>> enc = TargetRegressorEncoder().fit(X, y) + >>> enc.transform([['cat'], ['dog']]) + array([[50...], + [60...]]) + >>> enc.encoding_mean_ + 59... + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_preprocessing_plot_target_regressor_encoder.py` + +.. topic:: References + + .. [GEL] Gelman A, Hill J. Data Analysis using Regression and + Multilevel/Hierarchical Models. Cambridge University Press, 2007 + .. [FLO] Pargent, Florian. `A Benchmark Experiment on How to Encode + Categorical Features in Predictive Modeling ` + OSF, 24 Mar. 2019. + .. [MIC] Micci-Barreca D (2001) `A preprocessing scheme for + high-cardinality categorical attributes in classification and prediction + problems `, ACM SIGKDD + Explorations Newsletter, 3(1), 27-32. + .. _preprocessing_discretization: Discretization diff --git a/examples/preprocessing/plot_target_regressor_encoder.py b/examples/preprocessing/plot_target_regressor_encoder.py new file mode 100644 index 0000000000000..8182172bbc5c8 --- /dev/null +++ b/examples/preprocessing/plot_target_regressor_encoder.py @@ -0,0 +1,138 @@ +""" +============================= +Target Encoder for Regressors +============================= + +.. currentmodule:: sklearn.preprocessing + +The :class:`TargetRegressorEncoder` uses target statistics conditioned on +the categorical features for encoding. In this example, we will compare +:class:`TargetRegressorEncoder`, :class:`OrdinalEncoder`, and dropping the +category on a wine review dataset. +""" + +# %% +# Loading Data from OpenML +# ======================== +# First, we load the wine reviews dataset, where the target is the points given +# be a reviewer: +import warnings +from sklearn.datasets import fetch_openml + +with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + wine_reviews = fetch_openml(data_id=42074, as_frame=True) + +df = wine_reviews.frame +df.head() + +# %% +# For this example, we use the following subset of numerical and categorical +# features in the data. The categorical features have a cardinality ranging +# from 18 to 14810: +numerical_features = ['price'] +categorical_features = ['country', 'province', 'region_1', 'region_2', + 'variety', 'winery'] + +X = df[numerical_features + categorical_features] +y = df['points'] +X.nunique().sort_values(ascending=False) + +# %% +# We split the dataset into a training and test set: +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + +print(f"Samples in training set: {len(X_train)}\n" + f"Samples in test set: {len(X_test)}") + +# %% +# Building and Training Pipelines with Different Encoders +# ======================================================= +# Dropping the categorical features +# --------------------------------- +# As a basline, we construct a pipeline where the categorical features are +# dropped. 
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn import set_config
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.impute import SimpleImputer
+
+set_config(display='diagram')  # Show HTML representation of pipeline
+
+prep = ColumnTransformer([
+    ('num', SimpleImputer(strategy='median'), numerical_features),
+    ('cat', 'drop', categorical_features)
+])
+
+reg_drop_cats = Pipeline([
+    ('prep', prep), ('hist', HistGradientBoostingRegressor())
+])
+reg_drop_cats
+
+# %%
+# Here we train the pipeline and use the root mean squared error to evaluate
+# the baseline model:
+from sklearn.metrics import mean_squared_error
+
+reg_drop_cats.fit(X_train, y_train)
+reg_drop_cats_rmse = mean_squared_error(
+    y_test, reg_drop_cats.predict(X_test), squared=False
+)
+print(f"RMSE for dropping categorical features: {reg_drop_cats_rmse:.4}")
+
+# %%
+# Using the OrdinalEncoder
+# ------------------------
+# Since the categorical features have missing values, we impute them with the
+# placeholder `'sk_missing'` before passing them to the
+# :class:`OrdinalEncoder`. The `categories` parameter is constructed such
+# that there are no unknown values at test time:
+from sklearn.preprocessing import OrdinalEncoder
+
+categories = [
+    X[feat].fillna("sk_missing").unique() for feat in categorical_features
+]
+
+cat_prep = Pipeline([
+    ('imputer', SimpleImputer(strategy='constant', missing_values=None,
+                              fill_value='sk_missing')),
+    ('encoder', OrdinalEncoder(categories=categories))
+])
+
+# %%
+# We modify the original pipeline to use the ordinal categorical
+# preprocessing:
+reg_ordinal = reg_drop_cats.set_params(prep__cat=cat_prep)
+reg_ordinal
+
+# %%
+# When we include the categorical features through ordinal encoding, the RMSE
+# improves:
+reg_ordinal.fit(X_train, y_train)
+reg_ordinal_rmse = mean_squared_error(
+    y_test, reg_ordinal.predict(X_test), squared=False
+)
+print(f"RMSE with ordinal encoding: {reg_ordinal_rmse:.4}")
+
+# %%
+# Using the TargetRegressorEncoder
+# --------------------------------
+# Finally, we replace the ordinal encoder with the
+# :class:`TargetRegressorEncoder`:
+from sklearn.preprocessing import TargetRegressorEncoder
+
+reg_target = reg_ordinal.set_params(
+    prep__cat__encoder=TargetRegressorEncoder())
+reg_target
+
+# %%
+# The :class:`TargetRegressorEncoder` further improves the RMSE:
+reg_target.fit(X_train, y_train)
+reg_target_rmse = mean_squared_error(
+    y_test, reg_target.predict(X_test), squared=False
+)
+print(f"RMSE with target encoding: {reg_target_rmse:.4}")
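+
+# %%
+# Inspecting the learned encodings
+# --------------------------------
+# As a final sanity check (an illustrative sketch, not part of the benchmark
+# above), we can pull the fitted encoder out of the pipeline and verify that
+# it learned one encoding value per category for each categorical feature.
+# The lengths below mirror the cardinalities reported earlier, plus the
+# `'sk_missing'` category added by the imputer for columns that contained
+# missing values:
+target_encoder = reg_target['prep'].named_transformers_['cat']['encoder']
+{feat: len(enc) for feat, enc in zip(categorical_features,
+                                     target_encoder.encodings_)}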
diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py
index d048b30e1f3d0..03cdb57ba0694 100644
--- a/sklearn/preprocessing/__init__.py
+++ b/sklearn/preprocessing/__init__.py
@@ -27,6 +27,7 @@
 from ._encoders import OneHotEncoder
 from ._encoders import OrdinalEncoder
+from ._target_encoder import TargetRegressorEncoder

 from ._label import label_binarize
 from ._label import LabelBinarizer
@@ -43,6 +44,7 @@
     'KernelCenterer',
     'LabelBinarizer',
     'LabelEncoder',
+    'TargetRegressorEncoder',
     'MultiLabelBinarizer',
     'MinMaxScaler',
     'MaxAbsScaler',
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 6d59cb5a0d858..4b05ffec7bbb7 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -11,7 +11,7 @@
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _deprecate_positional_args

-from ..utils._encode import _encode, _check_unknown, _unique
+from ..utils._encode import _encode, _check_unknown, _unique, _get_counts


 __all__ = [
@@ -73,7 +73,8 @@ def _get_feature(self, X, feature_idx):
             # numpy arrays, sparse arrays
             return X[:, feature_idx]

-    def _fit(self, X, handle_unknown='error', force_all_finite=True):
+    def _fit(self, X, handle_unknown='error', force_all_finite=True,
+             return_counts=False):
         X_list, n_samples, n_features = self._check_X(
             X, force_all_finite=force_all_finite)
@@ -83,11 +84,17 @@
                 " it has to be of shape (n_features,).")

         self.categories_ = []
+        category_counts = []

         for i in range(n_features):
             Xi = X_list[i]
             if self.categories == 'auto':
-                cats = _unique(Xi)
+                result = _unique(Xi, return_counts=return_counts)
+                if return_counts:
+                    cats, counts = result
+                    category_counts.append(counts)
+                else:
+                    cats = result
             else:
                 cats = np.array(self.categories[i], dtype=Xi.dtype)
                 if Xi.dtype.kind not in 'OU':
@@ -107,8 +114,14 @@
                         msg = ("Found unknown categories {0} in column {1}"
                                " during fit".format(diff, i))
                         raise ValueError(msg)
+                if return_counts:
+                    category_counts.append(_get_counts(Xi, cats))
+
             self.categories_.append(cats)

+        return {'category_counts': category_counts,
+                'n_samples': n_samples}
+
     def _transform(self, X, handle_unknown='error', force_all_finite=True):
         X_list, n_samples, n_features = self._check_X(
             X, force_all_finite=force_all_finite)
diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py
new file mode 100644
index 0000000000000..7fefab56f68bd
--- /dev/null
+++ b/sklearn/preprocessing/_target_encoder.py
@@ -0,0 +1,168 @@
+import numpy as np
+
+from ..utils.validation import check_is_fitted
+from ..utils.validation import column_or_1d
+from ._encoders import _BaseEncoder
+
+
+class TargetRegressorEncoder(_BaseEncoder):
+    """Target Encoder for Regression Targets.
+
+    Each category is encoded based on its effect on the target variable. The
+    encoding scheme takes a weighted average estimated by a multilevel
+    linear model.
+
+    Read more in the :ref:`User Guide <target_regressor_encoder>`.
+
+    Parameters
+    ----------
+    categories : 'auto' or a list of array-like, default='auto'
+        Categories (unique values) per feature:
+
+        - 'auto' : Determine categories automatically from the training data.
+        - list : `categories[i]` holds the categories expected in the ith
+          column. The passed categories should not mix strings and numeric
+          values within a single feature, and should be sorted in ascending
+          order in the case of numeric values.
+
+        The used categories can be found in the `categories_` attribute.
+
+    Attributes
+    ----------
+    encodings_ : list of shape (n_features,) of ndarray
+        For feature `i`, `encodings_[i]` holds the encoding matching the
+        categories listed in `categories_[i]`.
+
+    categories_ : list of shape (n_features,) of ndarray
+        The categories of each feature determined during fitting
+        (in order of the features in X and corresponding with the output
+        of :meth:`transform`).
+
+    encoding_mean_ : float
+        The overall mean of the target.
+
+    See Also
+    --------
+    sklearn.preprocessing.OrdinalEncoder : Performs an ordinal (integer)
+        encoding of the categorical features.
+    sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of
+        categorical features.
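+
+    Examples
+    --------
+    Encoding a rare and a frequent category (a short sketch mirroring the
+    toy example in the User Guide; the encoding of the rare `'cat'` category
+    is shrunk toward the overall target mean):
+
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import TargetRegressorEncoder
+    >>> X = np.array([['cat'] * 3 + ['dog'] * 50]).T
+    >>> y = np.array([0, 30, 60] + [60, 62] * 25)
+    >>> enc = TargetRegressorEncoder().fit(X, y)
+    >>> enc.transform([['cat'], ['dog']])
+    array([[50...],
+           [60...]])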
+    """
+    def __init__(self, categories='auto'):
+        self.categories = categories
+
+    def fit(self, X, y):
+        """Fit the TargetRegressorEncoder to X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data to determine the categories of each feature.
+
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        self : object
+        """
+        self._fit(X, y)
+        return self
+
+    def transform(self, X, y=None):
+        """Encode X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data to encode.
+
+        Returns
+        -------
+        X_out : ndarray of shape (n_samples, n_features)
+            Transformed input.
+        """
+        check_is_fitted(self)
+        X_int, X_known = self._transform(X, handle_unknown='ignore',
+                                         force_all_finite='allow-nan')
+        return self._transform_X_int(X_int, X_known)
+
+    def fit_transform(self, X, y):
+        """Fit the encoder and encode `X`.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data to encode.
+
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        X_out : ndarray of shape (n_samples, n_features)
+            Transformed input.
+        """
+        X_int, X_known = self._fit(X, y)
+        return self._transform_X_int(X_int, X_known)
+
+    def _transform_X_int(self, X_int, X_known):
+        """Transform integer-encoded X. `X_int` and `X_known` are returned
+        by `_transform`.
+        """
+        out = np.empty_like(X_int, dtype=float)
+        for i, cat_encoding in enumerate(self.encodings_):
+            np.take(cat_encoding, X_int[:, i], out=out[:, i])
+            # unknown categories are encoded with the overall target mean
+            out[~X_known[:, i], i] = self.encoding_mean_
+        return out
+
+    def _fit(self, X, y):
+        """Fit the encoder."""
+        y = column_or_1d(y, warn=True)
+        fit_results = super()._fit(X, handle_unknown='ignore',
+                                   return_counts=True,
+                                   force_all_finite='allow-nan')
+        X_int, X_known = self._transform(X, handle_unknown='ignore',
+                                         force_all_finite='allow-nan')
+        # make sure unknown categories are not used for fitting
+        X_int[~X_known] = -1
+        self.encoding_mean_ = y_mean = np.mean(y)
+
+        # if y is constant, the encoding of every category is that constant
+        if np.ptp(y) == 0.0:
+            self.encodings_ = [
+                np.full(len(cat), fill_value=y_mean, dtype=float)
+                for cat in self.categories_]
+            return X_int, X_known
+
+        y_variance = np.var(y)
+
+        n_samples, n_features = X_int.shape
+        # temporary location to store masks when encoding each category
+        tmp_mask = np.zeros(n_samples, dtype=bool)
+
+        cat_encodings = []
+        category_counts = fit_results['category_counts']
+
+        for i in range(n_features):
+            n_cats = len(self.categories_[i])
+            cat_means = np.zeros(n_cats, dtype=float)
+            cat_var_ratio = np.ones(n_cats, dtype=float)
+
+            for encoding in range(n_cats):
+                np.equal(X_int[:, i], encoding, out=tmp_mask)
+                y_tmp = y[tmp_mask]
+                if y_tmp.size:
+                    cat_means[encoding] = np.mean(y_tmp)
+                    cat_var_ratio[encoding] = np.var(y_tmp)
+
+            # partial-pooling estimates: multiplying the numerator and
+            # denominator of the GLM formula by sigma_c^2 gives
+            # (n_c * mu_c + ratio * mu) / (n_c + ratio)
+            # with ratio = sigma_c^2 / sigma^2
+            cat_counts = category_counts[i]
+            cat_var_ratio /= y_variance
+
+            cat_encoded = cat_counts * cat_means + cat_var_ratio * y_mean
+            cat_encoded /= cat_counts + cat_var_ratio
+            cat_encodings.append(cat_encoded)
+
+        self.encodings_ = cat_encodings
+        return X_int, X_known
diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py
new file mode 100644
index 0000000000000..e4493875e6783
--- /dev/null
+++ b/sklearn/preprocessing/tests/test_target_encoder.py
@@ -0,0 +1,180 @@
+import pytest
+import numpy as np
+from numpy.testing import assert_allclose
+
+from sklearn.preprocessing import TargetRegressorEncoder
+
+
+@pytest.mark.parametrize('categories', [
+    np.array([0, 1, 2], dtype=int),
+    np.array(['cat', 'dog', 'snake'], dtype=object)
+])
+@pytest.mark.parametrize('seed', range(3))
+def test_regression(categories, seed):
+    # check the encoder on a regression target against manually computed
+    # partial-pooling estimates
+    X_int = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=int).T
+    X_input = categories[X_int]
+    n_samples = X_input.shape[0]
+    rng = np.random.RandomState(seed)
+    y = rng.uniform(low=-10, high=20, size=n_samples)
+
+    # per-category statistics for multilevel partial pooling
+    y_sections = [y[:20], y[20:50], y[50:]]
+    cat_means = np.array([np.mean(sect) for sect in y_sections])
+    cat_variance = np.array([np.var(sect) for sect in y_sections])
+    cat_counts = np.array([20, 30, 40])
+
+    y_variance = np.var(y)
+    y_mean = np.mean(y)
+
+    # compute the multilevel partial-pooling estimates directly
+    cat_encoded = (cat_counts * cat_means / cat_variance +
+                   y_mean / y_variance)
+    cat_encoded /= (cat_counts / cat_variance + 1 / y_variance)
+
+    # shuffle
+    shuffled_idx = rng.permutation(n_samples)
+    X_input = X_input[shuffled_idx]
+    y = y[shuffled_idx]
+
+    enc = TargetRegressorEncoder().fit(X_input, y)
+
+    assert len(enc.encodings_) == 1
+    assert enc.encoding_mean_ == pytest.approx(y_mean)
+    assert_allclose(enc.encodings_[0], cat_encoded)
+
+    expected_encoding = np.take(cat_encoded, X_int[shuffled_idx, :])
+    X_trans = enc.transform(X_input)
+    assert_allclose(expected_encoding, X_trans)
+
+    # an unknown category is encoded with the overall target mean
+    X_trans = enc.transform(np.array([[5]], dtype=categories.dtype))
+    assert_allclose(X_trans, [[y_mean]])
+
+
+@pytest.mark.parametrize('categories', [
+    np.array([0, 1, 2], dtype=int),
+    np.array(['cat', 'dog', 'snake'], dtype=object),
+])
+def test_zero_variance_category(categories):
+    # When the target is constant for a given category, the encoding for
+    # that category should be the constant target value
+    X_int = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=int).T
+    X_input = categories[X_int]
+
+    # The targets of the first category are constant and have zero variance
+    y = np.array([10] * 20 + [-4] * 15 + [9] * 15 + [-6] * 30 + [25] * 10)
+
+    enc = TargetRegressorEncoder().fit(X_input, y)
+    X_test = np.array([[0]], dtype=int).T
+    X_test_input = categories[X_test]
+    X_trans = enc.transform(X_test_input)
+    assert_allclose(X_trans, [[10]])
+
+
+@pytest.mark.parametrize('categories', [
+    np.array([0, 1, 2], dtype=int),
+    np.array(['cat', 'dog', 'snake'], dtype=object),
+])
+def test_zero_variance_target(categories):
+    # If the target has zero variance, the overall target mean is used.
+    X_int = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=int).T
+    X_input = categories[X_int]
+    n_samples = X_input.shape[0]
+
+    y = np.ones(n_samples)
+
+    enc = TargetRegressorEncoder()
+    X_trans = enc.fit_transform(X_input, y)
+    expected_trans = np.full((n_samples, 1), fill_value=y.mean(), dtype=float)
+    assert_allclose(X_trans, expected_trans)
+
+
+@pytest.mark.parametrize("X, categories", [
+    (
+        np.array([[0] * 10 + [1] * 10 + [3]],
+                 dtype=int).T,  # 3 is unknown
+        [[0, 1, 2]]),
+    (
+        np.array([['cat'] * 10 + ['dog'] * 10 + ['snake']],
+                 dtype=object).T,  # snake is unknown
+        [['dog', 'cat', 'cow']]
+    ),
+])
+def test_custom_categories(X, categories):
+    # Test custom categories with known categories that are unseen during
+    # fit time
+    rng = np.random.RandomState(42)
+    y = rng.uniform(low=-10, high=20, size=X.shape[0])
+
+    enc = TargetRegressorEncoder(categories=categories)
+    X_trans = enc.fit_transform(X, y)
+
+    # The last element is unknown and is encoded with the overall target mean
+    assert_allclose(X_trans[-1], [y.mean()])
+
+    assert len(enc.encodings_) == 1
+    # a known category that is unseen during fit time is also encoded with
+    # the overall target mean
+    assert enc.encodings_[0][-1] == pytest.approx(y.mean())
+
+
+@pytest.mark.parametrize('to_pandas', [True, False])
+def test_multiple_features_sanity(to_pandas):
+    X = np.array([
+        [1, 1],
+        [0, 1],
+        [1, 1],
+        [0, 1],
+        [1, 0],
+        [0, 1],
+        [1, 0],
+        [0, 0],
+    ], dtype=int)
+    y = np.array([0, 1, 2, 3, 4, 5, 10, 7])
+    y_mean = np.mean(y)
+
+    X_test = np.array([
+        [0, 1],
+        [1, 0],
+        [2, 10],  # unknown
+    ], dtype=int)
+
+    if to_pandas:
+        pd = pytest.importorskip('pandas')
+        # convert the second feature to an object column
+        X_obj = np.array(['cat', 'dog'], dtype=object)[X[:, 1]]
+        X = pd.DataFrame({
+            'feat0': X[:, 0], 'feat1': X_obj}, columns=['feat0', 'feat1']
+        )
+        X_test = pd.DataFrame({
+            'feat0': X_test[:, 0], 'feat1': ['dog', 'cat', 'snake']
+        })
+
+    # manually compute the multilevel partial-pooling estimates
+    feat_0_cat_0_encoding = ((4 * 4. / 5. + 4. / 9.5) /
+                             (4 / 5. + 1 / 9.5))
+    feat_0_cat_1_encoding = ((4 * 4. / 14. + 4. / 9.5) /
+                             (4 / 14. + 1 / 9.5))
+
+    feat_1_cat_0_encoding = ((3 * 7. / 6. + 4. / 9.5) /
+                             (3 / 6. + 1 / 9.5))
+    feat_1_cat_1_encoding = ((5 * 2.2 / 2.96 + 4. / 9.5) /
+                             (5 / 2.96 + 1 / 9.5))
+
+    expected_encoding = [
+        [feat_0_cat_0_encoding, feat_0_cat_1_encoding],
+        [feat_1_cat_0_encoding, feat_1_cat_1_encoding]
+    ]
+
+    enc = TargetRegressorEncoder().fit(X, y)
+    assert_allclose(expected_encoding, enc.encodings_)
+    assert enc.encoding_mean_ == pytest.approx(y_mean)
+
+    X_trans = enc.transform(X_test)
+    X_trans_expected = np.array([
+        [feat_0_cat_0_encoding, feat_1_cat_1_encoding],
+        [feat_0_cat_1_encoding, feat_1_cat_0_encoding],
+        [y_mean, y_mean],  # unknown categories map to y_mean
+    ])
+    assert_allclose(X_trans, X_trans_expected)
diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py
index b43afa998698b..6e998320e75a8 100644
--- a/sklearn/utils/_encode.py
+++ b/sklearn/utils/_encode.py
@@ -1,10 +1,12 @@
+from contextlib import suppress
+from collections import Counter
 from typing import NamedTuple

 import numpy as np

 from . import is_scalar_nan


-def _unique(values, *, return_inverse=False):
+def _unique(values, *, return_inverse=False, return_counts=False):
     """Helper function to find unique values with support for python objects.

     Uses pure python method for object dtype, and numpy method for
@@ -18,6 +20,10 @@
     return_inverse : bool, default=False
         If True, also return the indices of the unique values.

+    return_counts : bool, default=False
+        If True, also return the number of times each unique item appears in
+        values.
+
     Returns
     -------
     unique : ndarray
@@ -26,16 +32,35 @@
     unique_inverse : ndarray
         The indices to reconstruct the original array from the unique array.
         Only provided if `return_inverse` is True.
+
+    unique_counts : ndarray
+        The number of times each of the unique values comes up in the original
+        array. Only provided if `return_counts` is True.
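+
+    Examples
+    --------
+    An illustrative sketch (repeated `nan` values are grouped into a single
+    entry at the end of `uniques`, with their counts summed):
+
+    >>> import numpy as np
+    >>> values = np.array([2.0, 1.0, np.nan, 1.0, np.nan])
+    >>> uniques, counts = _unique(values, return_counts=True)
+    >>> uniques
+    array([ 1.,  2., nan])
+    >>> counts
+    array([2, 1, 2])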
     """
     if values.dtype == object:
-        return _unique_python(values, return_inverse=return_inverse)
+        return _unique_python(values, return_inverse=return_inverse,
+                              return_counts=return_counts)
     # numerical
-    out = np.unique(values, return_inverse=return_inverse)
+    return _unique_np(values, return_inverse=return_inverse,
+                      return_counts=return_counts)
+
+
+def _unique_np(values, return_inverse=False, return_counts=False):
+    """Helper function to find unique values for numpy arrays, correctly
+    accounting for nans. See `_unique` documentation for details."""
+    uniques = np.unique(values, return_inverse=return_inverse,
+                        return_counts=return_counts)
+
+    inverse, counts = None, None
+
+    if return_counts:
+        *uniques, counts = uniques

     if return_inverse:
-        uniques, inverse = out
-    else:
-        uniques = out
+        *uniques, inverse = uniques
+
+    if return_counts or return_inverse:
+        uniques = uniques[0]

     # np.unique will have duplicate missing values at the end of `uniques`
     # here we clip the nans and remove it from uniques
@@ -45,9 +70,19 @@
         if return_inverse:
             inverse[inverse > nan_idx] = nan_idx

+        if return_counts:
+            counts[nan_idx] = np.sum(counts[nan_idx:])
+            counts = counts[:nan_idx + 1]
+
+    ret = (uniques, )
+
     if return_inverse:
-        return uniques, inverse
-    return uniques
+        ret += (inverse, )
+
+    if return_counts:
+        ret += (counts, )
+
+    return ret[0] if len(ret) == 1 else ret

 class MissingValues(NamedTuple):
@@ -123,7 +158,7 @@
     return np.array([table[v] for v in values])


-def _unique_python(values, *, return_inverse):
+def _unique_python(values, *, return_inverse, return_counts):
     # Only used in `_uniques`, see docstring there for details
     try:
         uniques_set = set(values)
@@ -137,11 +172,15 @@
                      for t in set(type(v) for v in values))
         raise TypeError("Encoders require their input to be uniformly "
                         f"strings or numbers. Got {types}")
+    ret = (uniques, )

     if return_inverse:
-        return uniques, _map_to_integer(values, uniques)
+        ret += (_map_to_integer(values, uniques), )

-    return uniques
+    if return_counts:
+        ret += (_get_counts(values, uniques), )
+
+    return ret[0] if len(ret) == 1 else ret

 def _encode(values, *, uniques, check_unknown=True):
@@ -267,3 +306,53 @@
     if return_mask:
         return diff, valid_mask
     return diff
+
+
+class _NaNCounter(Counter):
+    """Counter with support for nan values."""
+    def __init__(self, items):
+        super().__init__(self._generate_items(items))
+
+    def _generate_items(self, items):
+        """Generate items without nans. Stores the nan counts separately."""
+        for item in items:
+            if not is_scalar_nan(item):
+                yield item
+                continue
+            if not hasattr(self, 'nan_count'):
+                self.nan_count = 0
+            self.nan_count += 1
+
+    def __missing__(self, key):
+        if hasattr(self, 'nan_count') and is_scalar_nan(key):
+            return self.nan_count
+        raise KeyError(key)
+
+
+def _get_counts(values, uniques):
+    """Get the count of each of the `uniques` in `values`.
+
+    The counts will use the order passed in by `uniques`.
+
+    For non-object dtypes, `uniques` is assumed to be sorted.
+    """
+    if values.dtype.kind in 'OU':
+        counter = _NaNCounter(values)
+        output = np.zeros(len(uniques), dtype=np.int64)
+        for i, item in enumerate(uniques):
+            with suppress(KeyError):
+                output[i] = counter[item]
+        return output
+
+    unique_values, counts = _unique_np(values, return_counts=True)
+    uniques_in_values = np.isin(uniques, unique_values, assume_unique=True)
+
+    # If there are nans, they will be mapped to the end.
+    if np.isnan(unique_values[-1]) and np.isnan(uniques[-1]):
+        uniques_in_values[-1] = True
+
+    unique_valid_indices = np.searchsorted(unique_values,
+                                           uniques[uniques_in_values])
+
+    output = np.zeros_like(uniques, dtype=np.int64)
+    output[uniques_in_values] = counts[unique_valid_indices]
+    return output
diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py
index 53c380e192341..5670449cd3d2d 100644
--- a/sklearn/utils/tests/test_encode.py
+++ b/sklearn/utils/tests/test_encode.py
@@ -7,23 +7,44 @@
 from sklearn.utils._encode import _unique
 from sklearn.utils._encode import _encode
 from sklearn.utils._encode import _check_unknown
+from sklearn.utils._encode import _get_counts


 @pytest.mark.parametrize(
     "values, expected",
     [(np.array([2, 1, 3, 1, 3], dtype='int64'),
       np.array([1, 2, 3], dtype='int64')),
+     (np.array([2, 1, np.nan, 1, np.nan], dtype='float32'),
+      np.array([1, 2, np.nan], dtype='float32')),
      (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
       np.array(['a', 'b', 'c'], dtype=object)),
+     (np.array(['b', 'a', None, 'a', None], dtype=object),
+      np.array(['a', 'b', None], dtype=object)),
      (np.array(['b', 'a', 'c', 'a', 'c']),
       np.array(['a', 'b', 'c']))],
-    ids=['int64', 'object', 'str'])
+    ids=['int64', 'float32-nan', 'object',
+         'object-None', 'str'])
 def test_encode_util(values, expected):
     uniques = _unique(values)
     assert_array_equal(uniques, expected)
+
+    result, encoded = _unique(values, return_inverse=True)
+    assert_array_equal(result, expected)
+    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+
     encoded = _encode(values, uniques=uniques)
     assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))

+    result, counts = _unique(values, return_counts=True)
+    assert_array_equal(result, expected)
+    assert_array_equal(counts, np.array([2, 1, 2]))
+
+    result, encoded, counts = _unique(values, return_inverse=True,
+                                      return_counts=True)
+    assert_array_equal(result, expected)
+    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+    assert_array_equal(counts, np.array([2, 1, 2]))
+

 def test_encode_with_check_unknown():
     # test for the check_unknown parameter of _encode()
@@ -206,3 +227,24 @@
     assert np.isnan(diff[1])
     assert_array_equal(valid_mask,
                        [False, True, True, True, False, False, False])
+
+
+@pytest.mark.parametrize("values, uniques, expected_counts", [
+    (np.array([1] * 10 + [2] * 4 + [3] * 15),
+     np.array([1, 2, 3]), [10, 4, 15]),
+    (np.array([1] * 10 + [2] * 4 + [3] * 15),
+     np.array([1, 2, 3, 5]), [10, 4, 15, 0]),
+    (np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
+     np.array([2, 3, np.nan]), [4, 15, 10]),
+    (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+     ['a', 'b', 'c'], [16, 4, 20]),
+    (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+     ['c', 'b', 'a'], [20, 4, 16]),
+    (np.array([np.nan] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+     ['c', np.nan, 'a'], [20, 4, 16]),
+    (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+     ['a', 'b', 'c', 'e'], [16, 4, 20, 0]),
+])
+def test_get_counts(values, uniques, expected_counts):
+    counts = _get_counts(values, uniques)
+    assert_array_equal(counts, expected_counts)