diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 84f8097cbbe9d..9dad582f08000 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1415,6 +1415,7 @@ details.
preprocessing.QuantileTransformer
preprocessing.RobustScaler
preprocessing.StandardScaler
+ preprocessing.TargetRegressorEncoder
.. autosummary::
:toctree: generated/
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 801d9a98ed1f4..af31bb845c44a 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -466,6 +466,9 @@ Such features can be efficiently coded as integers, for instance
``[0, 1, 3]`` while ``["female", "from Asia", "uses Chrome"]`` would be
``[1, 2, 1]``.
+OrdinalEncoder
+--------------
+
To convert categorical features to such integer codes, we can use the
:class:`OrdinalEncoder`. This estimator transforms each categorical feature to one
new feature of integers (0 to n_categories - 1)::
@@ -482,6 +485,9 @@ scikit-learn estimators, as these expect continuous input, and would interpret
the categories as being ordered, which is often not desired (i.e. the set of
browsers was ordered arbitrarily).
+OneHotEncoder
+-------------
+
Another possibility to convert categorical features to features that can be used
with scikit-learn estimators is to use a one-of-K, also known as one-hot or
dummy encoding.
@@ -611,6 +617,68 @@ separate categories::
See :ref:`dict_feature_extraction` for categorical features that are
represented as a dict, not as scalars.
+.. _target_regressor_encoder:
+
+Target Regressor Encoder
+------------------------
+
+The :class:`~sklearn.preprocessing.TargetRegressorEncoder` uses target
+statistics conditioned on the categorical feature for encoding [FLO]_ [MIC]_.
+This encoding scheme is useful for high-cardinality categorical features,
+where one-hot encoding would inflate the feature space, making it more
+expensive for a downstream model to process. Classical examples of
+high-cardinality categories are location-based features such as zip code or
+region. The
+:class:`~sklearn.preprocessing.TargetRegressorEncoder` implementation uses
+partial-pooling estimates from a multilevel model to encode categorical
+features. Specifically, we use a multilevel generalized linear model (GLM)
+approach for estimating the partial-pooling values as shown in Chapter 12.2 of
+[GEL]_:
+
+.. math::
+
+ \alpha_c\approx\frac{\frac{n_c}{\sigma_c^2}\mu_c+
+ \frac{1}{\sigma^2}\mu}{\frac{n_c}{\sigma_c^2} + \frac{1}{\sigma^2}}
+
+where :math:`\alpha_c` is the encoding for category :math:`c`, :math:`n_c` is
+the category count, :math:`\mu_c` is the target mean conditioned on :math:`c`,
+:math:`\sigma_c^2` is the target variance conditioned on :math:`c`, :math:`\mu`
+is the overall target mean, and :math:`\sigma^2` is the overall target
+variance. The averages of categories with smaller counts carry less
+information, so their multilevel estimates are pulled closer to the overall
+mean. For categories with larger counts, the multilevel estimates are pulled
+closer to the mean conditioned on the category.
+
+For example, the following toy dataset contains two categories where the
+`'cat'` category appears 3 times and the `'dog'` category appears 50 times. The
+encoding for `'cat'` is pulled closer to the overall mean of `59` than the
+encoding for `'dog'`, because the `'cat'` category appears less frequently::
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import TargetRegressorEncoder
+ >>> X = np.array([['cat'] * 3 + ['dog'] * 50]).T
+ >>> y = np.array([0, 30, 60] + [60, 62] * 25)
+ >>> enc = TargetRegressorEncoder().fit(X, y)
+ >>> enc.transform([['cat'], ['dog']])
+ array([[50...],
+ [60...]])
+ >>> enc.encoding_mean_
+ 59...
+
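+As a sanity check, the encoding for `'cat'` can be recomputed by plugging the
+conditional and overall target statistics into the partial-pooling formula
+(a quick sketch reusing the arrays defined above; `y[:3]` selects the
+`'cat'` rows)::
+
+ >>> n_c, mu_c, var_c = 3, y[:3].mean(), y[:3].var()
+ >>> mu, var = y.mean(), y.var()
+ >>> (n_c / var_c * mu_c + mu / var) / (n_c / var_c + 1 / var)
+ 50...
+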
+.. topic:: Examples:
+
+ * :ref:`sphx_glr_auto_examples_preprocessing_plot_target_regressor_encoder.py`
+
+.. topic:: References:
+
+ .. [GEL] Gelman A., Hill J. (2007). Data Analysis Using Regression and
+ Multilevel/Hierarchical Models. Cambridge University Press.
+ .. [FLO] Pargent F. "A Benchmark Experiment on How to Encode
+ Categorical Features in Predictive Modeling." OSF, 24 Mar. 2019.
+ .. [MIC] Micci-Barreca D. (2001). "A preprocessing scheme for
+ high-cardinality categorical attributes in classification and prediction
+ problems." ACM SIGKDD Explorations Newsletter, 3(1), 27-32.
+
.. _preprocessing_discretization:
Discretization
diff --git a/examples/preprocessing/plot_target_regressor_encoder.py b/examples/preprocessing/plot_target_regressor_encoder.py
new file mode 100644
index 0000000000000..8182172bbc5c8
--- /dev/null
+++ b/examples/preprocessing/plot_target_regressor_encoder.py
@@ -0,0 +1,138 @@
+"""
+=============================
+Target Encoder for Regressors
+=============================
+
+.. currentmodule:: sklearn.preprocessing
+
+The :class:`TargetRegressorEncoder` uses target statistics conditioned on
+the categorical features for encoding. In this example, we compare the
+:class:`TargetRegressorEncoder`, the :class:`OrdinalEncoder`, and dropping
+the categorical features entirely on a wine reviews dataset.
+"""
+
+# %%
+# Loading Data from OpenML
+# ========================
+# First, we load the wine reviews dataset, where the target is the points given
+# by a reviewer:
+import warnings
+from sklearn.datasets import fetch_openml
+
+with warnings.catch_warnings():
+ warnings.simplefilter("ignore", category=UserWarning)
+ wine_reviews = fetch_openml(data_id=42074, as_frame=True)
+
+df = wine_reviews.frame
+df.head()
+
+# %%
+# For this example, we use the following subset of numerical and categorical
+# features in the data. The categorical features have a cardinality ranging
+# from 18 to 14810:
+numerical_features = ['price']
+categorical_features = ['country', 'province', 'region_1', 'region_2',
+ 'variety', 'winery']
+
+X = df[numerical_features + categorical_features]
+y = df['points']
+X.nunique().sort_values(ascending=False)
+
+# %%
+# We split the dataset into a training and test set:
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+
+print(f"Samples in training set: {len(X_train)}\n"
+ f"Samples in test set: {len(X_test)}")
+
+# %%
+# Building and Training Pipelines with Different Encoders
+# =======================================================
+# Dropping the categorical features
+# ---------------------------------
+# As a baseline, we construct a pipeline where the categorical features are
+# dropped.
+from sklearn.experimental import enable_hist_gradient_boosting # noqa
+from sklearn import set_config
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.impute import SimpleImputer
+
+set_config(display='diagram') # Show HTML representation of pipeline
+
+prep = ColumnTransformer([
+ ('num', SimpleImputer(strategy='median'), numerical_features),
+ ('cat', 'drop', categorical_features)
+])
+
+reg_drop_cats = Pipeline([
+ ('prep', prep), ('hist', HistGradientBoostingRegressor())
+])
+reg_drop_cats
+
+# %%
+# Here we train the model and use the root mean squared error (RMSE) to
+# evaluate the baseline:
+from sklearn.metrics import mean_squared_error
+
+reg_drop_cats.fit(X_train, y_train)
+reg_drop_cats_rmse = mean_squared_error(
+ y_test, reg_drop_cats.predict(X_test), squared=False
+)
+print(f"RMSE for dropping categorical features: {reg_drop_cats_rmse:.4}")
+
+# %%
+# Using the OrdinalEncoder
+# ------------------------
+# Since the categorical features have missing values, we impute them with
+# `'sk_missing'` before passing them to the :class:`OrdinalEncoder`.
+# The `categories` parameter is constructed such that there are no unknown
+# values at test time:
+from sklearn.preprocessing import OrdinalEncoder
+
+categories = [
+ X[feat].fillna("sk_missing").unique() for feat in categorical_features
+]
+
+cat_prep = Pipeline([
+ ('imputer', SimpleImputer(strategy='constant', missing_values=None,
+ fill_value='sk_missing')),
+ ('encoder', OrdinalEncoder(categories=categories))
+])
+
+
+# %%
+# We modify the original pipeline to use the ordinal categorical preprocessing:
+reg_ordinal = reg_drop_cats.set_params(prep__cat=cat_prep)
+reg_ordinal
+
+# %%
+# When we include the categorical features through ordinal encoding, the RMSE
+# improves:
+reg_ordinal.fit(X_train, y_train)
+reg_ordinal_rmse = mean_squared_error(
+ y_test, reg_ordinal.predict(X_test), squared=False
+)
+print(f"RMSE with ordinal encoding: {reg_ordinal_rmse:.4}")
+
+# %%
+# Using the TargetRegressorEncoder
+# --------------------------------
+# Finally, we replace the ordinal encoder with the
+# :class:`TargetRegressorEncoder`:
+from sklearn.preprocessing import TargetRegressorEncoder
+
+reg_target = reg_ordinal.set_params(
+ prep__cat__encoder=TargetRegressorEncoder())
+reg_target
+
+# %%
+# The :class:`TargetRegressorEncoder` further improves the RMSE:
+reg_target.fit(X_train, y_train)
+reg_target_rmse = mean_squared_error(
+ y_test, reg_target.predict(X_test), squared=False
+)
+print(f"RMSE with target encoding: {reg_target_rmse:.4}")
diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py
index d048b30e1f3d0..03cdb57ba0694 100644
--- a/sklearn/preprocessing/__init__.py
+++ b/sklearn/preprocessing/__init__.py
@@ -27,6 +27,7 @@
from ._encoders import OneHotEncoder
from ._encoders import OrdinalEncoder
+from ._target_encoder import TargetRegressorEncoder
from ._label import label_binarize
from ._label import LabelBinarizer
@@ -43,6 +44,7 @@
'KernelCenterer',
'LabelBinarizer',
'LabelEncoder',
+ 'TargetRegressorEncoder',
'MultiLabelBinarizer',
'MinMaxScaler',
'MaxAbsScaler',
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 6d59cb5a0d858..4b05ffec7bbb7 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -11,7 +11,7 @@
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args
-from ..utils._encode import _encode, _check_unknown, _unique
+from ..utils._encode import _encode, _check_unknown, _unique, _get_counts
__all__ = [
@@ -73,7 +73,8 @@ def _get_feature(self, X, feature_idx):
# numpy arrays, sparse arrays
return X[:, feature_idx]
- def _fit(self, X, handle_unknown='error', force_all_finite=True):
+ def _fit(self, X, handle_unknown='error', force_all_finite=True,
+ return_counts=False):
X_list, n_samples, n_features = self._check_X(
X, force_all_finite=force_all_finite)
@@ -83,11 +84,17 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True):
" it has to be of shape (n_features,).")
self.categories_ = []
+ category_counts = []
for i in range(n_features):
Xi = X_list[i]
if self.categories == 'auto':
- cats = _unique(Xi)
+ result = _unique(Xi, return_counts=return_counts)
+ if return_counts:
+ cats, counts = result
+ category_counts.append(counts)
+ else:
+ cats = result
else:
cats = np.array(self.categories[i], dtype=Xi.dtype)
if Xi.dtype.kind not in 'OU':
@@ -107,8 +114,14 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True):
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)
+ if return_counts:
+ category_counts.append(_get_counts(Xi, cats))
+
self.categories_.append(cats)
+ return {'category_counts': category_counts,
+ 'n_samples': n_samples}
+
def _transform(self, X, handle_unknown='error', force_all_finite=True):
X_list, n_samples, n_features = self._check_X(
X, force_all_finite=force_all_finite)
diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py
new file mode 100644
index 0000000000000..7fefab56f68bd
--- /dev/null
+++ b/sklearn/preprocessing/_target_encoder.py
@@ -0,0 +1,168 @@
+import numpy as np
+
+from ..utils.validation import check_is_fitted
+from ..utils.validation import column_or_1d
+from ._encoders import _BaseEncoder
+
+
+class TargetRegressorEncoder(_BaseEncoder):
+ """Target Encoder for Regression Targets.
+
+ Each category is encoded based on its effect on the target variable. The
+ encoding scheme takes a weighted average estimated by a multilevel
+ linear model.
+
+ Read more in the :ref:`User Guide <target_regressor_encoder>`.
+
+ Parameters
+ ----------
+ categories : 'auto' or a list of array-like, default='auto'
+ Categories (unique values) per feature:
+
+ - 'auto' : Determine categories automatically from the training data.
+ - list : `categories[i]` holds the categories expected in the ith
+ column. The passed categories should not mix strings and numeric
+ values within a single feature, and should be sorted in case of
+ numeric values in ascending order.
+
+ The used categories can be found in the `categories_` attribute.
+
+ Attributes
+ ----------
+ encodings_ : list of shape (n_features,) of ndarray
+ For feature `i`, `encodings_[i]` is the encoding matching the
+ categories listed in `categories_[i]`.
+
+ categories_ : list of shape (n_features,) of ndarray
+ The categories of each feature determined during fitting
+ (in order of the features in X and corresponding with the output
+ of :meth:`transform`).
+
+ encoding_mean_ : float
+ The overall mean of the target.
+
+ See Also
+ --------
+ sklearn.preprocessing.OrdinalEncoder : Performs an ordinal (integer)
+ encoding of the categorical features.
+ sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of
+ categorical features.
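+
+ Examples
+ --------
+ A minimal sketch on the toy dataset from the User Guide (outputs are
+ abbreviated with doctest ellipsis):
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import TargetRegressorEncoder
+ >>> X = np.array([['cat'] * 3 + ['dog'] * 50]).T
+ >>> y = np.array([0, 30, 60] + [60, 62] * 25)
+ >>> enc = TargetRegressorEncoder().fit(X, y)
+ >>> enc.transform([['cat'], ['dog'], ['snake']])  # 'snake' is unknown
+ array([[50...],
+ [60...],
+ [59...]])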
+ """
+ def __init__(self, categories='auto'):
+ self.categories = categories
+
+ def fit(self, X, y):
+ """Fit the TargetRegressorEncoder to X.
+
+ Parameters
+ ----------
+ X : array-like of shape (n_samples, n_features)
+ The data to determine the categories of each feature.
+
+ y : array-like of shape (n_samples,)
+ Target values.
+
+ Returns
+ -------
+ self : object
+ """
+ self._fit(X, y)
+ return self
+
+ def transform(self, X, y=None):
+ """Encodes X.
+
+ Parameters
+ ----------
+ X : array-like of shape (n_samples, n_features)
+ The data to encode.
+
+ Returns
+ -------
+ X_out : ndarray of shape (n_samples, n_features)
+ Transformed input.
+ """
+ check_is_fitted(self)
+ X_int, X_known = self._transform(X, handle_unknown='ignore',
+ force_all_finite='allow-nan')
+ return self._transform_X_int(X_int, X_known)
+
+ def fit_transform(self, X, y):
+ """Fit the encoder and encodes `X`.
+
+ Parameters
+ ----------
+ X : array-like of shape (n_samples, n_features)
+ The data to encode.
+
+ y : array-like of shape (n_samples,)
+ Target values.
+
+ Returns
+ -------
+ X_out : ndarray of shape (n_samples, n_features)
+ Transformed input.
+ """
+ X_int, X_known = self._fit(X, y)
+ return self._transform_X_int(X_int, X_known)
+
+ def _transform_X_int(self, X_int, X_known):
+ """Transform integer encoded X. `X_int` and `X_known` are returned
+ by `_transform`.
+ """
+ out = np.empty_like(X_int, dtype=float)
+ for i, cat_encoding in enumerate(self.encodings_):
+ np.take(cat_encoding, X_int[:, i], out=out[:, i])
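+ # rows with unknown categories fall back to the overall target mean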
+ out[~X_known[:, i], i] = self.encoding_mean_
+ return out
+
+ def _fit(self, X, y):
+ """Fit the encoder"""
+ y = column_or_1d(y, warn=True)
+ fit_results = super()._fit(X, handle_unknown='ignore',
+ return_counts=True,
+ force_all_finite='allow-nan')
+ X_int, X_known = self._transform(X, handle_unknown='ignore',
+ force_all_finite='allow-nan')
+ # Make sure unknown categories are not used for fitting
+ X_int[~X_known] = -1
+ self.encoding_mean_ = y_mean = np.mean(y)
+
+ # if y is constant, the encoding is that constant
+ if np.ptp(y) == 0.0:
+ self.encodings_ = [
+ np.full(len(cat), fill_value=y_mean, dtype=float)
+ for cat in self.categories_]
+ return X_int, X_known
+
+ y_variance = np.var(y)
+
+ n_samples, n_features = X_int.shape
+ # temporary location to store masks when encoding each category
+ tmp_mask = np.zeros(n_samples, dtype=bool)
+
+ cat_encodings = []
+ category_counts = fit_results['category_counts']
+
+ for i in range(n_features):
+ n_cats = len(self.categories_[i])
+ cat_means = np.zeros(n_cats, dtype=float)
+ cat_var_ratio = np.ones(n_cats, dtype=float)
+
+ for encoding in range(n_cats):
+ np.equal(X_int[:, i], encoding, out=tmp_mask)
+ y_tmp = y[tmp_mask]
+ if y_tmp.size:
+ cat_means[encoding] = np.mean(y_tmp)
+ cat_var_ratio[encoding] = np.var(y_tmp)
+
+ # partial-pooling estimates
+ cat_counts = category_counts[i]
+ cat_var_ratio /= y_variance
+
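+ # This is the user guide formula with numerator and denominator
+ # multiplied by sigma_c^2, which stays well-defined when a category's
+ # target variance is zero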
+ cat_encoded = cat_counts * cat_means + cat_var_ratio * y_mean
+ cat_encoded /= cat_counts + cat_var_ratio
+ cat_encodings.append(cat_encoded)
+
+ self.encodings_ = cat_encodings
+ return X_int, X_known
diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py
new file mode 100644
index 0000000000000..e4493875e6783
--- /dev/null
+++ b/sklearn/preprocessing/tests/test_target_encoder.py
@@ -0,0 +1,180 @@
+import pytest
+import numpy as np
+from numpy.testing import assert_allclose
+
+from sklearn.preprocessing import TargetRegressorEncoder
+
+
+@pytest.mark.parametrize('categories', [
+ np.array([0, 1, 2], dtype=int),
+ np.array(['cat', 'dog', 'snake'], dtype=object)
+])
+@pytest.mark.parametrize('seed', range(3))
+def test_regression(categories, seed):
+ # check the target encoder against partial-pooling estimates computed
+ # directly
+
+ X_int = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=int).T
+ X_input = categories[X_int]
+ n_samples = X_input.shape[0]
+ rng = np.random.RandomState(seed)
+ y = rng.uniform(low=-10, high=20, size=n_samples)
+
+ # per-category target statistics
+ y_sections = [y[:20], y[20:50], y[50:]]
+ cat_means = np.array([np.mean(sect) for sect in y_sections])
+ cat_variance = np.array([np.var(sect) for sect in y_sections])
+ cat_counts = np.array([20, 30, 40])
+
+ y_variance = np.var(y)
+ y_mean = np.mean(y)
+
+ # multilevel partial pooling directly
+ cat_encoded = ((cat_counts * cat_means / cat_variance +
+ y_mean / y_variance))
+ cat_encoded /= (cat_counts / cat_variance + 1 / y_variance)
+
+ # shuffle
+ shuffled_idx = rng.permutation(n_samples)
+ X_input = X_input[shuffled_idx]
+ y = y[shuffled_idx]
+
+ enc = TargetRegressorEncoder().fit(X_input, y)
+
+ assert len(enc.encodings_) == 1
+ assert enc.encoding_mean_ == pytest.approx(y_mean)
+ assert_allclose(enc.encodings_[0], cat_encoded)
+
+ expected_encoding = np.take(cat_encoded, X_int[shuffled_idx, :])
+ X_trans = enc.transform(X_input)
+ assert_allclose(expected_encoding, X_trans)
+
+ # test on unknown category
+ X_trans = enc.transform(np.array([[5]], dtype=categories.dtype))
+ assert_allclose(X_trans, [[y_mean]])
+
+
+@pytest.mark.parametrize('categories', [
+ np.array([0, 1, 2], dtype=int),
+ np.array(['cat', 'dog', 'snake'], dtype=object),
+])
+def test_zero_variance_category(categories):
+ # When the target is constant for a given category, the category encoding
+ # should be that constant target value
+ X_int = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=int).T
+ X_input = categories[X_int]
+
+ # The target values of the first category are constant and have no variance
+ y = np.array([10] * 20 + [-4] * 15 + [9] * 15 + [-6] * 30 + [25] * 10)
+
+ enc = TargetRegressorEncoder().fit(X_input, y)
+ X_test = np.array([[0]], dtype=int).T
+ X_input = categories[X_test]
+ X_trans = enc.transform(X_input)
+ assert_allclose(X_trans, [[10]])
+
+
+@pytest.mark.parametrize('categories', [
+ np.array([0, 1, 2], dtype=int),
+ np.array(['cat', 'dog', 'snake'], dtype=object),
+])
+def test_zero_variance_target(categories):
+ # if the target has zero variance, then the mean of the target is used.
+ X_int = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=int).T
+ X_input = categories[X_int]
+ n_samples = X_input.shape[0]
+
+ y = np.ones(n_samples)
+
+ enc = TargetRegressorEncoder()
+ X_trans = enc.fit_transform(X_input, y)
+ expected_trans = np.full((n_samples, 1), fill_value=y.mean(), dtype=float)
+ assert_allclose(X_trans, expected_trans)
+
+
+@pytest.mark.parametrize("X, categories", [
+ (
+ np.array([[0] * 10 + [1] * 10 + [3]],
+ dtype=int).T, # 3 is unknown
+ [[0, 1, 2]]),
+ (
+ np.array([['cat'] * 10 + ['dog'] * 10 + ['snake']],
+ dtype=object).T, # snake is unknown
+ [['dog', 'cat', 'cow']]
+ ),
+])
+def test_custom_categories(X, categories):
+ # Test custom categories, including a known category that is unseen
+ # during fit time
+ rng = np.random.RandomState(42)
+ y = rng.uniform(low=-10, high=20, size=X.shape[0])
+
+ enc = TargetRegressorEncoder(categories=categories)
+ X_trans = enc.fit_transform(X, y)
+
+ # The last element is unknown
+ assert_allclose(X_trans[-1], [y.mean()])
+
+ assert len(enc.encodings_) == 1
+ # known category that is unseen during fit time
+ assert enc.encodings_[0][-1] == pytest.approx(y.mean())
+
+
+@pytest.mark.parametrize('to_pandas', [True, False])
+def test_multiple_features_sanity(to_pandas):
+ X = np.array([
+ [1, 1],
+ [0, 1],
+ [1, 1],
+ [0, 1],
+ [1, 0],
+ [0, 1],
+ [1, 0],
+ [0, 0]
+ ], dtype=int)
+ y = np.array([0, 1, 2, 3, 4, 5, 10, 7])
+ y_mean = np.mean(y)
+
+ X_test = np.array([
+ [0, 1],
+ [1, 0],
+ [2, 10], # unknown
+ ], dtype=int)
+
+ if to_pandas:
+ pd = pytest.importorskip('pandas')
+ # convert the second feature to an object dtype
+ X_obj = np.array(['cat', 'dog'], dtype=object)[X[:, 1]]
+ X = pd.DataFrame({
+ 'feat0': X[:, 0], 'feat1': X_obj}, columns=['feat0', 'feat1']
+ )
+ X_test = pd.DataFrame({
+ 'feat0': X_test[:, 0], 'feat1': ['dog', 'cat', 'snake']
+ })
+
+ # manually compute multilevel partial pooling
+ feat_0_cat_0_encoding = ((4 * 4. / 5. + 4. / 9.5) /
+ (4 / 5. + 1 / 9.5))
+ feat_0_cat_1_encoding = ((4 * 4. / 14. + 4. / 9.5) /
+ (4 / 14. + 1 / 9.5))
+
+ feat_1_cat_0_encoding = ((3 * 7. / 6. + 4. / 9.5) /
+ (3 / 6. + 1 / 9.5))
+ feat_1_cat_1_encoding = ((5 * 2.2 / 2.96 + 4. / 9.5) /
+ (5 / 2.96 + 1 / 9.5))
+
+ expected_encoding = [
+ [feat_0_cat_0_encoding, feat_0_cat_1_encoding],
+ [feat_1_cat_0_encoding, feat_1_cat_1_encoding]
+ ]
+
+ enc = TargetRegressorEncoder().fit(X, y)
+ assert_allclose(expected_encoding, enc.encodings_)
+ assert enc.encoding_mean_ == pytest.approx(y_mean)
+
+ X_trans = enc.transform(X_test)
+ X_trans_expected = np.array([
+ [feat_0_cat_0_encoding, feat_1_cat_1_encoding],
+ [feat_0_cat_1_encoding, feat_1_cat_0_encoding],
+ [y_mean, y_mean], # unknown maps to y_mean
+ ])
+ assert_allclose(X_trans, X_trans_expected)
diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py
index b43afa998698b..6e998320e75a8 100644
--- a/sklearn/utils/_encode.py
+++ b/sklearn/utils/_encode.py
@@ -1,10 +1,12 @@
+from contextlib import suppress
+from collections import Counter
from typing import NamedTuple
import numpy as np
from . import is_scalar_nan
-def _unique(values, *, return_inverse=False):
+def _unique(values, *, return_inverse=False, return_counts=False):
"""Helper function to find unique values with support for python objects.
Uses pure python method for object dtype, and numpy method for
@@ -18,6 +20,10 @@ def _unique(values, *, return_inverse=False):
return_inverse : bool, default=False
If True, also return the indices of the unique values.
+ return_counts : bool, default=False
+ If True, also return the number of times each unique item appears in
+ values.
+
Returns
-------
unique : ndarray
@@ -26,16 +32,35 @@ def _unique(values, *, return_inverse=False):
unique_inverse : ndarray
The indices to reconstruct the original array from the unique array.
Only provided if `return_inverse` is True.
+
+ unique_counts : ndarray
+ The number of times each of the unique values comes up in the original
+ array. Only provided if `return_counts` is True.
"""
if values.dtype == object:
- return _unique_python(values, return_inverse=return_inverse)
+ return _unique_python(values, return_inverse=return_inverse,
+ return_counts=return_counts)
# numerical
- out = np.unique(values, return_inverse=return_inverse)
+ return _unique_np(values, return_inverse=return_inverse,
+ return_counts=return_counts)
+
+
+def _unique_np(values, return_inverse=False, return_counts=False):
+ """Helper function to find unique values for numpy arrays that correctly
+ accounts for nans. See `_unique` documentation for details."""
+ uniques = np.unique(values, return_inverse=return_inverse,
+ return_counts=return_counts)
+
+ inverse, counts = None, None
+
+ if return_counts:
+ *uniques, counts = uniques
if return_inverse:
- uniques, inverse = out
- else:
- uniques = out
+ *uniques, inverse = uniques
+
+ if return_counts or return_inverse:
+ uniques = uniques[0]
# np.unique will have duplicate missing values at the end of `uniques`
# here we clip the nans and remove it from uniques
@@ -45,9 +70,19 @@ def _unique(values, *, return_inverse=False):
if return_inverse:
inverse[inverse > nan_idx] = nan_idx
+ if return_counts:
+ counts[nan_idx] = np.sum(counts[nan_idx:])
+ counts = counts[:nan_idx + 1]
+
+ ret = (uniques, )
+
if return_inverse:
- return uniques, inverse
- return uniques
+ ret += (inverse, )
+
+ if return_counts:
+ ret += (counts, )
+
+ return ret[0] if len(ret) == 1 else ret
class MissingValues(NamedTuple):
@@ -123,7 +158,7 @@ def _map_to_integer(values, uniques):
return np.array([table[v] for v in values])
-def _unique_python(values, *, return_inverse):
+def _unique_python(values, *, return_inverse, return_counts):
# Only used in `_uniques`, see docstring there for details
try:
uniques_set = set(values)
@@ -137,11 +172,15 @@ def _unique_python(values, *, return_inverse):
for t in set(type(v) for v in values))
raise TypeError("Encoders require their input to be uniformly "
f"strings or numbers. Got {types}")
+ ret = (uniques, )
if return_inverse:
- return uniques, _map_to_integer(values, uniques)
+ ret += (_map_to_integer(values, uniques), )
- return uniques
+ if return_counts:
+ ret += (_get_counts(values, uniques), )
+
+ return ret[0] if len(ret) == 1 else ret
def _encode(values, *, uniques, check_unknown=True):
@@ -267,3 +306,53 @@ def is_valid(value):
if return_mask:
return diff, valid_mask
return diff
+
+
+class _NaNCounter(Counter):
+ """Counter with support for nan values."""
+ def __init__(self, items):
+ super().__init__(self._generate_items(items))
+
+ def _generate_items(self, items):
+ """Generate items without nans. Stores the nan counts seperately."""
+ for item in items:
+ if not is_scalar_nan(item):
+ yield item
+ continue
+ if not hasattr(self, 'nan_count'):
+ self.nan_count = 0
+ self.nan_count += 1
+
+ def __missing__(self, key):
+ if hasattr(self, 'nan_count') and is_scalar_nan(key):
+ return self.nan_count
+ raise KeyError(key)
+
+
+def _get_counts(values, uniques):
+ """Get the count of each of the `uniques` in `values`. The counts will use
+ the order passed in by `uniques`.
+
+ For non-object dtypes, `uniques` is assumed to be sorted.
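+
+ A small sketch of the expected behavior::
+
+ >>> import numpy as np
+ >>> _get_counts(np.array([1, 1, 2, 3, 3, 3]), np.array([1, 2, 3, 4]))
+ array([2, 1, 3, 0])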
+ """
+ if values.dtype.kind in 'OU':
+ counter = _NaNCounter(values)
+ output = np.zeros(len(uniques), dtype=np.int64)
+ for i, item in enumerate(uniques):
+ with suppress(KeyError):
+ output[i] = counter[item]
+ return output
+
+ unique_values, counts = _unique_np(values, return_counts=True)
+ uniques_in_values = np.isin(uniques, unique_values, assume_unique=True)
+
+ # If there are nans, they will be mapped to the end.
+ if np.isnan(unique_values[-1]) and np.isnan(uniques[-1]):
+ uniques_in_values[-1] = True
+
+ unique_valid_indices = np.searchsorted(unique_values,
+ uniques[uniques_in_values])
+
+ output = np.zeros_like(uniques, dtype=np.int64)
+ output[uniques_in_values] = counts[unique_valid_indices]
+ return output
diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py
index 53c380e192341..5670449cd3d2d 100644
--- a/sklearn/utils/tests/test_encode.py
+++ b/sklearn/utils/tests/test_encode.py
@@ -7,23 +7,44 @@
from sklearn.utils._encode import _unique
from sklearn.utils._encode import _encode
from sklearn.utils._encode import _check_unknown
+from sklearn.utils._encode import _get_counts
@pytest.mark.parametrize(
"values, expected",
[(np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array([1, 2, 3], dtype='int64')),
+ (np.array([2, 1, np.nan, 1, np.nan], dtype='float32'),
+ np.array([1, 2, np.nan], dtype='float32')),
(np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['a', 'b', 'c'], dtype=object)),
+ (np.array(['b', 'a', None, 'a', None], dtype=object),
+ np.array(['a', 'b', None], dtype=object)),
(np.array(['b', 'a', 'c', 'a', 'c']),
np.array(['a', 'b', 'c']))],
- ids=['int64', 'object', 'str'])
+ ids=['int64', 'float32-nan', 'object',
+ 'object-None', 'str'])
def test_encode_util(values, expected):
uniques = _unique(values)
assert_array_equal(uniques, expected)
+
+ result, encoded = _unique(values, return_inverse=True)
+ assert_array_equal(result, expected)
+ assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+ result, counts = _unique(values, return_counts=True)
+ assert_array_equal(result, expected)
+ assert_array_equal(counts, np.array([2, 1, 2]))
+
+ result, encoded, counts = _unique(values, return_inverse=True,
+ return_counts=True)
+ assert_array_equal(result, expected)
+ assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+ assert_array_equal(counts, np.array([2, 1, 2]))
+
def test_encode_with_check_unknown():
# test for the check_unknown parameter of _encode()
@@ -206,3 +227,24 @@ def test_check_unknown_with_both_missing_values():
assert np.isnan(diff[1])
assert_array_equal(valid_mask,
[False, True, True, True, False, False, False])
+
+
+@pytest.mark.parametrize("values, uniques, expected_counts", [
+ (np.array([1] * 10 + [2] * 4 + [3] * 15),
+ np.array([1, 2, 3]), [10, 4, 15]),
+ (np.array([1] * 10 + [2] * 4 + [3] * 15),
+ np.array([1, 2, 3, 5]), [10, 4, 15, 0]),
+ (np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
+ np.array([2, 3, np.nan]), [4, 15, 10]),
+ (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+ ['a', 'b', 'c'], [16, 4, 20]),
+ (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+ ['c', 'b', 'a'], [20, 4, 16]),
+ (np.array([np.nan] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+ ['c', np.nan, 'a'], [20, 4, 16]),
+ (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+ ['a', 'b', 'c', 'e'], [16, 4, 20, 0]),
+])
+def test_get_counts(values, uniques, expected_counts):
+ counts = _get_counts(values, uniques)
+ assert_array_equal(counts, expected_counts)