ENH Adds Target Regression Encoder (Impact Encoder) #17323
Changes from all commits
@@ -466,6 +466,9 @@ Such features can be efficiently coded as integers, for instance

``[0, 1, 3]`` while ``["female", "from Asia", "uses Chrome"]`` would be
``[1, 2, 1]``.

OrdinalEncoder
--------------

To convert categorical features to such integer codes, we can use the
:class:`OrdinalEncoder`. This estimator transforms each categorical feature to
one new feature of integers (0 to n_categories - 1)::
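    >>> # A sketch of typical usage; the concrete example continues in the
    >>> # unchanged lines of the file, which this hunk does not show:
    >>> from sklearn import preprocessing
    >>> enc = preprocessing.OrdinalEncoder()
    >>> X = [['male', 'from US', 'uses Safari'],
    ...      ['female', 'from Europe', 'uses Firefox']]
    >>> enc.fit(X)
    OrdinalEncoder()
    >>> enc.transform([['female', 'from US', 'uses Safari']])
    array([[0., 1., 1.]])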

@@ -482,6 +485,9 @@ scikit-learn estimators, as these expect continuous input, and would interpret

the categories as being ordered, which is often not desired (i.e. the set of
browsers was ordered arbitrarily).

OneHotEncoder
-------------

Another possibility to convert categorical features to features that can be
used with scikit-learn estimators is to use a one-of-K, also known as one-hot
or dummy encoding.
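As a quick sketch of the scheme (the concrete example continues in the
unchanged lines of the file), each categorical feature with ``n_categories``
possible values is expanded into ``n_categories`` binary features, with one of
them 1 and all others 0::

    >>> from sklearn import preprocessing
    >>> enc = preprocessing.OneHotEncoder()
    >>> X = [['male', 'from US', 'uses Safari'],
    ...      ['female', 'from Europe', 'uses Firefox']]
    >>> enc.fit_transform(X).toarray()
    array([[0., 1., 0., 1., 0., 1.],
           [1., 0., 1., 0., 1., 0.]])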

@@ -611,6 +617,68 @@ separate categories::

See :ref:`dict_feature_extraction` for categorical features that are
represented as a dict, not as scalars.

.. _target_regressor_encoder:

Target Regressor Encoder
------------------------
The :class:`~sklearn.preprocessing.TargetRegressorEncoder` uses target
statistics conditioned on the categorical feature for encoding [FLO]_ [MIC]_.
This encoding scheme is useful for categorical features with high cardinality,
where one-hot encoding would inflate the feature space, making it more
expensive for a downstream model to process. A classical example of
high-cardinality categories is location-based features such as zip code or
region. The :class:`~sklearn.preprocessing.TargetRegressorEncoder`
implementation uses partial-pooling estimates from a multilevel model to
encode categorical features. Specifically, we use a multilevel generalized
linear model (GLM) approach for estimating the partial-pooling values, as
shown in Chapter 12.2 of [GEL]_:
.. math::

    \alpha_c \approx \frac{\frac{n_c}{\sigma_c^2}\mu_c +
    \frac{1}{\sigma^2}\mu}{\frac{n_c}{\sigma_c^2} + \frac{1}{\sigma^2}}
where :math:`\alpha_c` is the encoding for category :math:`c`, :math:`n_c` is
the category count, :math:`\mu_c` is the target mean conditioned on :math:`c`,
:math:`\sigma_c^2` is the target variance conditioned on :math:`c`, :math:`\mu`
is the overall target mean, and :math:`\sigma^2` is the overall target
variance. The means of categories with smaller counts carry less information,
so their multilevel estimates are pulled closer to the overall mean. For
categories with larger counts, the multilevel estimates are pulled closer to
the mean conditioned on the category.
For example, the following toy dataset contains two categories, where the
`'cat'` category appears 3 times and the `'dog'` category appears 50 times.
The encoding for `'cat'` is pulled further toward the overall mean of about
`59` than the encoding for `'dog'`, because the `'cat'` category appears less
frequently::
Review comment (on lines +653 to +654):
    This isn't obvious at first, since the encoding for dog is still much
    closer to the global mean than the encoding for cat. It might be clearer
    if the mean for dog was significantly different from 53.

Reply:
    Since there are so many dogs, its mean will be fairly close to the global
    mean.
    >>> import numpy as np
    >>> from sklearn.preprocessing import TargetRegressorEncoder
    >>> X = np.array([['cat'] * 3 + ['dog'] * 50]).T
    >>> y = np.array([0, 30, 60] + [60, 62] * 25)
    >>> enc = TargetRegressorEncoder().fit(X, y)
    >>> enc.transform([['cat'], ['dog']])
    array([[50...],
           [60...]])
    >>> enc.encoding_mean_
    59...
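The encoding for `'cat'` can be reproduced by hand from the partial-pooling
formula above. The following is a minimal sketch, assuming population
variances (``numpy.var`` with its default ``ddof=0``) for :math:`\sigma_c^2`
and :math:`\sigma^2`; the estimator's internal variance estimates may differ
slightly::

    >>> n_c = 3                      # count of the 'cat' category
    >>> mu_c = y[:3].mean()          # target mean for 'cat': 30.0
    >>> var_c = y[:3].var()          # target variance for 'cat': 600.0
    >>> mu, var = y.mean(), y.var()  # overall target mean and variance
    >>> alpha_cat = (n_c / var_c * mu_c + mu / var) / (n_c / var_c + 1 / var)
    >>> print(f"{alpha_cat:.1f}")    # shrunk from 30.0 toward mu (about 59.2)
    50.4

With only 3 samples, the `'cat'` estimate is pulled strongly from its
conditional mean of `30` toward the overall mean, while the 50 `'dog'` samples
keep the `'dog'` encoding close to its conditional mean of `61`.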
.. topic:: Examples:

  * :ref:`sphx_glr_auto_examples_preprocessing_plot_target_regressor_encoder.py`

.. topic:: References

  .. [GEL] Gelman, A., and Hill, J. Data Analysis Using Regression and
     Multilevel/Hierarchical Models. Cambridge University Press, 2007.

  .. [FLO] Pargent, F. `A Benchmark Experiment on How to Encode Categorical
     Features in Predictive Modeling <https://osf.io/356ed/>`_. OSF,
     24 Mar. 2019.

  .. [MIC] Micci-Barreca, D. (2001). `A preprocessing scheme for
     high-cardinality categorical attributes in classification and prediction
     problems <https://dl.acm.org/doi/abs/10.1145/507533.507538>`_. ACM SIGKDD
     Explorations Newsletter, 3(1), 27-32.
.. _preprocessing_discretization:

Discretization
@@ -0,0 +1,138 @@
""" | ||
============================= | ||
Target Encoder for Regressors | ||
============================= | ||
|
||
.. currentmodule:: sklearn.preprocessing | ||
|
||
The :class:`TargetRegressorEncoder` uses target statistics conditioned on | ||
the categorical features for encoding. In this example, we will compare | ||
:class:`TargetRegressorEncoder`, :class:`OrdinalEncoder`, and dropping the | ||
category on a wine review dataset. | ||
""" | ||
|
||
# %%
# Loading Data from OpenML
# ========================
# First, we load the wine reviews dataset, where the target is the points
# given by a reviewer:
import warnings
from sklearn.datasets import fetch_openml

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    wine_reviews = fetch_openml(data_id=42074, as_frame=True)

df = wine_reviews.frame
df.head()
# %%
# For this example, we use the following subset of numerical and categorical
# features in the data. The categorical features have a cardinality ranging
# from 18 to 14810:
numerical_features = ['price']
categorical_features = ['country', 'province', 'region_1', 'region_2',
                        'variety', 'winery']

X = df[numerical_features + categorical_features]
y = df['points']
X.nunique().sort_values(ascending=False)
# %%
# We split the dataset into a training and test set:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f"Samples in training set: {len(X_train)}\n"
      f"Samples in test set: {len(X_test)}")
# %%
# Building and Training Pipelines with Different Encoders
# =======================================================
# Dropping the categorical features
# ---------------------------------
# As a baseline, we construct a pipeline where the categorical features are
# dropped:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer

set_config(display='diagram')  # Show HTML representation of pipelines

prep = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), numerical_features),
    ('cat', 'drop', categorical_features)
])

reg_drop_cats = Pipeline([
    ('prep', prep), ('hist', HistGradientBoostingRegressor())
])
reg_drop_cats
# %%
# Here we train the baseline model and evaluate it with the root mean squared
# error:
from sklearn.metrics import mean_squared_error

reg_drop_cats.fit(X_train, y_train)
reg_drop_cats_rmse = mean_squared_error(
    y_test, reg_drop_cats.predict(X_test), squared=False
)
print(f"RMSE for dropping categorical features: {reg_drop_cats_rmse:.4}")
# %%
# Using the OrdinalEncoder
# ------------------------
# Since the categorical features have missing values, we impute them with
# `'sk_missing'` before passing them to the :class:`OrdinalEncoder`.
# The `categories` parameter is constructed such that there are no unknown
# values at test time:
from sklearn.preprocessing import OrdinalEncoder

categories = [
    X[feat].fillna("sk_missing").unique() for feat in categorical_features
]

cat_prep = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', missing_values=None,
                              fill_value='sk_missing')),
    ('encoder', OrdinalEncoder(categories=categories))
])
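# %%
# Note that without the explicit ``categories`` list, a category that appears
# only in the test set would cause ``OrdinalEncoder`` to raise an error at
# transform time, because it would not have been seen during ``fit``.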
# %%
# We modify the original pipeline to use the ordinal categorical
# preprocessing:
reg_ordinal = reg_drop_cats.set_params(prep__cat=cat_prep)
reg_ordinal
# %%
# When we include the categorical features through ordinal encoding, the RMSE
# improves:
reg_ordinal.fit(X_train, y_train)
reg_ordinal_rmse = mean_squared_error(
    y_test, reg_ordinal.predict(X_test), squared=False
)
print(f"RMSE with ordinal encoding: {reg_ordinal_rmse:.4}")
# %%
# Using the TargetRegressorEncoder
# --------------------------------
# Finally, we replace the ordinal encoder with the
# :class:`TargetRegressorEncoder`:
from sklearn.preprocessing import TargetRegressorEncoder

reg_target = reg_ordinal.set_params(
    prep__cat__encoder=TargetRegressorEncoder())
reg_target
# %%
# The :class:`TargetRegressorEncoder` further improves the RMSE:
reg_target.fit(X_train, y_train)
reg_target_rmse = mean_squared_error(
    y_test, reg_target.predict(X_test), squared=False
)
print(f"RMSE with target encoding: {reg_target_rmse:.4}")
Review comment (on the documentation section):
    This should be a subsection of the "Encoding categorical features" section
    above, not a separate one. Maybe you'll need to create subsections for
    OrdinalEncoder and OneHotEncoder too.