diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 84f8097cbbe9d..9dad582f08000 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1415,6 +1415,7 @@ details.
    preprocessing.QuantileTransformer
    preprocessing.RobustScaler
    preprocessing.StandardScaler
+   preprocessing.TargetRegressorEncoder

 .. autosummary::
    :toctree: generated/
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 801d9a98ed1f4..af31bb845c44a 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -466,6 +466,9 @@ Such features can be efficiently coded as integers, for instance
 ``[0, 1, 3]`` while ``["female", "from Asia", "uses Chrome"]`` would be
 ``[1, 2, 1]``.

+OrdinalEncoder
+--------------
+
 To convert categorical features to such integer codes, we can use the
 :class:`OrdinalEncoder`. This estimator transforms each categorical feature
 to one new feature of integers (0 to n_categories - 1)::
@@ -482,6 +485,9 @@ scikit-learn estimators, as these expect continuous input, and would interpret
 the categories as being ordered, which is often not desired (i.e. the set of
 browsers was ordered arbitrarily).

+OneHotEncoder
+-------------
+
 Another possibility to convert categorical features to features that can be
 used with scikit-learn estimators is to use a one-of-K, also known as one-hot
 or dummy encoding.
@@ -611,6 +617,68 @@ separate categories::

 See :ref:`dict_feature_extraction` for categorical features that are
 represented as a dict, not as scalars.

+.. _target_regressor_encoder:
+
+Target Regressor Encoder
+------------------------
+
+The :class:`~sklearn.preprocessing.TargetRegressorEncoder` uses target
+statistics conditioned on the categorical feature for encoding [FLO]_ [MIC]_.
+This encoding scheme is useful for categorical features with high cardinality,
+where one-hot encoding would inflate the feature space, making it more
+expensive for a downstream model to process. A classic example of
+high-cardinality categorical features is location data, such as zip code or
+region. The :class:`~sklearn.preprocessing.TargetRegressorEncoder`
+implementation uses partial-pooling estimates from a multilevel model to
+encode categorical features. Specifically, we use a multilevel generalized
+linear model (GLM) approach for estimating the partial-pooling values, as
+shown in Chapter 12.2 of [GEL]_:
+
+.. math::
+
+    \alpha_c \approx \frac{\frac{n_c}{\sigma_c^2}\mu_c +
+    \frac{1}{\sigma^2}\mu}{\frac{n_c}{\sigma_c^2} + \frac{1}{\sigma^2}}
+
+where :math:`\alpha_c` is the encoding for category :math:`c`, :math:`n_c` is
+the category count, :math:`\mu_c` is the target mean conditioned on :math:`c`,
+:math:`\sigma_c^2` is the target variance conditioned on :math:`c`,
+:math:`\mu` is the overall target mean, and :math:`\sigma^2` is the overall
+target variance. The averages of categories with smaller counts carry less
+information, so their multilevel estimates are pulled closer to the overall
+mean. For categories with larger counts, the multilevel estimates are pulled
+closer to the mean conditioned on the category.
+
+For example, the following toy dataset contains two categories, where the
+`'cat'` category appears 3 times and the `'dog'` category appears 50 times.
+The encoding for `'cat'` is pulled toward the overall mean of about `59`
+compared to `'dog'`, because the `'cat'` category appears less frequently::
+
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import TargetRegressorEncoder
+    >>> X = np.array([['cat'] * 3 + ['dog'] * 50]).T
+    >>> y = np.array([0, 30, 60] + [60, 62] * 25)
+    >>> enc = TargetRegressorEncoder().fit(X, y)
+    >>> enc.transform([['cat'], ['dog']])
+    array([[50...],
+           [60...]])
+    >>> enc.encoding_mean_
+    59...
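+
+To make the shrinkage explicit, we can reproduce the encoding of `'cat'` by
+evaluating the partial-pooling formula above directly. The following is a
+minimal sketch using plain NumPy (the variable names are only illustrative
+and not part of the scikit-learn API)::
+
+    import numpy as np
+
+    y = np.array([0, 30, 60] + [60, 62] * 25)
+    n_c, mu_c, var_c = 3, y[:3].mean(), y[:3].var()  # statistics for 'cat'
+    mu, var = y.mean(), y.var()                      # overall statistics
+    alpha_cat = (n_c / var_c * mu_c + mu / var) / (n_c / var_c + 1 / var)
+    # alpha_cat evaluates to roughly 50.4, matching the transform output
+    # above: the estimate lies between mu_c (30) and the overall mean (~59.2)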
The +encoding for `'cat'` is pulled toward the overall mean of `53` when compared to +`'dog'` because the `'cat'` category appears less frequently:: + + >>> from sklearn.preprocessing import TargetRegressorEncoder + >>> X = np.array([['cat'] * 3 + ['dog'] * 50]).T + >>> y = np.array([0, 30, 60] + [60, 62] * 25) + >>> enc = TargetRegressorEncoder().fit(X, y) + >>> enc.transform([['cat'], ['dog']]) + array([[50...], + [60...]]) + >>> enc.encoding_mean_ + 59... + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_preprocessing_plot_target_regressor_encoder.py` + +.. topic:: References + + .. [GEL] Gelman A, Hill J. Data Analysis using Regression and + Multilevel/Hierarchical Models. Cambridge University Press, 2007 + .. [FLO] Pargent, Florian. `A Benchmark Experiment on How to Encode + Categorical Features in Predictive Modeling ` + OSF, 24 Mar. 2019. + .. [MIC] Micci-Barreca D (2001) `A preprocessing scheme for + high-cardinality categorical attributes in classification and prediction + problems `, ACM SIGKDD + Explorations Newsletter, 3(1), 27-32. + .. _preprocessing_discretization: Discretization diff --git a/examples/preprocessing/plot_target_regressor_encoder.py b/examples/preprocessing/plot_target_regressor_encoder.py new file mode 100644 index 0000000000000..8182172bbc5c8 --- /dev/null +++ b/examples/preprocessing/plot_target_regressor_encoder.py @@ -0,0 +1,138 @@ +""" +============================= +Target Encoder for Regressors +============================= + +.. currentmodule:: sklearn.preprocessing + +The :class:`TargetRegressorEncoder` uses target statistics conditioned on +the categorical features for encoding. In this example, we will compare +:class:`TargetRegressorEncoder`, :class:`OrdinalEncoder`, and dropping the +category on a wine review dataset. +""" + +# %% +# Loading Data from OpenML +# ======================== +# First, we load the wine reviews dataset, where the target is the points given +# be a reviewer: +import warnings +from sklearn.datasets import fetch_openml + +with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + wine_reviews = fetch_openml(data_id=42074, as_frame=True) + +df = wine_reviews.frame +df.head() + +# %% +# For this example, we use the following subset of numerical and categorical +# features in the data. The categorical features have a cardinality ranging +# from 18 to 14810: +numerical_features = ['price'] +categorical_features = ['country', 'province', 'region_1', 'region_2', + 'variety', 'winery'] + +X = df[numerical_features + categorical_features] +y = df['points'] +X.nunique().sort_values(ascending=False) + +# %% +# We split the dataset into a training and test set: +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + +print(f"Samples in training set: {len(X_train)}\n" + f"Samples in test set: {len(X_test)}") + +# %% +# Building and Training Pipelines with Different Encoders +# ======================================================= +# Dropping the categorical features +# --------------------------------- +# As a basline, we construct a pipeline where the categorical features are +# dropped. 
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn import set_config
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.impute import SimpleImputer
+
+set_config(display='diagram')  # Show HTML representation of pipeline
+
+prep = ColumnTransformer([
+    ('num', SimpleImputer(strategy='median'), numerical_features),
+    ('cat', 'drop', categorical_features)
+])
+
+reg_drop_cats = Pipeline([
+    ('prep', prep), ('hist', HistGradientBoostingRegressor())
+])
+reg_drop_cats
+
+# %%
+# Here we train the pipeline and use the root mean squared error to evaluate
+# the baseline model:
+from sklearn.metrics import mean_squared_error
+
+reg_drop_cats.fit(X_train, y_train)
+reg_drop_cats_rmse = mean_squared_error(
+    y_test, reg_drop_cats.predict(X_test), squared=False
+)
+print(f"RMSE for dropping categorical features: {reg_drop_cats_rmse:.4}")
+
+# %%
+# Using the OrdinalEncoder
+# ------------------------
+# Since the categorical features have missing values, we impute them with the
+# placeholder `'sk_missing'` before passing them to the
+# :class:`OrdinalEncoder`. The `categories` parameter is constructed such
+# that there are no unknown values at test time:
+from sklearn.preprocessing import OrdinalEncoder
+
+categories = [
+    X[feat].fillna("sk_missing").unique() for feat in categorical_features
+]
+
+cat_prep = Pipeline([
+    ('imputer', SimpleImputer(strategy='constant', missing_values=None,
+                              fill_value='sk_missing')),
+    ('encoder', OrdinalEncoder(categories=categories))
+])
+
+# %%
+# We modify the original pipeline to use the ordinal categorical
+# preprocessing:
+reg_ordinal = reg_drop_cats.set_params(prep__cat=cat_prep)
+reg_ordinal
+
+# %%
+# When we include the categorical features through ordinal encoding, the RMSE
+# improves:
+reg_ordinal.fit(X_train, y_train)
+reg_ordinal_rmse = mean_squared_error(
+    y_test, reg_ordinal.predict(X_test), squared=False
+)
+print(f"RMSE with ordinal encoding: {reg_ordinal_rmse:.4}")
+
+# %%
+# Using the TargetRegressorEncoder
+# --------------------------------
+# Finally, we replace the ordinal encoder with the
+# :class:`TargetRegressorEncoder`:
+from sklearn.preprocessing import TargetRegressorEncoder
+
+reg_target = reg_ordinal.set_params(
+    prep__cat__encoder=TargetRegressorEncoder())
+reg_target
+
+# %%
+# The :class:`TargetRegressorEncoder` further improves the RMSE:
+reg_target.fit(X_train, y_train)
+reg_target_rmse = mean_squared_error(
+    y_test, reg_target.predict(X_test), squared=False
+)
+print(f"RMSE with target encoding: {reg_target_rmse:.4}")
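+
+# %%
+# Inspecting the learned encodings
+# --------------------------------
+# As a final sanity check (an illustrative sketch, not part of the benchmark
+# above), we can pull the fitted encoder out of the pipeline and verify that
+# it learned one encoding value per category for each categorical feature.
+# The lengths below mirror the cardinalities reported earlier, plus the
+# `'sk_missing'` category added by the imputer for columns that contained
+# missing values:
+target_encoder = reg_target['prep'].named_transformers_['cat']['encoder']
+{feat: len(enc) for feat, enc in zip(categorical_features,
+                                     target_encoder.encodings_)}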
diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py
index d048b30e1f3d0..03cdb57ba0694 100644
--- a/sklearn/preprocessing/__init__.py
+++ b/sklearn/preprocessing/__init__.py
@@ -27,6 +27,7 @@
 from ._encoders import OneHotEncoder
 from ._encoders import OrdinalEncoder
+from ._target_encoder import TargetRegressorEncoder

 from ._label import label_binarize
 from ._label import LabelBinarizer
@@ -43,6 +44,7 @@
     'KernelCenterer',
     'LabelBinarizer',
     'LabelEncoder',
+    'TargetRegressorEncoder',
     'MultiLabelBinarizer',
     'MinMaxScaler',
     'MaxAbsScaler',
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 6d59cb5a0d858..4b05ffec7bbb7 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -11,7 +11,7 @@
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _deprecate_positional_args

-from ..utils._encode import _encode, _check_unknown, _unique
+from ..utils._encode import _encode, _check_unknown, _unique, _get_counts


 __all__ = [
@@ -73,7 +73,8 @@ def _get_feature(self, X, feature_idx):
             # numpy arrays, sparse arrays
             return X[:, feature_idx]

-    def _fit(self, X, handle_unknown='error', force_all_finite=True):
+    def _fit(self, X, handle_unknown='error', force_all_finite=True,
+             return_counts=False):
         X_list, n_samples, n_features = self._check_X(
             X, force_all_finite=force_all_finite)
@@ -83,11 +84,17 @@
                 " it has to be of shape (n_features,).")

         self.categories_ = []
+        category_counts = []

         for i in range(n_features):
             Xi = X_list[i]
             if self.categories == 'auto':
-                cats = _unique(Xi)
+                result = _unique(Xi, return_counts=return_counts)
+                if return_counts:
+                    cats, counts = result
+                    category_counts.append(counts)
+                else:
+                    cats = result
             else:
                 cats = np.array(self.categories[i], dtype=Xi.dtype)
                 if Xi.dtype.kind not in 'OU':
@@ -107,8 +114,14 @@
                         msg = ("Found unknown categories {0} in column {1}"
                                " during fit".format(diff, i))
                         raise ValueError(msg)
+                if return_counts:
+                    category_counts.append(_get_counts(Xi, cats))
+
             self.categories_.append(cats)

+        return {'category_counts': category_counts,
+                'n_samples': n_samples}
+
     def _transform(self, X, handle_unknown='error', force_all_finite=True):
         X_list, n_samples, n_features = self._check_X(
             X, force_all_finite=force_all_finite)
diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py
new file mode 100644
index 0000000000000..7fefab56f68bd
--- /dev/null
+++ b/sklearn/preprocessing/_target_encoder.py
@@ -0,0 +1,168 @@
+import numpy as np
+
+from ..utils.validation import check_is_fitted
+from ..utils.validation import column_or_1d
+from ._encoders import _BaseEncoder
+
+
+class TargetRegressorEncoder(_BaseEncoder):
+    """Target Encoder for Regression Targets.
+
+    Each category is encoded based on its effect on the target variable. The
+    encoding scheme takes a weighted average estimated by a multilevel
+    linear model.
+
+    Read more in the :ref:`User Guide <target_regressor_encoder>`.
+
+    Parameters
+    ----------
+    categories : 'auto' or a list of array-like, default='auto'
+        Categories (unique values) per feature:
+
+        - 'auto' : Determine categories automatically from the training data.
+        - list : `categories[i]` holds the categories expected in the ith
+          column. The passed categories should not mix strings and numeric
+          values within a single feature, and should be sorted in ascending
+          order in the case of numeric values.
+
+        The used categories can be found in the `categories_` attribute.
+
+    Attributes
+    ----------
+    encodings_ : list of shape (n_features,) of ndarray
+        For feature `i`, `encodings_[i]` holds the encoding matching the
+        categories listed in `categories_[i]`.
+
+    categories_ : list of shape (n_features,) of ndarray
+        The categories of each feature determined during fitting
+        (in order of the features in X and corresponding with the output
+        of :meth:`transform`).
+
+    encoding_mean_ : float
+        The overall mean of the target.
+
+    See Also
+    --------
+    sklearn.preprocessing.OrdinalEncoder : Performs an ordinal (integer)
+        encoding of the categorical features.
+    sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of
+        categorical features.
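+
+    Examples
+    --------
+    Encoding a rare and a frequent category (a short sketch mirroring the
+    toy example in the User Guide; the encoding of the rare `'cat'` category
+    is shrunk toward the overall target mean):
+
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import TargetRegressorEncoder
+    >>> X = np.array([['cat'] * 3 + ['dog'] * 50]).T
+    >>> y = np.array([0, 30, 60] + [60, 62] * 25)
+    >>> enc = TargetRegressorEncoder().fit(X, y)
+    >>> enc.transform([['cat'], ['dog']])
+    array([[50...],
+           [60...]])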
+    """
+    def __init__(self, categories='auto'):
+        self.categories = categories
+
+    def fit(self, X, y):
+        """Fit the TargetRegressorEncoder to X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data to determine the categories of each feature.
+
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        self : object
+        """
+        self._fit(X, y)
+        return self
+
+    def transform(self, X, y=None):
+        """Encode X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data to encode.
+
+        Returns
+        -------
+        X_out : ndarray of shape (n_samples, n_features)
+            Transformed input.
+        """
+        check_is_fitted(self)
+        X_int, X_known = self._transform(X, handle_unknown='ignore',
+                                         force_all_finite='allow-nan')
+        return self._transform_X_int(X_int, X_known)
+
+    def fit_transform(self, X, y):
+        """Fit the encoder and encode `X`.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data to encode.
+
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        X_out : ndarray of shape (n_samples, n_features)
+            Transformed input.
+        """
+        X_int, X_known = self._fit(X, y)
+        return self._transform_X_int(X_int, X_known)
+
+    def _transform_X_int(self, X_int, X_known):
+        """Transform integer-encoded X. `X_int` and `X_known` are returned
+        by `_transform`.
+        """
+        out = np.empty_like(X_int, dtype=float)
+        for i, cat_encoding in enumerate(self.encodings_):
+            np.take(cat_encoding, X_int[:, i], out=out[:, i])
+            # unknown categories are encoded with the overall target mean
+            out[~X_known[:, i], i] = self.encoding_mean_
+        return out
+
+    def _fit(self, X, y):
+        """Fit the encoder."""
+        y = column_or_1d(y, warn=True)
+        fit_results = super()._fit(X, handle_unknown='ignore',
+                                   return_counts=True,
+                                   force_all_finite='allow-nan')
+        X_int, X_known = self._transform(X, handle_unknown='ignore',
+                                         force_all_finite='allow-nan')
+        # make sure unknown categories are not used for fitting
+        X_int[~X_known] = -1
+        self.encoding_mean_ = y_mean = np.mean(y)
+
+        # if y is constant, the encoding of every category is that constant
+        if np.ptp(y) == 0.0:
+            self.encodings_ = [
+                np.full(len(cat), fill_value=y_mean, dtype=float)
+                for cat in self.categories_]
+            return X_int, X_known
+
+        y_variance = np.var(y)
+
+        n_samples, n_features = X_int.shape
+        # temporary location to store masks when encoding each category
+        tmp_mask = np.zeros(n_samples, dtype=bool)
+
+        cat_encodings = []
+        category_counts = fit_results['category_counts']
+
+        for i in range(n_features):
+            n_cats = len(self.categories_[i])
+            cat_means = np.zeros(n_cats, dtype=float)
+            cat_var_ratio = np.ones(n_cats, dtype=float)
+
+            for encoding in range(n_cats):
+                np.equal(X_int[:, i], encoding, out=tmp_mask)
+                y_tmp = y[tmp_mask]
+                if y_tmp.size:
+                    cat_means[encoding] = np.mean(y_tmp)
+                    cat_var_ratio[encoding] = np.var(y_tmp)
+
+            # partial-pooling estimates: multiplying the numerator and
+            # denominator of the GLM formula by sigma_c^2 gives
+            # (n_c * mu_c + ratio * mu) / (n_c + ratio)
+            # with ratio = sigma_c^2 / sigma^2
+            cat_counts = category_counts[i]
+            cat_var_ratio /= y_variance
+
+            cat_encoded = cat_counts * cat_means + cat_var_ratio * y_mean
+            cat_encoded /= cat_counts + cat_var_ratio
+            cat_encodings.append(cat_encoded)
+
+        self.encodings_ = cat_encodings
+        return X_int, X_known
diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py
new file mode 100644
index 0000000000000..e4493875e6783
--- /dev/null
+++ b/sklearn/preprocessing/tests/test_target_encoder.py
@@ -0,0 +1,180 @@
+import pytest
+import numpy as np
+from numpy.testing import assert_allclose
+
+from sklearn.preprocessing import TargetRegressorEncoder
+
+
+@pytest.mark.parametrize('categories', [
+    np.array([0, 1, 2], dtype=int),
+    np.array(['cat', 'dog', 'snake'], dtype=object)
+])
+@pytest.mark.parametrize('seed', range(3))
+def test_regression(categories, seed):
+    # check the encoder on a regression target against manually computed
+    # partial-pooling estimates
+    X_int = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=int).T
+    X_input = categories[X_int]
+    n_samples = X_input.shape[0]
+    rng = np.random.RandomState(seed)
+    y = rng.uniform(low=-10, high=20, size=n_samples)
+
+    # per-category statistics for multilevel partial pooling
+    y_sections = [y[:20], y[20:50], y[50:]]
+    cat_means = np.array([np.mean(sect) for sect in y_sections])
+    cat_variance = np.array([np.var(sect) for sect in y_sections])
+    cat_counts = np.array([20, 30, 40])
+
+    y_variance = np.var(y)
+    y_mean = np.mean(y)
+
+    # compute the multilevel partial-pooling estimates directly
+    cat_encoded = (cat_counts * cat_means / cat_variance +
+                   y_mean / y_variance)
+    cat_encoded /= (cat_counts / cat_variance + 1 / y_variance)
+
+    # shuffle
+    shuffled_idx = rng.permutation(n_samples)
+    X_input = X_input[shuffled_idx]
+    y = y[shuffled_idx]
+
+    enc = TargetRegressorEncoder().fit(X_input, y)
+
+    assert len(enc.encodings_) == 1
+    assert enc.encoding_mean_ == pytest.approx(y_mean)
+    assert_allclose(enc.encodings_[0], cat_encoded)
+
+    expected_encoding = np.take(cat_encoded, X_int[shuffled_idx, :])
+    X_trans = enc.transform(X_input)
+    assert_allclose(expected_encoding, X_trans)
+
+    # an unknown category is encoded with the overall target mean
+    X_trans = enc.transform(np.array([[5]], dtype=categories.dtype))
+    assert_allclose(X_trans, [[y_mean]])
+
+
+@pytest.mark.parametrize('categories', [
+    np.array([0, 1, 2], dtype=int),
+    np.array(['cat', 'dog', 'snake'], dtype=object),
+])
+def test_zero_variance_category(categories):
+    # When the target is constant for a given category, the encoding for
+    # that category should be the constant target value
+    X_int = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=int).T
+    X_input = categories[X_int]
+
+    # The targets of the first category are constant and have zero variance
+    y = np.array([10] * 20 + [-4] * 15 + [9] * 15 + [-6] * 30 + [25] * 10)
+
+    enc = TargetRegressorEncoder().fit(X_input, y)
+    X_test = np.array([[0]], dtype=int).T
+    X_test_input = categories[X_test]
+    X_trans = enc.transform(X_test_input)
+    assert_allclose(X_trans, [[10]])
+
+
+@pytest.mark.parametrize('categories', [
+    np.array([0, 1, 2], dtype=int),
+    np.array(['cat', 'dog', 'snake'], dtype=object),
+])
+def test_zero_variance_target(categories):
+    # If the target has zero variance, the overall target mean is used.
+    X_int = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=int).T
+    X_input = categories[X_int]
+    n_samples = X_input.shape[0]
+
+    y = np.ones(n_samples)
+
+    enc = TargetRegressorEncoder()
+    X_trans = enc.fit_transform(X_input, y)
+    expected_trans = np.full((n_samples, 1), fill_value=y.mean(), dtype=float)
+    assert_allclose(X_trans, expected_trans)
+
+
+@pytest.mark.parametrize("X, categories", [
+    (
+        np.array([[0] * 10 + [1] * 10 + [3]],
+                 dtype=int).T,  # 3 is unknown
+        [[0, 1, 2]]),
+    (
+        np.array([['cat'] * 10 + ['dog'] * 10 + ['snake']],
+                 dtype=object).T,  # snake is unknown
+        [['dog', 'cat', 'cow']]
+    ),
+])
+def test_custom_categories(X, categories):
+    # Test custom categories with known categories that are unseen during
+    # fit time
+    rng = np.random.RandomState(42)
+    y = rng.uniform(low=-10, high=20, size=X.shape[0])
+
+    enc = TargetRegressorEncoder(categories=categories)
+    X_trans = enc.fit_transform(X, y)
+
+    # The last element is unknown and is encoded with the overall target mean
+    assert_allclose(X_trans[-1], [y.mean()])
+
+    assert len(enc.encodings_) == 1
+    # a known category that is unseen during fit time is also encoded with
+    # the overall target mean
+    assert enc.encodings_[0][-1] == pytest.approx(y.mean())
+
+
+@pytest.mark.parametrize('to_pandas', [True, False])
+def test_multiple_features_sanity(to_pandas):
+    X = np.array([
+        [1, 1],
+        [0, 1],
+        [1, 1],
+        [0, 1],
+        [1, 0],
+        [0, 1],
+        [1, 0],
+        [0, 0],
+    ], dtype=int)
+    y = np.array([0, 1, 2, 3, 4, 5, 10, 7])
+    y_mean = np.mean(y)
+
+    X_test = np.array([
+        [0, 1],
+        [1, 0],
+        [2, 10],  # unknown
+    ], dtype=int)
+
+    if to_pandas:
+        pd = pytest.importorskip('pandas')
+        # convert the second feature to an object column
+        X_obj = np.array(['cat', 'dog'], dtype=object)[X[:, 1]]
+        X = pd.DataFrame({
+            'feat0': X[:, 0], 'feat1': X_obj}, columns=['feat0', 'feat1']
+        )
+        X_test = pd.DataFrame({
+            'feat0': X_test[:, 0], 'feat1': ['dog', 'cat', 'snake']
+        })
+
+    # manually compute the multilevel partial-pooling estimates
+    feat_0_cat_0_encoding = ((4 * 4. / 5. + 4. / 9.5) /
+                             (4 / 5. + 1 / 9.5))
+    feat_0_cat_1_encoding = ((4 * 4. / 14. + 4. / 9.5) /
+                             (4 / 14. + 1 / 9.5))
+
+    feat_1_cat_0_encoding = ((3 * 7. / 6. + 4. / 9.5) /
+                             (3 / 6. + 1 / 9.5))
+    feat_1_cat_1_encoding = ((5 * 2.2 / 2.96 + 4. / 9.5) /
+                             (5 / 2.96 + 1 / 9.5))
+
+    expected_encoding = [
+        [feat_0_cat_0_encoding, feat_0_cat_1_encoding],
+        [feat_1_cat_0_encoding, feat_1_cat_1_encoding]
+    ]
+
+    enc = TargetRegressorEncoder().fit(X, y)
+    assert_allclose(expected_encoding, enc.encodings_)
+    assert enc.encoding_mean_ == pytest.approx(y_mean)
+
+    X_trans = enc.transform(X_test)
+    X_trans_expected = np.array([
+        [feat_0_cat_0_encoding, feat_1_cat_1_encoding],
+        [feat_0_cat_1_encoding, feat_1_cat_0_encoding],
+        [y_mean, y_mean],  # unknown categories map to y_mean
+    ])
+    assert_allclose(X_trans, X_trans_expected)
diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py
index b43afa998698b..6e998320e75a8 100644
--- a/sklearn/utils/_encode.py
+++ b/sklearn/utils/_encode.py
@@ -1,10 +1,12 @@
+from contextlib import suppress
+from collections import Counter
 from typing import NamedTuple

 import numpy as np

 from . import is_scalar_nan


-def _unique(values, *, return_inverse=False):
+def _unique(values, *, return_inverse=False, return_counts=False):
     """Helper function to find unique values with support for python objects.

     Uses pure python method for object dtype, and numpy method for
@@ -18,6 +20,10 @@
     return_inverse : bool, default=False
         If True, also return the indices of the unique values.

+    return_counts : bool, default=False
+        If True, also return the number of times each unique item appears in
+        values.
+
     Returns
     -------
     unique : ndarray
@@ -26,16 +32,35 @@
     unique_inverse : ndarray
         The indices to reconstruct the original array from the unique array.
         Only provided if `return_inverse` is True.
+
+    unique_counts : ndarray
+        The number of times each of the unique values comes up in the original
+        array. Only provided if `return_counts` is True.
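+
+    Examples
+    --------
+    An illustrative sketch (repeated `nan` values are grouped into a single
+    entry at the end of `uniques`, with their counts summed):
+
+    >>> import numpy as np
+    >>> values = np.array([2.0, 1.0, np.nan, 1.0, np.nan])
+    >>> uniques, counts = _unique(values, return_counts=True)
+    >>> uniques
+    array([ 1.,  2., nan])
+    >>> counts
+    array([2, 1, 2])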
     """
     if values.dtype == object:
-        return _unique_python(values, return_inverse=return_inverse)
+        return _unique_python(values, return_inverse=return_inverse,
+                              return_counts=return_counts)
     # numerical
-    out = np.unique(values, return_inverse=return_inverse)
+    return _unique_np(values, return_inverse=return_inverse,
+                      return_counts=return_counts)
+
+
+def _unique_np(values, return_inverse=False, return_counts=False):
+    """Helper function to find unique values for numpy arrays, correctly
+    accounting for nans. See `_unique` documentation for details."""
+    uniques = np.unique(values, return_inverse=return_inverse,
+                        return_counts=return_counts)
+
+    inverse, counts = None, None
+
+    if return_counts:
+        *uniques, counts = uniques

     if return_inverse:
-        uniques, inverse = out
-    else:
-        uniques = out
+        *uniques, inverse = uniques
+
+    if return_counts or return_inverse:
+        uniques = uniques[0]

     # np.unique will have duplicate missing values at the end of `uniques`
     # here we clip the nans and remove it from uniques
@@ -45,9 +70,19 @@
         if return_inverse:
             inverse[inverse > nan_idx] = nan_idx

+        if return_counts:
+            counts[nan_idx] = np.sum(counts[nan_idx:])
+            counts = counts[:nan_idx + 1]
+
+    ret = (uniques, )
+
     if return_inverse:
-        return uniques, inverse
-    return uniques
+        ret += (inverse, )
+
+    if return_counts:
+        ret += (counts, )
+
+    return ret[0] if len(ret) == 1 else ret

 class MissingValues(NamedTuple):
@@ -123,7 +158,7 @@
     return np.array([table[v] for v in values])


-def _unique_python(values, *, return_inverse):
+def _unique_python(values, *, return_inverse, return_counts):
     # Only used in `_uniques`, see docstring there for details
     try:
         uniques_set = set(values)
@@ -137,11 +172,15 @@
                      for t in set(type(v) for v in values))
         raise TypeError("Encoders require their input to be uniformly "
                         f"strings or numbers. Got {types}")
+    ret = (uniques, )

     if return_inverse:
-        return uniques, _map_to_integer(values, uniques)
+        ret += (_map_to_integer(values, uniques), )

-    return uniques
+    if return_counts:
+        ret += (_get_counts(values, uniques), )
+
+    return ret[0] if len(ret) == 1 else ret

 def _encode(values, *, uniques, check_unknown=True):
@@ -267,3 +306,53 @@
     if return_mask:
         return diff, valid_mask
     return diff
+
+
+class _NaNCounter(Counter):
+    """Counter with support for nan values."""
+    def __init__(self, items):
+        super().__init__(self._generate_items(items))
+
+    def _generate_items(self, items):
+        """Generate items without nans. Stores the nan counts separately."""
+        for item in items:
+            if not is_scalar_nan(item):
+                yield item
+                continue
+            if not hasattr(self, 'nan_count'):
+                self.nan_count = 0
+            self.nan_count += 1
+
+    def __missing__(self, key):
+        if hasattr(self, 'nan_count') and is_scalar_nan(key):
+            return self.nan_count
+        raise KeyError(key)
+
+
+def _get_counts(values, uniques):
+    """Get the count of each of the `uniques` in `values`.
+
+    The counts will use the order passed in by `uniques`.
+
+    For non-object dtypes, `uniques` is assumed to be sorted.
+    """
+    if values.dtype.kind in 'OU':
+        counter = _NaNCounter(values)
+        output = np.zeros(len(uniques), dtype=np.int64)
+        for i, item in enumerate(uniques):
+            with suppress(KeyError):
+                output[i] = counter[item]
+        return output
+
+    unique_values, counts = _unique_np(values, return_counts=True)
+    uniques_in_values = np.isin(uniques, unique_values, assume_unique=True)
+
+    # If there are nans, they will be mapped to the end.
+    if np.isnan(unique_values[-1]) and np.isnan(uniques[-1]):
+        uniques_in_values[-1] = True
+
+    unique_valid_indices = np.searchsorted(unique_values,
+                                           uniques[uniques_in_values])
+
+    output = np.zeros_like(uniques, dtype=np.int64)
+    output[uniques_in_values] = counts[unique_valid_indices]
+    return output
diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py
index 53c380e192341..5670449cd3d2d 100644
--- a/sklearn/utils/tests/test_encode.py
+++ b/sklearn/utils/tests/test_encode.py
@@ -7,23 +7,44 @@
 from sklearn.utils._encode import _unique
 from sklearn.utils._encode import _encode
 from sklearn.utils._encode import _check_unknown
+from sklearn.utils._encode import _get_counts


 @pytest.mark.parametrize(
     "values, expected",
     [(np.array([2, 1, 3, 1, 3], dtype='int64'),
       np.array([1, 2, 3], dtype='int64')),
+     (np.array([2, 1, np.nan, 1, np.nan], dtype='float32'),
+      np.array([1, 2, np.nan], dtype='float32')),
      (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
       np.array(['a', 'b', 'c'], dtype=object)),
+     (np.array(['b', 'a', None, 'a', None], dtype=object),
+      np.array(['a', 'b', None], dtype=object)),
      (np.array(['b', 'a', 'c', 'a', 'c']),
       np.array(['a', 'b', 'c']))],
-    ids=['int64', 'object', 'str'])
+    ids=['int64', 'float32-nan', 'object',
+         'object-None', 'str'])
 def test_encode_util(values, expected):
     uniques = _unique(values)
     assert_array_equal(uniques, expected)
+
+    result, encoded = _unique(values, return_inverse=True)
+    assert_array_equal(result, expected)
+    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+
     encoded = _encode(values, uniques=uniques)
     assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))

+    result, counts = _unique(values, return_counts=True)
+    assert_array_equal(result, expected)
+    assert_array_equal(counts, np.array([2, 1, 2]))
+
+    result, encoded, counts = _unique(values, return_inverse=True,
+                                      return_counts=True)
+    assert_array_equal(result, expected)
+    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+    assert_array_equal(counts, np.array([2, 1, 2]))
+

 def test_encode_with_check_unknown():
     # test for the check_unknown parameter of _encode()
@@ -206,3 +227,24 @@
     assert np.isnan(diff[1])
     assert_array_equal(valid_mask,
                        [False, True, True, True, False, False, False])
+
+
+@pytest.mark.parametrize("values, uniques, expected_counts", [
+    (np.array([1] * 10 + [2] * 4 + [3] * 15),
+     np.array([1, 2, 3]), [10, 4, 15]),
+    (np.array([1] * 10 + [2] * 4 + [3] * 15),
+     np.array([1, 2, 3, 5]), [10, 4, 15, 0]),
+    (np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
+     np.array([2, 3, np.nan]), [4, 15, 10]),
+    (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+     ['a', 'b', 'c'], [16, 4, 20]),
+    (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+     ['c', 'b', 'a'], [20, 4, 16]),
+    (np.array([np.nan] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+     ['c', np.nan, 'a'], [20, 4, 16]),
+    (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
+     ['a', 'b', 'c', 'e'], [16, 4, 20, 0]),
+])
+def test_get_counts(values, uniques, expected_counts):
+    counts = _get_counts(values, uniques)
+    assert_array_equal(counts, expected_counts)