"""
============================================
Comparing Target Encoder with Other Encoders
============================================

.. currentmodule:: sklearn.preprocessing

The :class:`TargetEncoder` uses the value of the target to encode each
categorical feature. In this example, we will compare four different approaches
for handling categorical features: :class:`TargetEncoder`,
:class:`OrdinalEncoder`, :class:`OneHotEncoder` and dropping the category.

.. note::
    `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
    cross-validation scheme is used in `fit_transform` for encoding. See the
    :ref:`User Guide <target_encoder>` for details.
"""
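
# %%
# As a quick illustration of the note above, the two call patterns can be
# compared on a small synthetic dataset (toy data used purely for this sketch,
# not the wine reviews data used below). The cross-fitted output of
# `fit_transform` generally differs from calling `transform` after `fit` on
# the same data:
import numpy as np

from sklearn.preprocessing import TargetEncoder

rng = np.random.default_rng(0)
X_toy = rng.choice(["a", "b", "c"], size=(100, 1))
y_toy = rng.normal(size=100)

toy_encoder = TargetEncoder(target_type="continuous", random_state=0)
# `fit_transform` uses internal cross fitting: each sample is encoded with
# statistics computed on the other folds only.
cross_fitted = toy_encoder.fit_transform(X_toy, y_toy)
# `fit(...).transform(...)` encodes with statistics from the full training set.
refit_encoded = toy_encoder.fit(X_toy, y_toy).transform(X_toy)
print("identical encodings:", np.allclose(cross_fitted, refit_encoded))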

# %%
# Loading Data from OpenML
# ========================
# First, we load the wine reviews dataset, where the target is the points given
# by a reviewer:
from sklearn.datasets import fetch_openml

wine_reviews = fetch_openml(data_id=42074, as_frame=True, parser="pandas")

df = wine_reviews.frame
df.head()

# %%
# For this example, we use the following subset of numerical and categorical
# features in the data. The target consists of continuous values ranging from
# 80 to 100:
numerical_features = ["price"]
categorical_features = [
    "country",
    "province",
    "region_1",
    "region_2",
    "variety",
    "winery",
]
target_name = "points"

X = df[numerical_features + categorical_features]
y = df[target_name]

_ = y.hist()

# %%
# Training and Evaluating Pipelines with Different Encoders
# =========================================================
# In this section, we will evaluate pipelines that use
# :class:`~sklearn.ensemble.HistGradientBoostingRegressor` with different encoding
# strategies. First, we list out the encoders we will be using to preprocess
# the categorical features:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import TargetEncoder

categorical_preprocessors = [
    ("drop", "drop"),
    ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    (
        "one_hot",
        OneHotEncoder(handle_unknown="ignore", max_categories=20, sparse_output=False),
    ),
    ("target", TargetEncoder(target_type="continuous")),
]

# %%
# Next, we evaluate the models using cross validation and record the results:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.ensemble import HistGradientBoostingRegressor

n_cv_folds = 3
max_iter = 20
results = []


def evaluate_model_and_store(name, pipe):
    result = cross_validate(
        pipe,
        X,
        y,
        scoring="neg_root_mean_squared_error",
        cv=n_cv_folds,
        return_train_score=True,
    )
    rmse_test_score = -result["test_score"]
    rmse_train_score = -result["train_score"]
    results.append(
        {
            "preprocessor": name,
            "rmse_test_mean": rmse_test_score.mean(),
            "rmse_test_std": rmse_test_score.std(),
            "rmse_train_mean": rmse_train_score.mean(),
            "rmse_train_std": rmse_train_score.std(),
        }
    )


for name, categorical_preprocessor in categorical_preprocessors:
    preprocessor = ColumnTransformer(
        [
            ("numerical", "passthrough", numerical_features),
            ("categorical", categorical_preprocessor, categorical_features),
        ]
    )
    pipe = make_pipeline(
        preprocessor, HistGradientBoostingRegressor(random_state=0, max_iter=max_iter)
    )
    evaluate_model_and_store(name, pipe)


# %%
# Native Categorical Feature Support
# ==================================
# In this section, we build and evaluate a pipeline that uses native categorical
# feature support in :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,
# which only supports up to 255 unique categories. In our dataset, most of
# the categorical features have more than 255 unique categories:
n_unique_categories = df[categorical_features].nunique().sort_values(ascending=False)
n_unique_categories

# %%
# To work around the limitation above, we group the categorical features into
# low cardinality and high cardinality features. The high cardinality features
# will be target encoded and the low cardinality features will use the native
# categorical feature support in gradient boosting.
high_cardinality_features = n_unique_categories[n_unique_categories > 255].index
low_cardinality_features = n_unique_categories[n_unique_categories <= 255].index
mixed_encoded_preprocessor = ColumnTransformer(
    [
        ("numerical", "passthrough", numerical_features),
        (
            "high_cardinality",
            TargetEncoder(target_type="continuous"),
            high_cardinality_features,
        ),
        (
            "low_cardinality",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            low_cardinality_features,
        ),
    ],
    verbose_feature_names_out=False,
)

# The output of the preprocessor must be set to pandas so the
# gradient boosting model can detect the low cardinality features.
mixed_encoded_preprocessor.set_output(transform="pandas")
mixed_pipe = make_pipeline(
    mixed_encoded_preprocessor,
    HistGradientBoostingRegressor(
        random_state=0, max_iter=max_iter, categorical_features=low_cardinality_features
    ),
)
mixed_pipe

# %%
# Finally, we evaluate the pipeline using cross validation and record the results:
evaluate_model_and_store("mixed_target", mixed_pipe)

# %%
# Plotting the Results
# ====================
# In this section, we display the results by plotting the test and train scores:
import matplotlib.pyplot as plt
import pandas as pd

results_df = (
    pd.DataFrame(results).set_index("preprocessor").sort_values("rmse_test_mean")
)

fig, (ax1, ax2) = plt.subplots(
    1, 2, figsize=(12, 8), sharey=True, constrained_layout=True
)
xticks = range(len(results_df))
name_to_color = dict(
    zip((r["preprocessor"] for r in results), ["C0", "C1", "C2", "C3", "C4"])
)

for subset, ax in zip(["test", "train"], [ax1, ax2]):
    mean, std = f"rmse_{subset}_mean", f"rmse_{subset}_std"
    data = results_df[[mean, std]].sort_values(mean)
    ax.bar(
        x=xticks,
        height=data[mean],
        yerr=data[std],
        width=0.9,
        color=[name_to_color[name] for name in data.index],
    )
    ax.set(
        title=f"RMSE ({subset.title()})",
        xlabel="Encoding Scheme",
        xticks=xticks,
        xticklabels=data.index,
    )

# %%
# When evaluating the predictive performance on the test set, dropping the
# categorical features performs the worst and the target encoders perform the
# best. This can be explained as follows:
#
# - Dropping the categorical features makes the pipeline less expressive and
#   it underfits as a result;
# - Due to the high cardinality and to reduce the training time, the one-hot
#   encoding scheme uses `max_categories=20`, which prevents the features from
#   expanding too much but can result in underfitting;
# - If we had not set `max_categories=20`, the one-hot encoding scheme would have
#   likely made the pipeline overfit as the number of features explodes with rare
#   category occurrences that are correlated with the target by chance (on the
#   training set only);
# - The ordinal encoding imposes an arbitrary order on the features, which are then
#   treated as numerical values by the
#   :class:`~sklearn.ensemble.HistGradientBoostingRegressor` (as illustrated by the
#   short snippet after this list). Since this model groups numerical features into
#   256 bins per feature, many unrelated categories can be grouped together and as a
#   result the overall pipeline can underfit;
# - When using the target encoder, the same binning happens, but since the encoded
#   values are statistically ordered by marginal association with the target variable,
#   the binning used by the :class:`~sklearn.ensemble.HistGradientBoostingRegressor`
#   makes sense and leads to good results: the combination of smoothed target
#   encoding and binning works as a good regularizing strategy against
#   overfitting while not limiting the expressiveness of the pipeline too much.
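
# %%
# As a small illustration of the arbitrary ordering mentioned above (using a few
# hypothetical grape variety names rather than the wine dataset itself),
# :class:`OrdinalEncoder` assigns integer codes in lexicographic order, so the
# resulting numeric values carry no information about the target:
import numpy as np

toy_varieties = np.array([["merlot"], ["zinfandel"], ["chardonnay"]])
# "chardonnay" -> 0.0, "merlot" -> 1.0, "zinfandel" -> 2.0
OrdinalEncoder().fit_transform(toy_varieties)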