Use Poisson GBRT · scikit-learn/scikit-learn@1fe4fb1 · GitHub
Commit 1fe4fb1: Use Poisson GBRT
1 parent 039e87b · commit 1fe4fb1

1 file changed (+61 -45 lines)

examples/linear_model/plot_poisson_regression_non_normal_loss.py
61 additions, 45 deletions
@@ -181,17 +181,18 @@ def score_estimator(estimator, df_test):
 score_estimator(dummy, df_test)
 
 ##############################################################################
-# Linear models
-# -------------
+# (Generalized) Linear models
+# ---------------------------
 #
-# We start by modeling the target variable with the (penalized) least squares
-# linear regression model. We use a low penalization as we expect such a linear
-# model to under-fit on such a large dataset.
+# We start by modeling the target variable with the (l2 penalized) least
+# squares linear regression model, more commonly known as Ridge regression. We
+# use a low penalization as we expect such a linear model to under-fit on such
+# a large dataset.
 
 from sklearn.linear_model import Ridge
 
 
-ridge = Pipeline([
+ridge_glm = Pipeline([
     ("preprocessor", linear_model_preprocessor),
     ("regressor", Ridge(alpha=1e-6)),
 ]).fit(df_train, df_train["Frequency"],
@@ -206,39 +207,44 @@ def score_estimator(estimator, df_test):
 # meta-estimator to map ``y_pred`` to a strictly positive domain.
 
 print("Ridge evaluation:")
-score_estimator(ridge, df_test)
+score_estimator(ridge_glm, df_test)
 
 ##############################################################################
 # Next we fit the Poisson regressor on the target variable. We set the
-# regularization strength ``alpha`` to 1 over number of samples in oder to
-# mimic the Ridge regressor whose L2 penalty term scales differently with the
-# number of samples.
+# regularization strength ``alpha`` to approximately 1e-6 over the number of
+# samples in order to mimic the Ridge regressor whose L2 penalty term scales
+# differently with the number of samples.
 
 from sklearn.linear_model import PoissonRegressor
 
 n_samples = df_train.shape[0]
 
-poisson = Pipeline([
+poisson_glm = Pipeline([
     ("preprocessor", linear_model_preprocessor),
-    ("regressor", PoissonRegressor(alpha=1e-6/n_samples, max_iter=1000))
+    ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300))
 ])
-poisson.fit(df_train, df_train["Frequency"],
-            regressor__sample_weight=df_train["Exposure"])
+poisson_glm.fit(df_train, df_train["Frequency"],
+                regressor__sample_weight=df_train["Exposure"])
 
 print("PoissonRegressor evaluation:")
-score_estimator(poisson, df_test)
+score_estimator(poisson_glm, df_test)
 
 ##############################################################################
 # Finally, we will consider a non-linear model, namely Gradient Boosting
 # Regression Trees. Tree-based models do not require the categorical data to be
 # one-hot encoded: instead, we can encode each category label with an arbitrary
 # integer using :class:`preprocessing.OrdinalEncoder`. With this encoding, the
-# trees will treat the categorical features as ordered features, which
-# might not be always a desired behavior. However this effect is limited for
-# deep enough trees which are able to recover the categorical nature of the
+# trees will treat the categorical features as ordered features, which might
+# not always be a desired behavior. However, this effect is limited for deep
+# enough trees which are able to recover the categorical nature of the
 # features. The main advantage of the :class:`preprocessing.OrdinalEncoder`
 # over the :class:`preprocessing.OneHotEncoder` is that it will make training
 # faster.
+#
+# Gradient Boosting also makes it possible to fit the trees with a Poisson
+# loss (with an implicit log-link function) instead of the default
+# least-squares loss. Here we only fit trees with the Poisson loss to keep this
+# example concise.
 
 from sklearn.experimental import enable_hist_gradient_boosting  # noqa
 from sklearn.ensemble import HistGradientBoostingRegressor
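For context on the ``alpha`` change in this hunk: ``Ridge`` adds its L2 penalty to a summed squared error, while ``PoissonRegressor`` adds it to a mean deviance, so dividing the Ridge penalty by the number of training rows keeps the relative regularization strength comparable. A minimal sketch of that scaling, not part of the example; the sample count below is an assumed placeholder:

from sklearn.linear_model import PoissonRegressor, Ridge

alpha_ridge = 1e-6        # the value used by the ridge_glm pipeline above
n_samples = 500_000       # placeholder, roughly the size of the training set

# Ridge penalizes alpha * ||w||^2 on top of a *summed* squared error, whereas
# PoissonRegressor penalizes on top of a *mean* deviance, so a comparable
# relative strength is obtained by dividing alpha by n_samples.
ridge_like = Ridge(alpha=alpha_ridge)
poisson_like = PoissonRegressor(alpha=alpha_ridge / n_samples, max_iter=300)
print(poisson_like.alpha)  # on the order of 1e-12, hence the hard-coded value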
@@ -254,15 +260,16 @@ def score_estimator(estimator, df_test):
     ],
     remainder="drop",
 )
-gbrt = Pipeline([
+poisson_gbrt = Pipeline([
     ("preprocessor", tree_preprocessor),
-    ("regressor", HistGradientBoostingRegressor(max_leaf_nodes=128)),
+    ("regressor", HistGradientBoostingRegressor(loss="poisson",
+                                                 max_leaf_nodes=128)),
 ])
-gbrt.fit(df_train, df_train["Frequency"],
-         regressor__sample_weight=df_train["Exposure"])
+poisson_gbrt.fit(df_train, df_train["Frequency"],
+                 regressor__sample_weight=df_train["Exposure"])
 
-print("Gradient Boosted Trees evaluation:")
-score_estimator(gbrt, df_test)
+print("Poisson Gradient Boosted Trees evaluation:")
+score_estimator(poisson_gbrt, df_test)
 
 
 ##############################################################################
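A small self-contained check, not part of this diff, of what ``loss="poisson"`` buys here: the Poisson loss uses an implicit log link, so the boosted trees return strictly positive expected frequencies even when most training targets are zero. The data below is synthetic, and the experimental import is only needed on the scikit-learn versions this commit targets (0.23/0.24):

import numpy as np

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(1000, 2))
# Non-negative integer targets with many zeros, mimicking claim counts.
y = rng.poisson(lam=np.exp(X[:, 0]) - 0.9)

poisson_booster = HistGradientBoostingRegressor(loss="poisson",
                                                max_leaf_nodes=128)
poisson_booster.fit(X, y)
# The log link keeps the predicted expected frequency strictly positive.
print(poisson_booster.predict(X).min() > 0)   # expected: True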
@@ -294,7 +301,7 @@ def score_estimator(estimator, df_test):
     axes[row_idx, 0].set_ylim([1e1, 5e5])
     axes[row_idx, 0].set_ylabel(label + " samples")
 
-    for idx, model in enumerate([ridge, poisson, gbrt]):
+    for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]):
         y_pred = model.predict(df)
 
         pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins),
@@ -311,17 +318,26 @@ def score_estimator(estimator, df_test):
 # models, we predict the expected frequency of a random variable, so we will
 # have necessarily fewer extreme values than for the observed realizations of
 # that random variable. Additionally, the normal distribution used in ``Ridge``
-# and ``HistGradientBoostingRegressor`` has a constant variance, while for the
-# Poisson distribution used in ``PoissonRegressor``, the variance is
+# has a constant variance, while for the Poisson distribution used in
+# ``PoissonRegressor`` and ``HistGradientBoostingRegressor``, the variance is
 # proportional to the predicted expected value.
 #
-# Thus, among the considered estimators, ``PoissonRegressor`` is a-priori
-# better suited for modeling the long tail distribution of the non-negative
-# data as compared to its ``Ridge`` counter-part.
+# Thus, among the considered estimators, ``PoissonRegressor`` and
+# ``HistGradientBoostingRegressor`` are a-priori better suited for modeling
+# the long tail distribution of the non-negative data as compared to the
+# ``Ridge`` model which makes a wrong assumption about the distribution of the
+# target variable.
+#
+# The ``HistGradientBoostingRegressor`` estimator has the most flexibility and
+# is able to predict higher expected values.
 #
-# The ``HistGradientBoostingRegressor`` estimator has more flexibility and is
-# able to predict higher expected values while still assuming a normal
-# distribution with constant variance for the response variable.
+# Note that we could have used the least squares loss for the
+# ``HistGradientBoostingRegressor`` model. This would wrongly assume a normal
+# distribution for the response variable as for the ``Ridge`` model and
+# possibly also lead to slightly negative predictions. However, the gradient
+# boosted trees would still perform relatively well and in particular better
+# than ``PoissonRegressor`` thanks to the flexibility of the trees combined
+# with the large number of training samples.
 #
 # Evaluation of the calibration of predictions
 # --------------------------------------------
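A quick numerical illustration, not part of the example, of the mean/variance argument rewritten in the hunk above: the variance of a Poisson distribution grows with its mean, whereas a normal distribution keeps whatever variance it was given.

import numpy as np

rng = np.random.RandomState(0)
for mean in [0.1, 1.0, 10.0]:
    poisson_sample = rng.poisson(lam=mean, size=100_000)
    normal_sample = rng.normal(loc=mean, scale=1.0, size=100_000)
    print(f"mean={mean:5.1f}  "
          f"Poisson variance={poisson_sample.var():6.2f}  "
          f"normal variance={normal_sample.var():6.2f}")
# The Poisson variance tracks the mean (about 0.1, 1 and 10), while the
# normal variance stays near 1 for all three means.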
@@ -382,25 +398,25 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
 fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
 plt.subplots_adjust(wspace=0.3)
 
-for axi, model in zip(ax.ravel(), [dummy, ridge, poisson, gbrt]):
+for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt,
+                                   dummy]):
     y_pred = model.predict(df_test)
     y_true = df_test["Frequency"].values
     exposure = df_test["Exposure"].values
     q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group(
         y_true, y_pred, sample_weight=exposure, n_bins=10)
 
-    # Name of the model after the class of the estimator used in the last step
-    # of the pipeline.
-    model_name = model.steps[-1][1].__class__.__name__
-    print(f"Predicted number of claims by {model_name}: "
+    # Name of the model after the estimator used in the last step of the
+    # pipeline.
+    print(f"Predicted number of claims by {model[-1]}: "
           f"{np.sum(y_pred * exposure):.1f}")
 
     axi.plot(q, y_pred_seg, marker='x', linestyle="--", label="predictions")
     axi.plot(q, y_true_seg, marker='o', linestyle="--", label="observations")
     axi.set_xlim(0, 1.0)
     axi.set_ylim(0, 0.5)
     axi.set(
-        title=model[-1].__class__.__name__,
+        title=model[-1],
         xlabel='Fraction of samples sorted by y_pred',
         ylabel='Mean Frequency (y_pred)'
     )
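For context on the ``model[-1]`` change in this hunk: indexing a ``Pipeline`` with an integer returns the estimator of that step, and its repr (which on scikit-learn 0.23+ shows only the non-default parameters) is what now appears in the printed output and subplot titles instead of the bare class name. A minimal sketch, not part of the example:

from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([("scaler", StandardScaler()),
                 ("regressor", Ridge(alpha=1e-6))])

# Indexing the pipeline returns the estimator of the corresponding step.
print(pipe[-1])                     # e.g. Ridge(alpha=1e-06)
print(pipe[-1].__class__.__name__)  # Ridge (the older, shorter label)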
@@ -409,8 +425,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
 
 ##############################################################################
 # The dummy regression model predicts a constant frequency. This model does not
-# attribute the same tied rank to all samples but is none-the-less well
-# calibrated.
+# attribute the same tied rank to all samples but is nonetheless globally
+# well calibrated (to estimate the mean frequency of the entire population).
 #
 # The ``Ridge`` regression model can predict very low expected frequencies that
 # do not match the data. It can therefore severely under-estimate the risk for
@@ -460,13 +476,12 @@ def lorenz_curve(y_true, y_pred, exposure):
 
 fig, ax = plt.subplots(figsize=(8, 8))
 
-for model in [dummy, ridge, poisson, gbrt]:
+for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]:
     y_pred = model.predict(df_test)
     cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred,
                                             df_test["Exposure"])
     gini = 1 - 2 * auc(cum_exposure, cum_claims)
-    label = "{} (Gini: {:.2f})".format(
-        model[-1].__class__.__name__, gini)
+    label = "{} (Gini: {:.2f})".format(model[-1], gini)
     ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)
 
 # Oracle model: y_pred == y_test
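For readers who want the Gini computation above in isolation: below is a sketch of an exposure-weighted Lorenz curve consistent with how ``lorenz_curve`` is used in this loop. It is an illustrative reimplementation under stated assumptions, not necessarily the exact helper defined earlier in the example; the function name and the synthetic data are placeholders. Policies are ranked by predicted risk, the cumulative share of observed claims is accumulated against the cumulative share of exposure, and the Gini index is ``1 - 2 * auc``.

import numpy as np
from sklearn.metrics import auc


def lorenz_curve_sketch(y_true, y_pred, exposure):
    """Rank policies by predicted risk and accumulate the share of observed
    claims against the share of exposure."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    exposure = np.asarray(exposure)

    ranking = np.argsort(y_pred)              # lowest to highest predicted risk
    ranked_exposure = exposure[ranking]
    ranked_claims = y_true[ranking] * ranked_exposure
    cum_claims = np.cumsum(ranked_claims) / ranked_claims.sum()
    cum_exposure = np.cumsum(ranked_exposure) / ranked_exposure.sum()
    return cum_exposure, cum_claims


# Toy usage on synthetic data: informative predictions concentrate the claims
# at the high-risk end of the ranking, which yields a positive Gini index.
rng = np.random.RandomState(0)
y_pred = rng.uniform(size=1000)
exposure = np.ones(1000)
y_true = rng.poisson(lam=5 * y_pred)          # observed frequencies
cum_exposure, cum_claims = lorenz_curve_sketch(y_true, y_pred, exposure)
print("Gini:", 1 - 2 * auc(cum_exposure, cum_claims))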
@@ -499,7 +514,8 @@ def lorenz_curve(y_true, y_pred, exposure):
 #
 # This last point is expected due to the nature of the problem: the occurrence
 # of accidents is mostly dominated by circumstantial causes that are not
-# captured in the columns of the dataset or that are indeed random.
+# captured in the columns of the dataset and can indeed be considered as purely
+# random.
 #
 # The linear models assume no interactions between the input variables which
 # likely causes under-fitting. Inserting a polynomial feature extractor
