@@ -181,17 +181,18 @@ def score_estimator(estimator, df_test):
score_estimator(dummy, df_test)

##############################################################################
- # Linear models
- # -------------
+ # (Generalized) Linear models
+ # ---------------------------
#
- # We start by modeling the target variable with the (penalized) least squares
- # linear regression model. We use a low penalization as we expect such a linear
- # model to under-fit on such a large dataset.
+ # We start by modeling the target variable with the (l2 penalized) least
+ # squares linear regression model, more commonly known as Ridge regression. We
+ # use a low penalization as we expect such a linear model to under-fit on such
+ # a large dataset.

from sklearn.linear_model import Ridge


- ridge = Pipeline([
+ ridge_glm = Pipeline([
    ("preprocessor", linear_model_preprocessor),
    ("regressor", Ridge(alpha=1e-6)),
]).fit(df_train, df_train["Frequency"],
@@ -206,39 +207,44 @@ def score_estimator(estimator, df_test):
# meta-estimator to map ``y_pred`` to a strictly positive domain.

print("Ridge evaluation:")
- score_estimator(ridge, df_test)
+ score_estimator(ridge_glm, df_test)

##############################################################################
# Next we fit the Poisson regressor on the target variable. We set the
- # regularization strength ``alpha`` to 1 over number of samples in oder to
- # mimic the Ridge regressor whose L2 penalty term scales differently with the
- # number of samples.
+ # regularization strength ``alpha`` to approximately 1e-6 over the number of
+ # samples in order to mimic the Ridge regressor whose L2 penalty term scales
+ # differently with the number of samples.

from sklearn.linear_model import PoissonRegressor

n_samples = df_train.shape[0]

- poisson = Pipeline([
+ poisson_glm = Pipeline([
    ("preprocessor", linear_model_preprocessor),
-     ("regressor", PoissonRegressor(alpha=1e-6 / n_samples, max_iter=1000))
+     ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300))
])
- poisson.fit(df_train, df_train["Frequency"],
-             regressor__sample_weight=df_train["Exposure"])
+ poisson_glm.fit(df_train, df_train["Frequency"],
+                 regressor__sample_weight=df_train["Exposure"])

print("PoissonRegressor evaluation:")
- score_estimator(poisson, df_test)
+ score_estimator(poisson_glm, df_test)
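For reference, a quick check of the scaling mentioned in the comment above (a minimal illustrative sketch, not part of the patch itself, reusing the ``n_samples`` variable defined a few lines earlier):

    # The Ridge penalty divided by the number of training samples is indeed of
    # the order of the alpha=1e-12 passed to PoissonRegressor above.
    print(f"1e-6 / n_samples = {1e-6 / n_samples:.1e}")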

##############################################################################
# Finally, we will consider a non-linear model, namely Gradient Boosting
# Regression Trees. Tree-based models do not require the categorical data to be
# one-hot encoded: instead, we can encode each category label with an arbitrary
# integer using :class:`preprocessing.OrdinalEncoder`. With this encoding, the
- # trees will treat the categorical features as ordered features, which
- # might not be always a desired behavior. However this effect is limited for
- # deep enough trees which are able to recover the categorical nature of the
+ # trees will treat the categorical features as ordered features, which might
+ # not always be a desired behavior. However this effect is limited for deep
+ # enough trees which are able to recover the categorical nature of the
# features. The main advantage of the :class:`preprocessing.OrdinalEncoder`
# over the :class:`preprocessing.OneHotEncoder` is that it will make training
# faster.
+ #
+ # Gradient Boosting also gives the possibility to fit the trees with a Poisson
+ # loss (with an implicit log-link function) instead of the default
+ # least-squares loss. Here we only fit trees with the Poisson loss to keep this
+ # example concise.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
@@ -254,15 +260,16 @@ def score_estimator(estimator, df_test):
    ],
    remainder="drop",
)
- gbrt = Pipeline([
+ poisson_gbrt = Pipeline([
    ("preprocessor", tree_preprocessor),
-     ("regressor", HistGradientBoostingRegressor(max_leaf_nodes=128)),
+     ("regressor", HistGradientBoostingRegressor(loss="poisson",
+                                                 max_leaf_nodes=128)),
])
- gbrt.fit(df_train, df_train["Frequency"],
-          regressor__sample_weight=df_train["Exposure"])
+ poisson_gbrt.fit(df_train, df_train["Frequency"],
+                  regressor__sample_weight=df_train["Exposure"])

- print("Gradient Boosted Trees evaluation:")
- score_estimator(gbrt, df_test)
+ print("Poisson Gradient Boosted Trees evaluation:")
+ score_estimator(poisson_gbrt, df_test)
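Because the Poisson loss relies on an implicit log-link, the gradient boosted trees should only predict strictly positive frequencies. A minimal check, reusing the fitted ``poisson_gbrt`` pipeline from the lines above:

    # With loss="poisson" the inverse link (exp) is applied at predict time,
    # so the minimum predicted frequency is expected to be strictly positive.
    print("Minimum predicted frequency:", poisson_gbrt.predict(df_test).min())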
##############################################################################
@@ -294,7 +301,7 @@ def score_estimator(estimator, df_test):
    axes[row_idx, 0].set_ylim([1e1, 5e5])
    axes[row_idx, 0].set_ylabel(label + " samples")

-     for idx, model in enumerate([ridge, poisson, gbrt]):
+     for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]):
        y_pred = model.predict(df)

        pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins),
@@ -311,17 +318,26 @@ def score_estimator(estimator, df_test):
# models, we predict the expected frequency of a random variable, so we will
# have necessarily fewer extreme values than for the observed realizations of
# that random variable. Additionally, the normal distribution used in ``Ridge``
- # and ``HistGradientBoostingRegressor`` has a constant variance, while for the
- # Poisson distribution used in ``PoissonRegressor``, the variance is
+ # has a constant variance, while for the Poisson distribution used in
+ # ``PoissonRegressor`` and ``HistGradientBoostingRegressor``, the variance is
# proportional to the predicted expected value.
#
- # Thus, among the considered estimators, ``PoissonRegressor`` is a-priori
- # better suited for modeling the long tail distribution of the non-negative
- # data as compared to its ``Ridge`` counter-part.
+ # Thus, among the considered estimators, ``PoissonRegressor`` and
+ # ``HistGradientBoostingRegressor`` are a-priori better suited for modeling
+ # the long tail distribution of the non-negative data as compared to the
+ # ``Ridge`` model which makes a wrong assumption on the distribution of the
+ # target variable.
+ #
+ # The ``HistGradientBoostingRegressor`` estimator has the most flexibility and
+ # is able to predict higher expected values.
#
- # The ``HistGradientBoostingRegressor`` estimator has more flexibility and is
- # able to predict higher expected values while still assuming a normal
- # distribution with constant variance for the response variable.
+ # Note that we could have used the least squares loss for the
+ # ``HistGradientBoostingRegressor`` model. This would wrongly assume a normal
+ # distribution for the response variable as for the ``Ridge`` model and
+ # possibly also lead to slightly negative predictions. However the gradient
+ # boosted trees would still perform relatively well and in particular better
+ # than ``PoissonRegressor`` thanks to the flexibility of the trees combined
+ # with the large number of training samples.
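To make the remark above concrete, the same pipeline can be refit with the default least squares loss and scored with the same helper. A minimal sketch (the ``lsq_gbrt`` name is only introduced here for illustration), assuming the ``tree_preprocessor``, ``df_train``, ``df_test`` and ``score_estimator`` objects defined earlier in the example:

    # Same gradient boosted trees, but with the default least squares loss.
    lsq_gbrt = Pipeline([
        ("preprocessor", tree_preprocessor),
        ("regressor", HistGradientBoostingRegressor(max_leaf_nodes=128)),
    ])
    lsq_gbrt.fit(df_train, df_train["Frequency"],
                 regressor__sample_weight=df_train["Exposure"])
    print("Least squares Gradient Boosted Trees evaluation:")
    score_estimator(lsq_gbrt, df_test)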
#
# Evaluation of the calibration of predictions
# --------------------------------------------
@@ -382,25 +398,25 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
plt.subplots_adjust(wspace=0.3)

- for axi, model in zip(ax.ravel(), [dummy, ridge, poisson, gbrt]):
+ for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt,
+                                    dummy]):
    y_pred = model.predict(df_test)
    y_true = df_test["Frequency"].values
    exposure = df_test["Exposure"].values
    q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group(
        y_true, y_pred, sample_weight=exposure, n_bins=10)

-     # Name of the model after the class of the estimator used in the last step
-     # of the pipeline.
-     model_name = model.steps[-1][1].__class__.__name__
-     print(f"Predicted number of claims by {model_name}: "
+     # Name of the model after the estimator used in the last step of the
+     # pipeline.
+     print(f"Predicted number of claims by {model[-1]}: "
          f"{np.sum(y_pred * exposure):.1f}")

    axi.plot(q, y_pred_seg, marker='x', linestyle="--", label="predictions")
    axi.plot(q, y_true_seg, marker='o', linestyle="--", label="observations")
    axi.set_xlim(0, 1.0)
    axi.set_ylim(0, 0.5)
    axi.set(
-         title=model[-1].__class__.__name__,
+         title=model[-1],
        xlabel='Fraction of samples sorted by y_pred',
        ylabel='Mean Frequency (y_pred)'
    )
@@ -409,8 +425,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,

###############################################################################
# The dummy regression model predicts a constant frequency. This model
- # attributes the same tied rank to all samples but is none-the-less well
- # calibrated.
+ # attributes the same tied rank to all samples but is none-the-less globally
+ # well calibrated (to estimate the mean frequency of the entire population).
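A minimal way to check this global calibration, assuming ``dummy`` is the mean ``DummyRegressor`` pipeline fitted earlier in the example with ``sample_weight=df_train["Exposure"]``:

    # The constant prediction should match the exposure-weighted mean frequency
    # of the training set.
    print(dummy.predict(df_test)[0])
    print(np.average(df_train["Frequency"], weights=df_train["Exposure"]))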
#
# The ``Ridge`` regression model can predict very low expected frequencies that
# do not match the data. It can therefore severely under-estimate the risk for
@@ -460,13 +476,12 @@ def lorenz_curve(y_true, y_pred, exposure):

fig, ax = plt.subplots(figsize=(8, 8))

- for model in [dummy, ridge, poisson, gbrt]:
+ for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]:
    y_pred = model.predict(df_test)
    cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred,
                                            df_test["Exposure"])
    gini = 1 - 2 * auc(cum_exposure, cum_claims)
-     label = "{} (Gini: {:.2f})".format(
-         model[-1].__class__.__name__, gini)
+     label = "{} (Gini: {:.2f})".format(model[-1], gini)
    ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)

# Oracle model: y_pred == y_test
@@ -499,7 +514,8 @@ def lorenz_curve(y_true, y_pred, exposure):
#
# This last point is expected due to the nature of the problem: the occurrence
# of accidents is mostly dominated by circumstantial causes that are not
- # captured in the columns of the dataset or that are indeed random.
+ # captured in the columns of the dataset and can indeed be considered as purely
+ # random.
#
# The linear models assume no interactions between the input variables which
# likely causes under-fitting. Inserting a polynomial feature extractor