@@ -310,18 +310,18 @@ def score_estimator(estimator, df_test):
310
310
# The experimental data presents a long tail distribution for ``y``. In all
311
311
# models, we predict the expected frequency of a random variable, so we will
312
312
# have necessarily fewer extreme values than for the observed realizations of
313
- # that random variable. Additionally, the normal conditional distribution used
314
- # in ``Ridge`` and ``HistGradientBoostingRegressor`` has a constant variance,
315
- # while for the Poisson distribution used in ``PoissonRegressor``, the variance
316
- # is proportional to the predicted expected value.
313
+ # that random variable. Additionally, the normal distribution used in ``Ridge``
314
+ # and ``HistGradientBoostingRegressor`` has a constant variance, while for the
315
+ # Poisson distribution used in ``PoissonRegressor``, the variance is
316
+ # proportional to the predicted expected value.
317
317
#
318
318
# Thus, among the considered estimators, ``PoissonRegressor`` is a-priori
319
319
# better suited for modeling the long tail distribution of the non-negative
320
320
# data as compared to its ``Ridge`` counter-part.
321
321
#
322
322
# The ``HistGradientBoostingRegressor`` estimator has more flexibility and is
323
323
# able to predict higher expected values while still assuming a normal
324
- # conditional distribution with constant variance for the response variable.
324
+ # distribution with constant variance for the response variable.
325
325
#
326
326
# Evaluation of the calibration of predictions
327
327
# --------------------------------------------
@@ -389,6 +389,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
389
389
q , y_true_seg , y_pred_seg = _mean_frequency_by_risk_group (
390
390
y_true , y_pred , sample_weight = exposure , n_bins = 10 )
391
391
392
+ # Name of the model after the class of the estimator used in the last step
393
+ # of the pipeline.
392
394
model_name = model .steps [- 1 ][1 ].__class__ .__name__
393
395
print (f"Predicted number of claims by { model_name } : "
394
396
f"{ np .sum (y_pred * exposure ):.1f} " )
@@ -407,7 +409,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
407
409
408
410
###############################################################################
409
411
# The dummy regression model predicts a constant frequency. This model does not
410
- # discriminative at all but is none-the-less well calibrated.
412
+ # attribute the same tied rank to all samples but is nonetheless well
413
+ # calibrated.
411
414
#
412
415
# The ``Ridge`` regression model can predict very low expected frequencies that
413
416
# do not match the data. It can therefore severely under-estimate the risk for
@@ -422,13 +425,13 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
422
425
# claims in the test set while the other three models can approximately recover
423
426
# the total number of claims of the test portfolio.
424
427
#
425
- # Evaluation of the discriminative power
426
- # --------------------------------------
428
+ # Evaluation of the ranking power
429
+ # -------------------------------
427
430
#
428
431
# For some business applications, we are interested in the ability of the model
429
- # to discriminate the riskiest from the safest policyholders, irrespective of
430
- # the absolute value of the prediction. In this case, the model evaluation
431
- # would cast the problem as a ranking problem rather than a regression problem.
432
+ # to rank the riskiest from the safest policyholders, irrespective of the
433
+ # absolute value of the prediction. In this case, the model evaluation would
434
+ # cast the problem as a ranking problem rather than a regression problem.
432
435
#
433
436
# To compare the 3 models from this perspective, one can plot the fraction of
434
437
# the number of claims vs the fraction of exposure for test samples ordered by
@@ -485,8 +488,8 @@ def lorenz_curve(y_true, y_pred, exposure):
485
488
ax .legend (loc = "upper left" )
486
489
487
490
##############################################################################
488
- # As expected, the dummy regressor is unable to discriminate and therefore
489
- # performs the worst on this plot.
491
+ # As expected, the dummy regressor is unable to correctly rank the samples and
492
+ # therefore performs the worst on this plot.
490
493
#
491
494
# The tree-based model is significantly better at ranking policyholders by risk
492
495
# while the two linear models perform similarly.
@@ -507,11 +510,12 @@ def lorenz_curve(y_true, y_pred, exposure):
507
510
# Main takeaways
508
511
# --------------
509
512
#
510
- # - A ideal model is both well-calibrated and discriminative.
513
+ # - The performance of the models can be evaluated by their ability to yield
514
+ # well-calibrated predictions and a good ranking.
511
515
#
512
516
# - The Gini index reflects the ability of a model to rank predictions
513
517
# irrespective of their absolute values, and therefore only assesses their
514
- # discriminative power.
518
+ # ranking power.
515
519
#
516
520
# - The calibration of the model can be assessed by plotting the mean observed
517
521
# value vs the mean predicted value on groups of test samples binned by
@@ -524,16 +528,16 @@ def lorenz_curve(y_true, y_pred, exposure):
524
528
# - Using the Poisson loss can correct this problem and lead to a
525
529
# well-calibrated linear model.
526
530
#
527
- # - Despite the improvement in calibration, the discriminative power of both
528
- # linear models are comparable and well below the discriminative power of the
529
- # Gradient Boosting Regression Trees.
531
+ # - Despite the improvement in calibration, the ranking power of both linear
532
+ # models is comparable and well below the ranking power of the Gradient
533
+ # Boosting Regression Trees.
530
534
#
531
535
# - The non-linear Gradient Boosting Regression Trees model does not seem to
532
536
# suffer from significant mis-calibration issues (despite the use of a least
533
537
# squares loss).
534
538
#
535
539
# - The Poisson deviance computed as an evaluation metric reflects both the
536
- # calibration and the discriminative power of the model but makes a linear
540
+ # calibration and the ranking power of the model but makes a linear
537
541
# assumption on the ideal relationship between the expected value and the
538
542
# variance of the response variable.
539
543
#
0 commit comments