From a7fc0ac77842a6d4ecc5a946380924a785425fed Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 5 Mar 2020 19:15:33 +0100 Subject: [PATCH 01/22] Improve claim prediction example --- examples/.flake8 | 2 +- ...plot_poisson_regression_non_normal_loss.py | 425 ++++++++++-------- 2 files changed, 251 insertions(+), 176 deletions(-) diff --git a/examples/.flake8 b/examples/.flake8 index 703bf15e79bff..ee5b4071c47ac 100644 --- a/examples/.flake8 +++ b/examples/.flake8 @@ -2,4 +2,4 @@ [flake8] # Same ignore as project-wide plus E402 (imports not at top of file) -ignore=E121,E123,E126,E24,E226,E704,W503,W504,E402 +ignore=E121,E123,E126,E24,E226,E704,W503,W504,E402,F401 diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4b0386edfcdf6..e771b8dd4aad1 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,87 +3,80 @@ Poisson regression and non-normal loss ====================================== -This example illustrates the use of log-linear Poisson regression -on the `French Motor Third-Party Liability Claims dataset -`_ from [1]_ and compares -it with models learned with least squared error. In this dataset, each sample -corresponds to an insurance policy, i.e. a contract within an insurance -company and an individual (policiholder). Available features include driver -age, vehicle age, vehicle power, etc. - -A few definitions: a *claim* is the request made by a policyholder to the -insurer to compensate for a loss covered by the insurance. The *exposure* is -the duration of the insurance coverage of a given policy, in years. +This example illustrates the use of log-linear Poisson regression on the +`French Motor Third-Party Liability Claims dataset +`_ from [1]_ and compares it with models +learned with least squared error. In this dataset, each sample corresponds to +an insurance policy, i.e. a contract within an insurance company and an +individual (policiholder). Available features include driver age, vehicle age, +vehicle power, etc. + +A few definitions: + +- a **claim** is the request made by a policyholder to the insurer to + compensate for a loss covered by the insurance. + +- the **exposure** is the duration of the insurance coverage of a given policy, + in years. Our goal is to predict the expected number of insurance claims (or frequency) following car accidents for a policyholder given the historical data over a population of policyholders. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor - Third-Party Liability Claims (November 8, 2018). - `doi:10.2139/ssrn.3164764 `_ + Third-Party Liability Claims (November 8, 2018). 
`doi:10.2139/ssrn.3164764 + `_ """ print(__doc__) - # Authors: Christian Lorentzen # Roman Yurchak +# Olivier Grisel # License: BSD 3 clause import warnings - import numpy as np import matplotlib.pyplot as plt import pandas as pd -from sklearn.datasets import fetch_openml -from sklearn.dummy import DummyRegressor -from sklearn.compose import ColumnTransformer -from sklearn.linear_model import Ridge, PoissonRegressor -from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import FunctionTransformer, OneHotEncoder -from sklearn.preprocessing import OrdinalEncoder -from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.ensemble import RandomForestRegressor -from sklearn.utils import gen_even_slices -from sklearn.metrics import auc - -from sklearn.metrics import mean_squared_error, mean_absolute_error -from sklearn.metrics import mean_poisson_deviance - -def load_mtpl2(n_samples=100000): - """Fetch the French Motor Third-Party Liability Claims dataset. - - Parameters - ---------- - n_samples: int or None, default=100000 - Number of samples to select (for faster run time). If None, the full - dataset with 678013 samples is returned. - """ +############################################################################## +# The French Motor Third-Party Liability Claims dataset +# ----------------------------------------------------- +# +# Let's load the motor claim dataset. We ignore the severity data for this +# study for the sake of simplicitly: - # freMTPL2freq dataset from https://www.openml.org/d/41214 - df = fetch_openml(data_id=41214, as_frame=True)['data'] +from sklearn.datasets import fetch_openml - # unquote string fields - for column_name in df.columns[df.dtypes.values == np.object]: - df[column_name] = df[column_name].str.strip("'") - if n_samples is not None: - return df.iloc[:n_samples] - return df +df = fetch_openml(data_id=41214, as_frame=True).frame +df ############################################################################## -# Let's load the motor claim dataset. We ignore the severity data for this -# study for the sake of simplicitly. +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval (``Exposure``, +# in units of years). # -# We also subsample the data for the sake of computational cost and running -# time. Using the full dataset would lead to similar conclusions. +# Here we want to model the frequency ``y = ClaimNb / Exposure`` via a (scaled) +# conditional Poisson distribution, and use ``Exposure`` as ``sample_weight``. 
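As a minimal sketch of this frequency/exposure weighting pattern (hypothetical toy data and variable names, not part of the patch itself):

import numpy as np
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(0)
X_toy = rng.normal(size=(1000, 3))
exposure_toy = rng.uniform(0.1, 1.0, size=1000)
# Counts are Poisson distributed with a rate that scales with the exposure:
counts_toy = rng.poisson(exposure_toy * np.exp(0.3 * X_toy[:, 0]))
freq_toy = counts_toy / exposure_toy
# Model the frequency, passing the exposure as the sample weight:
glm_toy = PoissonRegressor().fit(X_toy, freq_toy, sample_weight=exposure_toy)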
-df = load_mtpl2(n_samples=300000) +df["Frequency"] = df["ClaimNb"] / df["Exposure"] -# Correct for unreasonable observations (that might be data error) -df["Exposure"] = df["Exposure"].clip(upper=1) +print("Average Frequency = {}" + .format(np.average(df["Frequency"], weights=df["Exposure"]))) + +print("Fraction of exposure with zero claims = {0:.1%}" + .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / + df["Exposure"].sum())) + +fig, (ax0, ax1, ax2) = plt.subplots(ncols=3, figsize=(16, 4)) +ax0.set_title("Number of claims") +_ = df["ClaimNb"].hist(bins=30, log=True, ax=ax0) +ax1.set_title("Exposure in years") +_ = df["Exposure"].hist(bins=30, log=True, ax=ax1) +ax2.set_title("Frequency (number of claims per year)") +_ = df["Frequency"].hist(bins=30, log=True, ax=ax2) ############################################################################## # The remaining columns can be used to predict the frequency of claim events. @@ -93,6 +86,12 @@ def load_mtpl2(n_samples=100000): # In order to fit linear models with those predictors it is therefore # necessary to perform standard feature transformations as follows: +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.compose import ColumnTransformer + + log_scale_transformer = make_pipeline( FunctionTransformer(np.log, validate=False), StandardScaler() @@ -112,29 +111,12 @@ def load_mtpl2(n_samples=100000): remainder="drop", ) -############################################################################## -# The number of claims (``ClaimNb``) is a positive integer that can be modeled -# as a Poisson distribution. It is then assumed to be the number of discrete -# events occurring with a constant rate in a given time interval -# (``Exposure``, in units of years). Here we model the frequency -# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, -# and use ``Exposure`` as ``sample_weight``. - -df["Frequency"] = df["ClaimNb"] / df["Exposure"] - -print( - pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() -) - -print("Average Frequency = {}" - .format(np.average(df["Frequency"], weights=df["Exposure"]))) - -print("Percentage of zero claims = {0:%}" - .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / - df["Exposure"].sum())) ############################################################################## -# It is worth noting that 92 % of policyholders have zero claims, and if we +# A constant prediction baseline +# ------------------------------ +# +# It is worth noting that 93% of policyholders have zero claims, and if we # were to convert this problem into a binary classification task, it would be # significantly imbalanced. # @@ -142,19 +124,31 @@ def load_mtpl2(n_samples=100000): # baseline a "dummy" estimator that constantly predicts the mean frequency of # the training sample. 
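A quick sanity check of what this baseline computes (toy values, not the dataset): with ``strategy="mean"`` and sample weights, ``DummyRegressor`` should reduce to the weighted mean of the training targets:

import numpy as np
from sklearn.dummy import DummyRegressor

y_toy = np.array([0.0, 0.0, 1.0, 2.0])
w_toy = np.array([0.5, 1.0, 1.0, 0.25])
dummy_toy = DummyRegressor(strategy="mean").fit(
    np.zeros((4, 1)), y_toy, sample_weight=w_toy)
# The constant prediction equals the weighted average of the targets:
assert np.isclose(dummy_toy.predict(np.zeros((1, 1)))[0],
                  np.average(y_toy, weights=w_toy))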
-df_train, df_test = train_test_split(df, random_state=0) +from sklearn.dummy import DummyRegressor +from sklearn.pipeline import Pipeline +from sklearn.model_selection import train_test_split -dummy = make_pipeline( - linear_model_preprocessor, - DummyRegressor(strategy='mean') -) +df_train, df_test = train_test_split(df, test_size=0.33, random_state=0) + +dummy = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", DummyRegressor(strategy='mean')), +]) dummy.fit(df_train, df_train["Frequency"], - dummyregressor__sample_weight=df_train["Exposure"]) + regressor__sample_weight=df_train["Exposure"]) + + +############################################################################## +# Let's compute the performance of this constant prediction baseline with 3 +# different regression metrics: + +from sklearn.metrics import mean_squared_error +from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_poisson_deviance def score_estimator(estimator, df_test): """Score an estimator on the test set.""" - y_pred = estimator.predict(df_test) print("MSE: %.3f" % @@ -183,12 +177,20 @@ def score_estimator(estimator, df_test): score_estimator(dummy, df_test) ############################################################################## -# We start by modeling the target variable with the least squares linear -# regression model, +# Linear models +# ------------- +# +# We start by modeling the target variable with the (penalized) least squares +# linear regression model: -ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0)) -ridge.fit(df_train, df_train["Frequency"], - ridge__sample_weight=df_train["Exposure"]) +from sklearn.linear_model import Ridge + + +ridge = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", Ridge(alpha=1e-6)), +]).fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) ############################################################################## # The Poisson deviance cannot be computed on non-positive values predicted by @@ -207,28 +209,37 @@ def score_estimator(estimator, df_test): # mimic the Ridge regressor whose L2 penalty term scales differently with the # number of samples. -poisson = make_pipeline( - linear_model_preprocessor, - PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) -) +from sklearn.linear_model import PoissonRegressor + + +poisson = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", PoissonRegressor(alpha=1e-6, max_iter=1000)) +]) poisson.fit(df_train, df_train["Frequency"], - poissonregressor__sample_weight=df_train["Exposure"]) + regressor__sample_weight=df_train["Exposure"]) print("PoissonRegressor evaluation:") score_estimator(poisson, df_test) ############################################################################## -# Finally, we will consider a non-linear model, namely a random forest. Random -# forests do not require the categorical data to be one-hot encoded: instead, -# we can encode each category label with an arbitrary integer using -# :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will -# treat the categorical features as ordered features, which might not be always -# a desired behavior. However this effect is limited for deep enough trees -# which are able to recover the categorical nature of the features. The main -# advantage of the :class:`preprocessing.OrdinalEncoder` over the -# :class:`preprocessing.OneHotEncoder` is that it will make training faster. 
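The contrast between the two encoders discussed above can be sketched on a toy categorical column (illustrative values only):

import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

brands_toy = np.array([["B1"], ["B12"], ["B3"], ["B1"]])
# OrdinalEncoder: a single integer column, implying an arbitrary ordering:
print(OrdinalEncoder().fit_transform(brands_toy).ravel())
# OneHotEncoder: one column per category, order-free but wider:
print(OneHotEncoder().fit_transform(brands_toy).toarray())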
- -rf_preprocessor = ColumnTransformer( +# Finally, we will consider a non-linear model, namely Gradient Boosting +# Regression Trees. Tree-based models do not require the categorical data to be +# one-hot encoded: instead, we can encode each category label with an arbitrary +# integer using :class:`preprocessing.OrdinalEncoder`. With this encoding, the +# the trees will treat the categorical features as ordered features, which +# might not be always a desired behavior. However this effect is limited for +# deep enough trees which are able to recover the categorical nature of the +# features. The main advantage of the :class:`preprocessing.OrdinalEncoder` +# over the :class:`preprocessing.OneHotEncoder` is that it will make training +# faster. + +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.preprocessing import OrdinalEncoder + + +tree_preprocessor = ColumnTransformer( [ ("categorical", OrdinalEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), @@ -237,22 +248,21 @@ def score_estimator(estimator, df_test): ], remainder="drop", ) -rf = make_pipeline( - rf_preprocessor, - RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) -) -rf.fit(df_train, df_train["Frequency"].values, - randomforestregressor__sample_weight=df_train["Exposure"].values) - +gbrt = Pipeline([ + ("preprocessor", tree_preprocessor), + ("regressor", HistGradientBoostingRegressor(max_leaf_nodes=128)), +]) +gbrt.fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) -print("RandomForestRegressor evaluation:") -score_estimator(rf, df_test) +print("Gradient Boosted Trees evaluation:") +score_estimator(gbrt, df_test) ############################################################################## -# Like the Ridge regression above, the random forest model minimizes the -# conditional squared error, too. However, because of a higher predictive -# power, it also results in a smaller Poisson deviance than the Poisson +# Like the Ridge regression above, the gradient boosted trees model minimizes +# the conditional squared error. However, because of a higher predictive power, +# it also results in a smaller Poisson deviance than the linear Poisson # regression model. # # Evaluating models with a single train / test split is prone to random @@ -261,9 +271,9 @@ def score_estimator(estimator, df_test): # # The qualitative difference between these models can also be visualized by # comparing the histogram of observed target values with that of predicted -# values: +# values. -fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True) +fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 for row_idx, label, df in zip(range(2), @@ -278,7 +288,7 @@ def score_estimator(estimator, df_test): axes[row_idx, 0].set_ylim([1e1, 5e5]) axes[row_idx, 0].set_ylabel(label + " samples") - for idx, model in enumerate([ridge, poisson, rf]): + for idx, model in enumerate([ridge, poisson, gbrt]): y_pred = model.predict(df) pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), @@ -292,21 +302,30 @@ def score_estimator(estimator, df_test): ############################################################################## # The experimental data presents a long tail distribution for ``y``. In all -# models we predict a mean expected value, so we will have necessarily fewer -# extreme values. 
Additionally, the normal distribution used in ``Ridge`` and -# ``RandomForestRegressor`` has a constant variance, while for the Poisson -# distribution used in ``PoissonRegressor``, the variance is proportional to -# the mean predicted value. +# models, we predict a mean expected value, so we will have necessarily fewer +# extreme values. Additionally, the normal conditional distribution used in +# ``Ridge`` and ``HistGradientBoostingRegressor`` has a constant variance, +# while for the Poisson distribution used in ``PoissonRegressor``, the variance +# is proportional to the mean predicted value. +# +# Thus, among the considered estimators, ``PoissonRegressor`` is a-priori +# better suited for modeling the long tail distribution of the data as compared +# to its ``Ridge`` counter-part. +# +# The ``HistGradientBoostingRegressor`` estimator has more flexibility and is +# able to predict higher mean predicted values while still minimizing a least +# squares loss. # -# Thus, among the considered estimators, ``PoissonRegressor`` is better suited -# for modeling the long tail distribution of the data as compared to the -# ``Ridge`` and ``RandomForestRegressor`` estimators. +# Evaluation of the calibration of predictions +# -------------------------------------------- # # To ensure that estimators yield reasonable predictions for different # policyholder types, we can bin test samples according to ``y_pred`` returned # by each model. Then for each bin, we compare the mean predicted ``y_pred``, # with the mean observed target: +from sklearn.utils import gen_even_slices + def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): @@ -352,22 +371,25 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) +print(f"Actual number of claims: {df_test['ClaimNb'].sum()}") +fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8)) plt.subplots_adjust(wspace=0.3) -for axi, model in zip(ax, [ridge, poisson, rf]): +for axi, model in zip(ax.ravel(), [dummy, ridge, poisson, gbrt]): y_pred = model.predict(df_test) - + y_true = df_test["Frequency"].values + exposure = df_test["Exposure"].values q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( - df_test["Frequency"].values, - y_pred, - sample_weight=df_test["Exposure"].values, - n_bins=10) + y_true, y_pred, sample_weight=exposure, n_bins=10) + + model_name = model.steps[-1][1].__class__.__name__ + print(f"Predicted number of claims by {model_name}: " + f"{np.sum(y_pred * exposure):.1f}") - axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") - axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") + axi.plot(q, y_pred_seg, marker='x', linestyle="--", label="predictions") + axi.plot(q, y_true_seg, marker='o', linestyle="--", label="observations") axi.set_xlim(0, 1.0) - axi.set_ylim(0, 0.6) + axi.set_ylim(0, 0.5) axi.set( title=model[-1].__class__.__name__, xlabel='Fraction of samples sorted by y_pred', @@ -376,57 +398,73 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, axi.legend() plt.tight_layout() -############################################################################## -# The ``Ridge`` regression model can predict very low expected frequencies -# that do not match the data. It can therefore severly under-estimate the risk -# for some policyholders. 
+############################################################################### +# The dummy regression model predicts on constant frequency. This model is not +# discriminative at all but is none-the-less well calibrated. # -# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency -# between predicted and observed targets, especially for low predicted target -# values. +# The ``Ridge`` regression model can predict very low expected frequencies that +# do not match the data. It can therefore severly under-estimate the risk for +# some policyholders. +# +# ``PoissonRegressor`` and ``HistGradientBoostingRegressor`` show better +# consistency between predicted and observed targets, especially for low +# predicted target values. # -# However, for some business applications, we are not necessarily interested -# in the ability of the model to predict the expected frequency value, but -# instead to predict which policyholder groups are the riskiest and which are -# the safest. In this case, the model evaluation would cast the problem as a -# ranking problem rather than a regression problem. +# The sum of all predictions also confirms the calibration issue of the +# ``Ridge`` model that on average tend to under-estimate by more than 3% the +# total number of claims in the test set while the other three models can +# approximately recover the total number of claims of the test portfolio. +# +# Evaluation of the discriminative power +# -------------------------------------- +# +# For some business applications, we are interested in the ability of the model +# to discriminate the riskiest from the safest policyholders, irrespective of +# the absolute value of the prediction. In this case, the model evaluation +# would cast the problem as a ranking problem rather than a regression problem. # # To compare the 3 models within this perspective, one can plot the fraction of # the number of claims vs the fraction of exposure for test samples ordered by -# the model predictions, from safest to riskiest according to each model: +# the model predictions, from safest to riskiest according to each model. 
+# +# This plot is called a Lorenz curve and can be summarized by the Gini index: +from sklearn.metrics import auc -def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred) # from safest to riskiest - sorted_exposure = exposure[idx_sort] - sorted_frequencies = y_true[idx_sort] - cumulated_exposure = np.cumsum(sorted_exposure) - cumulated_exposure /= cumulated_exposure[-1] - cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) + +def lorenz_curve(y_true, y_pred, exposure): + y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) + exposure = np.asarray(exposure) + + # order samples by increasing predicted risk: + ranking = np.argsort(y_pred) + ranked_exposure = exposure[ranking] + ranked_frequencies = y_true[ranking] + ranked_exposure = exposure[ranking] + cumulated_claims = np.cumsum(ranked_frequencies * ranked_exposure) cumulated_claims /= cumulated_claims[-1] + cumulated_exposure = np.cumsum(ranked_exposure) + cumulated_exposure /= cumulated_exposure[-1] return cumulated_exposure, cumulated_claims fig, ax = plt.subplots(figsize=(8, 8)) -for model in [ridge, poisson, rf]: +for model in [dummy, ridge, poisson, gbrt]: y_pred = model.predict(df_test) - cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - y_pred, - df_test["Exposure"].values) - area = auc(cum_exposure, cum_claims) - label = "{} (area under curve: {:.3f})".format( - model[-1].__class__.__name__, area) + cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred, + df_test["Exposure"]) + gini = 1 - 2 * auc(cum_exposure, cum_claims) + label = "{} (Gini: {:.2f})".format( + model[-1].__class__.__name__, gini) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - df_test["Frequency"].values, - df_test["Exposure"].values) -area = auc(cum_exposure, cum_claims) -label = "Oracle (area under curve: {:.3f})".format(area) +cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], + df_test["Frequency"], + df_test["Exposure"]) +gini = 1 - 2 * auc(cum_exposure, cum_claims) +label = "Oracle (Gini: {:.2f})".format(gini) ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) # Random Baseline @@ -440,10 +478,11 @@ def _cumulated_claims(y_true, y_pred, exposure): ax.legend(loc="upper left") ############################################################################## -# This plot reveals that the random forest model is slightly better at ranking -# policyholders by risk profiles even if the absolute value of the predicted -# expected frequencies are less well calibrated than for the linear Poisson -# model. +# As expected, the dummy regressor is unable to discriminate and therefore +# performs the worst on this plot. +# +# The tree-based model is significantly better at ranking policyholders by risk +# while the two linear models perform similarly. # # All three models are significantly better than chance but also very far from # making perfect predictions. @@ -451,5 +490,41 @@ def _cumulated_claims(y_true, y_pred, exposure): # This last point is expected due to the nature of the problem: the occurrence # of accidents is mostly dominated by circumstantial causes that are not # captured in the columns of the dataset or that are indeed random. +# +# Main takeaways +# -------------- +# +# - A ideal model is both well-calibrated and discriminative. 
+# +# - The Gini index reflects the ability of a model to rank predictions +# irrespective of their absolute values, and therefore only assess their +# discriminative power. +# +# - The calibration of the model can be assessed by plotting the mean observed +# value vs the mean predicted value on groups of test samples binned by +# predicted risk. +# +# - The least squares loss of the Ridge regression model seem to cause this +# model to be badly calibrated. In particular it tends to under estimate the +# risk and can even predict invalid negative frequencies... +# +# - Using the Poisson loss can correct this problem and lead to a +# well-calibrated linear model. +# +# - Despite the improvement in calibration, the discriminative power of both +# linear models are comparable and well below the discriminative power of the +# Gradient Boosting Regression Trees. +# +# - The non-linear Gradient Boosting Regression Trees model does not seem to +# suffer from significant mis-calibration issues (despite the use of a least +# squares loss). +# +# - The Poisson deviance computed as an evaluation metric reflects both the +# calibration and the discriminative power of the model but makes a linear +# assumption on the ideal relationship between the expected value of an the +# variance of the response variable. +# +# - Traditional regression metrics such as Mean Squared Error and Mean Absolute +# Error are hard to meaningfully interpret on count values. plt.show() From 0f67587ec125d5dbea6904734485eefe4a18d487 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 6 Mar 2020 09:27:59 +0100 Subject: [PATCH 02/22] Various cosmetic improvements --- ...plot_poisson_regression_non_normal_loss.py | 75 ++++++++++--------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index e771b8dd4aad1..6400180e96c2f 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -6,22 +6,28 @@ This example illustrates the use of log-linear Poisson regression on the `French Motor Third-Party Liability Claims dataset `_ from [1]_ and compares it with models -learned with least squared error. In this dataset, each sample corresponds to -an insurance policy, i.e. a contract within an insurance company and an -individual (policiholder). Available features include driver age, vehicle age, -vehicle power, etc. +learned with least squared error. A few definitions: +- a **policy** is a contract between an insurance company and an individual: + the **policyholder**, that is, the vehicle driver in this case. + - a **claim** is the request made by a policyholder to the insurer to compensate for a loss covered by the insurance. - the **exposure** is the duration of the insurance coverage of a given policy, in years. -Our goal is to predict the expected number of insurance claims (or frequency) -following car accidents for a policyholder given the historical data over a -population of policyholders. +- the claim **frequency** is the number of claims divided by the **exposure**, + typically measured in number of claims per year. + +In this dataset, each sample corresponds to an insurance policy. Available +features include driver age, vehicle age, vehicle power, etc. 
+ +Our goal is to predict the expected frequency of claims following car accidents +for a new policyholder given the historical data over a population of +policyholders. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor Third-Party Liability Claims (November 8, 2018). `doi:10.2139/ssrn.3164764 @@ -116,9 +122,9 @@ # A constant prediction baseline # ------------------------------ # -# It is worth noting that 93% of policyholders have zero claims, and if we -# were to convert this problem into a binary classification task, it would be -# significantly imbalanced. +# It is worth noting that more than 93% of policyholders have zero claims. If +# we were to convert this problem into a binary classification task, it would +# be significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a # baseline a "dummy" estimator that constantly predicts the mean frequency of @@ -133,9 +139,8 @@ dummy = Pipeline([ ("preprocessor", linear_model_preprocessor), ("regressor", DummyRegressor(strategy='mean')), -]) -dummy.fit(df_train, df_train["Frequency"], - regressor__sample_weight=df_train["Exposure"]) +]).fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) ############################################################################## @@ -163,14 +168,13 @@ def score_estimator(estimator, df_test): mask = y_pred > 0 if (~mask).any(): warnings.warn("Estimator yields non-positive predictions for {} " - "samples out of {}. These will be ignored while " - "computing the Poisson deviance" + "samples out of {}. These predictions will be clipped " + " while computing the Poisson deviance" .format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % - mean_poisson_deviance(df_test["Frequency"][mask], - y_pred[mask], - df_test["Exposure"][mask])) + mean_poisson_deviance(df_test["Frequency"], y_pred.clip(min=1e-6), + df_test["Exposure"])) print("Constant mean frequency evaluation:") @@ -181,7 +185,8 @@ def score_estimator(estimator, df_test): # ------------- # # We start by modeling the target variable with the (penalized) least squares -# linear regression model: +# linear regression model. We use a low penalization as we expect such a linear +# model to under-fit on such a large dataset. from sklearn.linear_model import Ridge @@ -211,10 +216,11 @@ def score_estimator(estimator, df_test): from sklearn.linear_model import PoissonRegressor +n_samples = df_train.shape[0] poisson = Pipeline([ ("preprocessor", linear_model_preprocessor), - ("regressor", PoissonRegressor(alpha=1e-6, max_iter=1000)) + ("regressor", PoissonRegressor(alpha=1e-6/n_samples, max_iter=1000)) ]) poisson.fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]) @@ -271,7 +277,7 @@ def score_estimator(estimator, df_test): # # The qualitative difference between these models can also be visualized by # comparing the histogram of observed target values with that of predicted -# values. +# values: fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6), sharey=True) fig.subplots_adjust(bottom=0.2) @@ -302,19 +308,20 @@ def score_estimator(estimator, df_test): ############################################################################## # The experimental data presents a long tail distribution for ``y``. In all -# models, we predict a mean expected value, so we will have necessarily fewer -# extreme values. 
Additionally, the normal conditional distribution used in -# ``Ridge`` and ``HistGradientBoostingRegressor`` has a constant variance, +# models, we predict the expected frequency of a random variable, so we will +# have necessarily fewer extreme values than for the observed realizations of +# that random variable. Additionally, the normal conditional distribution used +# in ``Ridge`` and ``HistGradientBoostingRegressor`` has a constant variance, # while for the Poisson distribution used in ``PoissonRegressor``, the variance -# is proportional to the mean predicted value. +# is proportional to the predicted expected value. # # Thus, among the considered estimators, ``PoissonRegressor`` is a-priori -# better suited for modeling the long tail distribution of the data as compared -# to its ``Ridge`` counter-part. +# better suited for modeling the long tail distribution of the non-negative +# data as compared to its ``Ridge`` counter-part. # # The ``HistGradientBoostingRegressor`` estimator has more flexibility and is -# able to predict higher mean predicted values while still minimizing a least -# squares loss. +# able to predict higher expected values while still assuming a normal +# conditional distribution with constant variance for the response variable. # # Evaluation of the calibration of predictions # -------------------------------------------- @@ -411,9 +418,9 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # predicted target values. # # The sum of all predictions also confirms the calibration issue of the -# ``Ridge`` model that on average tend to under-estimate by more than 3% the -# total number of claims in the test set while the other three models can -# approximately recover the total number of claims of the test portfolio. +# ``Ridge`` model: it under-estimates by more than 3% the total number of +# claims in the test set while the other three models can approximately recover +# the total number of claims of the test portfolio. # # Evaluation of the discriminative power # -------------------------------------- @@ -423,7 +430,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # the absolute value of the prediction. In this case, the model evaluation # would cast the problem as a ranking problem rather than a regression problem. # -# To compare the 3 models within this perspective, one can plot the fraction of +# To compare the 3 models from this perspective, one can plot the fraction of # the number of claims vs the fraction of exposure for test samples ordered by # the model predictions, from safest to riskiest according to each model. # @@ -525,6 +532,6 @@ def lorenz_curve(y_true, y_pred, exposure): # variance of the response variable. # # - Traditional regression metrics such as Mean Squared Error and Mean Absolute -# Error are hard to meaningfully interpret on count values. +# Error are hard to meaningfully interpret on count values with many zeros. 
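The last takeaway can be made concrete with a small sketch (synthetic numbers): on a target with 95% zeros, the constant prediction that minimizes MAE is the median, i.e. zero, so a model can look better on MAE simply by predicting that no claims ever happen:

import numpy as np
from sklearn.metrics import mean_absolute_error

y_toy = np.concatenate([np.zeros(95), np.full(5, 2.0)])
print(mean_absolute_error(y_toy, np.zeros_like(y_toy)))               # 0.10
print(mean_absolute_error(y_toy, np.full_like(y_toy, y_toy.mean())))  # 0.19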
plt.show() From db81988a41d4a98679f3077368eaf09c655cbd18 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 6 Mar 2020 09:33:55 +0100 Subject: [PATCH 03/22] Note on poor performance of linear models --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6400180e96c2f..c21f6b89dd3e9 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -489,7 +489,8 @@ def lorenz_curve(y_true, y_pred, exposure): # performs the worst on this plot. # # The tree-based model is significantly better at ranking policyholders by risk -# while the two linear models perform similarly. +# while the two linear models perform similarly. The linear models assume no +# interactions between the input variables which likely causes under-fitting. # # All three models are significantly better than chance but also very far from # making perfect predictions. From 65a314b6acbb25f32979ae120de3813f1cf03624 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 6 Mar 2020 09:54:41 +0100 Subject: [PATCH 04/22] Rephrase/improve linear model discriminative perf analysis --- .../plot_poisson_regression_non_normal_loss.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index c21f6b89dd3e9..53b726783846c 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -489,8 +489,7 @@ def lorenz_curve(y_true, y_pred, exposure): # performs the worst on this plot. # # The tree-based model is significantly better at ranking policyholders by risk -# while the two linear models perform similarly. The linear models assume no -# interactions between the input variables which likely causes under-fitting. +# while the two linear models perform similarly. # # All three models are significantly better than chance but also very far from # making perfect predictions. @@ -499,6 +498,12 @@ def lorenz_curve(y_true, y_pred, exposure): # of accidents is mostly dominated by circumstantial causes that are not # captured in the columns of the dataset or that are indeed random. # +# The linear models assume no interactions between the input variables which +# likely causes under-fitting. Inserting a polynomial feature extractor +# (func:`sklearn.preprocessing.PolynomialFeatures`) indeed increases their +# discrimative power by 2 points of Gini index. In particular it improves the +# ability of the models to identify the top 5% riskiest profiles. 
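A sketch of how such a variant could be wired up (not part of this patch series; the step names are assumptions, and expanding one-hot encoded features this way can be memory intensive):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import PoissonRegressor

# Reuses the linear_model_preprocessor defined earlier in the example:
poisson_interactions = Pipeline([
    ("preprocessor", linear_model_preprocessor),
    ("interactions", PolynomialFeatures(degree=2, interaction_only=True,
                                        include_bias=False)),
    ("regressor", PoissonRegressor(alpha=1e-6, max_iter=1000)),
])
poisson_interactions.fit(df_train, df_train["Frequency"],
                         regressor__sample_weight=df_train["Exposure"])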
+# # Main takeaways # -------------- # From 02272607420df198bc8e6719bb8f26d8cf855290 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 6 Mar 2020 11:41:38 +0100 Subject: [PATCH 05/22] cosmetics [doc skip] [ci skip] --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 53b726783846c..75493070a9480 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -19,7 +19,7 @@ - the **exposure** is the duration of the insurance coverage of a given policy, in years. -- the claim **frequency** is the number of claims divided by the **exposure**, +- the claim **frequency** is the number of claims divided by the exposure, typically measured in number of claims per year. In this dataset, each sample corresponds to an insurance policy. Available From 01ec0e8b0e741e88803ece0452368f5e8c960df3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Apr 2020 17:54:36 +0200 Subject: [PATCH 06/22] Revert change in example/.flake8 --- examples/.flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/.flake8 b/examples/.flake8 index ee5b4071c47ac..703bf15e79bff 100644 --- a/examples/.flake8 +++ b/examples/.flake8 @@ -2,4 +2,4 @@ [flake8] # Same ignore as project-wide plus E402 (imports not at top of file) -ignore=E121,E123,E126,E24,E226,E704,W503,W504,E402,F401 +ignore=E121,E123,E126,E24,E226,E704,W503,W504,E402 From b64ee3f449006a46c76e401a5a07c0e355532923 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Apr 2020 18:01:11 +0200 Subject: [PATCH 07/22] Phrasing: model y conditionally on X --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 75493070a9480..a6d4b23bf9372 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -64,8 +64,9 @@ # events occurring with a constant rate in a given time interval (``Exposure``, # in units of years). # -# Here we want to model the frequency ``y = ClaimNb / Exposure`` via a (scaled) -# conditional Poisson distribution, and use ``Exposure`` as ``sample_weight``. +# Here we want to model the frequency ``y = ClaimNb / Exposure`` conditionally +# on ``X`` via a (scaled) Poisson distribution, and use ``Exposure`` as +# ``sample_weight``. 
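For reference, the reason this weighting is sound (a standard GLM identity, not spelled out in the patch): with counts ``N_i``, exposures ``e_i``, frequencies ``y_i = N_i / e_i`` and predicted means ``mu_i``, the exposure-weighted Poisson log-likelihood of the frequencies equals, up to a term that does not depend on the model, the log-likelihood of the counts with rate ``e_i * mu_i``:

.. math::

    \sum_i e_i \left( y_i \log \mu_i - \mu_i \right)
        = \sum_i \left( N_i \log(e_i \mu_i) - e_i \mu_i \right)
          - \sum_i N_i \log e_i

so fitting the frequency with ``sample_weight=Exposure`` is equivalent to fitting the claim counts with a log-exposure offset.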
df["Frequency"] = df["ClaimNb"] / df["Exposure"] From 449042480ed2551e0ed8f61939b15dc464f295a0 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Apr 2020 18:06:12 +0200 Subject: [PATCH 08/22] Remove warning (use print instead) --- .../plot_poisson_regression_non_normal_loss.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index a6d4b23bf9372..28e2d635b71b1 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -39,7 +39,6 @@ # Roman Yurchak # Olivier Grisel # License: BSD 3 clause -import warnings import numpy as np import matplotlib.pyplot as plt import pandas as pd @@ -168,10 +167,10 @@ def score_estimator(estimator, df_test): # the Poisson deviance mask = y_pred > 0 if (~mask).any(): - warnings.warn("Estimator yields non-positive predictions for {} " - "samples out of {}. These predictions will be clipped " - " while computing the Poisson deviance" - .format((~mask).sum(), mask.shape[0])) + n_clipped, n_samples = (~mask).sum(), mask.shape[0] + print(f"WARNING: Estimator yields non-positive predictions for " + f"{n_clipped} samples out of {n_samples}. These predictions " + f"will be clipped while computing the Poisson deviance.") print("mean Poisson deviance: %.3f" % mean_poisson_deviance(df_test["Frequency"], y_pred.clip(min=1e-6), From fe4cc222b68cd9f0d7cc56751f6421d6805b0517 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Apr 2020 18:07:14 +0200 Subject: [PATCH 09/22] Disable F401 for enable_hist_gradient_boosting explicitly --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 28e2d635b71b1..e43c1deab54c6 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -240,7 +240,7 @@ def score_estimator(estimator, df_test): # over the :class:`preprocessing.OneHotEncoder` is that it will make training # faster. -from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.preprocessing import OrdinalEncoder From f6cd00dc233c441ac5587f794757aace320fc878 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Apr 2020 18:27:56 +0200 Subject: [PATCH 10/22] Do not use the term discriminative power which is overloaded/ambiguous --- ...plot_poisson_regression_non_normal_loss.py | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index e43c1deab54c6..fc4d6eb18bb43 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -310,10 +310,10 @@ def score_estimator(estimator, df_test): # The experimental data presents a long tail distribution for ``y``. 
In all # models, we predict the expected frequency of a random variable, so we will # have necessarily fewer extreme values than for the observed realizations of -# that random variable. Additionally, the normal conditional distribution used -# in ``Ridge`` and ``HistGradientBoostingRegressor`` has a constant variance, -# while for the Poisson distribution used in ``PoissonRegressor``, the variance -# is proportional to the predicted expected value. +# that random variable. Additionally, the normal distribution used in ``Ridge`` +# and ``HistGradientBoostingRegressor`` has a constant variance, while for the +# Poisson distribution used in ``PoissonRegressor``, the variance is +# proportional to the predicted expected value. # # Thus, among the considered estimators, ``PoissonRegressor`` is a-priori # better suited for modeling the long tail distribution of the non-negative @@ -321,7 +321,7 @@ def score_estimator(estimator, df_test): # # The ``HistGradientBoostingRegressor`` estimator has more flexibility and is # able to predict higher expected values while still assuming a normal -# conditional distribution with constant variance for the response variable. +# distribution with constant variance for the response variable. # # Evaluation of the calibration of predictions # -------------------------------------------- @@ -389,6 +389,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( y_true, y_pred, sample_weight=exposure, n_bins=10) + # Name of the model after the class of the estimator used in the last step + # of the pipeline. model_name = model.steps[-1][1].__class__.__name__ print(f"Predicted number of claims by {model_name}: " f"{np.sum(y_pred * exposure):.1f}") @@ -407,7 +409,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, ############################################################################### # The dummy regression model predicts on constant frequency. This model is not -# discriminative at all but is none-the-less well calibrated. +# attribute the same tied rank to all samples but is none-the-less well +# calibrated. # # The ``Ridge`` regression model can predict very low expected frequencies that # do not match the data. It can therefore severly under-estimate the risk for @@ -422,13 +425,13 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # claims in the test set while the other three models can approximately recover # the total number of claims of the test portfolio. # -# Evaluation of the discriminative power -# -------------------------------------- +# Evaluation of the ranking power +# ------------------------------- # # For some business applications, we are interested in the ability of the model -# to discriminate the riskiest from the safest policyholders, irrespective of -# the absolute value of the prediction. In this case, the model evaluation -# would cast the problem as a ranking problem rather than a regression problem. +# to rank the riskiest from the safest policyholders, irrespective of the +# absolute value of the prediction. In this case, the model evaluation would +# cast the problem as a ranking problem rather than a regression problem. 
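One minimal way to make this ranking framing concrete (a sketch, not in the patch; it assumes ``scipy`` is available and reuses the ``poisson`` pipeline fit above): Spearman's rank correlation compares predicted and observed frequencies while ignoring their absolute scale:

from scipy.stats import spearmanr

corr, _ = spearmanr(df_test["Frequency"], poisson.predict(df_test))
print(f"Spearman rank correlation: {corr:.3f}")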
# # To compare the 3 models from this perspective, one can plot the fraction of # the number of claims vs the fraction of exposure for test samples ordered by @@ -485,8 +488,8 @@ def lorenz_curve(y_true, y_pred, exposure): ax.legend(loc="upper left") ############################################################################## -# As expected, the dummy regressor is unable to discriminate and therefore -# performs the worst on this plot. +# As expected, the dummy regressor is unable to correctly rank the samples and +# therefore performs the worst on this plot. # # The tree-based model is significantly better at ranking policyholders by risk # while the two linear models perform similarly. @@ -507,11 +510,12 @@ def lorenz_curve(y_true, y_pred, exposure): # Main takeaways # -------------- # -# - A ideal model is both well-calibrated and discriminative. +# - The performance of the models can be evaluted by their ability to yield +# well-calibrated predictions and a good ranking. # # - The Gini index reflects the ability of a model to rank predictions # irrespective of their absolute values, and therefore only assess their -# discriminative power. +# ranking power. # # - The calibration of the model can be assessed by plotting the mean observed # value vs the mean predicted value on groups of test samples binned by @@ -524,16 +528,16 @@ def lorenz_curve(y_true, y_pred, exposure): # - Using the Poisson loss can correct this problem and lead to a # well-calibrated linear model. # -# - Despite the improvement in calibration, the discriminative power of both -# linear models are comparable and well below the discriminative power of the -# Gradient Boosting Regression Trees. +# - Despite the improvement in calibration, the ranking power of both linear +# models are comparable and well below the ranking power of the Gradient +# Boosting Regression Trees. # # - The non-linear Gradient Boosting Regression Trees model does not seem to # suffer from significant mis-calibration issues (despite the use of a least # squares loss). # # - The Poisson deviance computed as an evaluation metric reflects both the -# calibration and the discriminative power of the model but makes a linear +# calibration and the ranking power of the model but makes a linear # assumption on the ideal relationship between the expected value of an the # variance of the response variable. # From aa60673820e29b7633b49e9b28adc168560c34b2 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Apr 2020 18:41:37 +0200 Subject: [PATCH 11/22] Apply suggestions from code review Co-Authored-By: Christian Lorentzen --- .../plot_poisson_regression_non_normal_loss.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index fc4d6eb18bb43..b3f6bc7d14a78 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -10,13 +10,13 @@ A few definitions: -- a **policy** is a contract between an insurance company and an individual: +- A **policy** is a contract between an insurance company and an individual: the **policyholder**, that is, the vehicle driver in this case. -- a **claim** is the request made by a policyholder to the insurer to +- A **claim** is the request made by a policyholder to the insurer to compensate for a loss covered by the insurance. 
-- the **exposure** is the duration of the insurance coverage of a given policy, +- The **exposure** is the duration of the insurance coverage of a given policy, in years. - the claim **frequency** is the number of claims divided by the exposure, @@ -233,7 +233,7 @@ def score_estimator(estimator, df_test): # Regression Trees. Tree-based models do not require the categorical data to be # one-hot encoded: instead, we can encode each category label with an arbitrary # integer using :class:`preprocessing.OrdinalEncoder`. With this encoding, the -# the trees will treat the categorical features as ordered features, which +# trees will treat the categorical features as ordered features, which # might not be always a desired behavior. However this effect is limited for # deep enough trees which are able to recover the categorical nature of the # features. The main advantage of the :class:`preprocessing.OrdinalEncoder` @@ -408,7 +408,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, plt.tight_layout() ############################################################################### -# The dummy regression model predicts on constant frequency. This model is not +# The dummy regression model predicts a constant frequency. This model is not # attribute the same tied rank to all samples but is none-the-less well # calibrated. # @@ -525,7 +525,7 @@ def lorenz_curve(y_true, y_pred, exposure): # model to be badly calibrated. In particular it tends to under estimate the # risk and can even predict invalid negative frequencies... # -# - Using the Poisson loss can correct this problem and lead to a +# - Using the Poisson loss with a log-link can correct these problems and lead to a # well-calibrated linear model. # # - Despite the improvement in calibration, the ranking power of both linear @@ -538,7 +538,7 @@ def lorenz_curve(y_true, y_pred, exposure): # # - The Poisson deviance computed as an evaluation metric reflects both the # calibration and the ranking power of the model but makes a linear -# assumption on the ideal relationship between the expected value of an the +# assumption on the ideal relationship between the expected value and the # variance of the response variable. # # - Traditional regression metrics such as Mean Squared Error and Mean Absolute From e3c382deb05cfa77e3dbaa79f18a42c36f2abd3a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Apr 2020 19:23:56 +0200 Subject: [PATCH 12/22] Fix pep8 --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index b3f6bc7d14a78..0c729b19fd887 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -525,8 +525,8 @@ def lorenz_curve(y_true, y_pred, exposure): # model to be badly calibrated. In particular it tends to under estimate the # risk and can even predict invalid negative frequencies... # -# - Using the Poisson loss with a log-link can correct these problems and lead to a -# well-calibrated linear model. +# - Using the Poisson loss with a log-link can correct these problems and lead +# to a well-calibrated linear model. 
# # - Despite the improvement in calibration, the ranking power of both linear # models are comparable and well below the ranking power of the Gradient From 8347811f8a5b1747f97c9138d383855ebde4468c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Apr 2020 19:57:33 +0200 Subject: [PATCH 13/22] Mention identity link function + linear variance function assumption --- .../plot_poisson_regression_non_normal_loss.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 0c729b19fd887..c632199add159 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -521,9 +521,10 @@ def lorenz_curve(y_true, y_pred, exposure): # value vs the mean predicted value on groups of test samples binned by # predicted risk. # -# - The least squares loss of the Ridge regression model seem to cause this -# model to be badly calibrated. In particular it tends to under estimate the -# risk and can even predict invalid negative frequencies... +# - The least squares loss (along with the implicit use of the identity link +# function) of the Ridge regression model seems to cause this model to be +# badly calibrated. In particular it tends to under estimate the risk and can +# even predict invalid negative frequencies. # # - Using the Poisson loss with a log-link can correct these problems and lead # to a well-calibrated linear model. @@ -537,9 +538,10 @@ def lorenz_curve(y_true, y_pred, exposure): # squares loss). # # - The Poisson deviance computed as an evaluation metric reflects both the -# calibration and the ranking power of the model but makes a linear +# calibration and the ranking power of the model. It also makes a linear # assumption on the ideal relationship between the expected value and the -# variance of the response variable. +# variance of the response variable. For the sake of conciseness we did not +# check whether this assumption holds. # # - Traditional regression metrics such as Mean Squared Error and Mean Absolute # Error are hard to meaningfully interpret on count values with many zeros. 
From e0121392e2243f4db3fe189b666218e23b8d56ad Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Apr 2020 19:59:23 +0200 Subject: [PATCH 14/22] Fix kwonly deprecation warning in scoring functions --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index c632199add159..ab713813d8875 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -158,10 +158,10 @@ def score_estimator(estimator, df_test): print("MSE: %.3f" % mean_squared_error(df_test["Frequency"], y_pred, - df_test["Exposure"])) + sample_weight=df_test["Exposure"])) print("MAE: %.3f" % mean_absolute_error(df_test["Frequency"], y_pred, - df_test["Exposure"])) + sample_weight=df_test["Exposure"])) # ignore non-positive predictions, as they are invalid for # the Poisson deviance @@ -174,7 +174,7 @@ def score_estimator(estimator, df_test): print("mean Poisson deviance: %.3f" % mean_poisson_deviance(df_test["Frequency"], y_pred.clip(min=1e-6), - df_test["Exposure"])) + sample_weight=df_test["Exposure"])) print("Constant mean frequency evaluation:") From 039e87bb5f00d76e7ac2ab965f9ea17e33aeb863 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 28 Apr 2020 09:30:23 +0200 Subject: [PATCH 15/22] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Christian Lorentzen --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index ab713813d8875..b06d430463b3e 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -523,7 +523,7 @@ def lorenz_curve(y_true, y_pred, exposure): # # - The least squares loss (along with the implicit use of the identity link # function) of the Ridge regression model seems to cause this model to be -# badly calibrated. In particular it tends to under estimate the risk and can +# badly calibrated. In particular, it tends to underestimate the risk and can # even predict invalid negative frequencies. # # - Using the Poisson loss with a log-link can correct these problems and lead From 1fe4fb1e608496a11706f1bc97ee1ca84901bc31 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 28 Apr 2020 12:23:47 +0200 Subject: [PATCH 16/22] Use Poisson GBRT --- ...plot_poisson_regression_non_normal_loss.py | 106 ++++++++++-------- 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index b06d430463b3e..bf6b63a83469a 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -181,17 +181,18 @@ def score_estimator(estimator, df_test): score_estimator(dummy, df_test) ############################################################################## -# Linear models -# ------------- +# (Generalized) Linear models +# --------------------------- # -# We start by modeling the target variable with the (penalized) least squares -# linear regression model. 
-# linear regression model. We use a low penalization as we expect such a linear
-# model to under-fit on such a large dataset.
+# We start by modeling the target variable with the (l2 penalized) least
+# squares linear regression model, more commonly known as Ridge regression. We
+# use a low penalization as we expect such a linear model to under-fit on such
+# a large dataset.

from sklearn.linear_model import Ridge

-ridge = Pipeline([
+ridge_glm = Pipeline([
    ("preprocessor", linear_model_preprocessor),
    ("regressor", Ridge(alpha=1e-6)),
]).fit(df_train, df_train["Frequency"],
       regressor__sample_weight=df_train["Exposure"])
@@ -206,39 +207,44 @@ def score_estimator(estimator, df_test):
# meta-estimator to map ``y_pred`` to a strictly positive domain.

print("Ridge evaluation:")
-score_estimator(ridge, df_test)
+score_estimator(ridge_glm, df_test)

##############################################################################
# Next we fit the Poisson regressor on the target variable. We set the
-# regularization strength ``alpha`` to 1 over number of samples in oder to
-# mimic the Ridge regressor whose L2 penalty term scales differently with the
-# number of samples.
+# regularization strength ``alpha`` to approximately 1e-6 over number of
+# samples in oder to mimic the Ridge regressor whose L2 penalty term scales
+# differently with the number of samples.

from sklearn.linear_model import PoissonRegressor

n_samples = df_train.shape[0]

-poisson = Pipeline([
+poisson_glm = Pipeline([
    ("preprocessor", linear_model_preprocessor),
-    ("regressor", PoissonRegressor(alpha=1e-6/n_samples, max_iter=1000))
+    ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300))
])
-poisson.fit(df_train, df_train["Frequency"],
-            regressor__sample_weight=df_train["Exposure"])
+poisson_glm.fit(df_train, df_train["Frequency"],
+                regressor__sample_weight=df_train["Exposure"])

print("PoissonRegressor evaluation:")
-score_estimator(poisson, df_test)
+score_estimator(poisson_glm, df_test)

##############################################################################
# Finally, we will consider a non-linear model, namely Gradient Boosting
# Regression Trees. Tree-based models do not require the categorical data to be
# one-hot encoded: instead, we can encode each category label with an arbitrary
# integer using :class:`preprocessing.OrdinalEncoder`. With this encoding, the
-# trees will treat the categorical features as ordered features, which
-# might not be always a desired behavior. However this effect is limited for
-# deep enough trees which are able to recover the categorical nature of the
+# trees will treat the categorical features as ordered features, which might
+# not always be a desired behavior. However this effect is limited for deep
+# enough trees which are able to recover the categorical nature of the
# features. The main advantage of the :class:`preprocessing.OrdinalEncoder`
# over the :class:`preprocessing.OneHotEncoder` is that it will make training
# faster.
+#
+# Gradient Boosting also give the possibility to fit the trees with a Poisson
+# loss (with an implicit log-link function) instead of the default
+# least-squares loss. Here we only fit trees with the Poisson loss to keep this
+# example concise.
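# [Editorial sketch, not part of the patch] With ``loss="poisson"`` the trees
# model the logarithm of the expected frequency, so the predictions are
# guaranteed to be strictly positive, unlike with the default least-squares
# loss. A self-contained toy check on synthetic count data:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import numpy as np

rng = np.random.RandomState(0)
X_toy = rng.normal(size=(1000, 2))
y_toy = rng.poisson(lam=0.05, size=1000)  # mostly zeros, like claim counts
toy_gbrt = HistGradientBoostingRegressor(loss="poisson").fit(X_toy, y_toy)
assert (toy_gbrt.predict(X_toy) > 0).all()  # log-link keeps predictions > 0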
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

@@ -254,15 +260,16 @@ def score_estimator(estimator, df_test):
    ],
    remainder="drop",
)

-gbrt = Pipeline([
+poisson_gbrt = Pipeline([
    ("preprocessor", tree_preprocessor),
-    ("regressor", HistGradientBoostingRegressor(max_leaf_nodes=128)),
+    ("regressor", HistGradientBoostingRegressor(loss="poisson",
+                                                max_leaf_nodes=128)),
])
-gbrt.fit(df_train, df_train["Frequency"],
-         regressor__sample_weight=df_train["Exposure"])
+poisson_gbrt.fit(df_train, df_train["Frequency"],
+                 regressor__sample_weight=df_train["Exposure"])

-print("Gradient Boosted Trees evaluation:")
-score_estimator(gbrt, df_test)
+print("Poisson Gradient Boosted Trees evaluation:")
+score_estimator(poisson_gbrt, df_test)

##############################################################################
@@ -294,7 +301,7 @@ def score_estimator(estimator, df_test):
    axes[row_idx, 0].set_ylim([1e1, 5e5])
    axes[row_idx, 0].set_ylabel(label + " samples")

-    for idx, model in enumerate([ridge, poisson, gbrt]):
+    for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]):
        y_pred = model.predict(df)

        pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins),
@@ -311,17 +318,26 @@ def score_estimator(estimator, df_test):
# models, we predict the expected frequency of a random variable, so we will
# have necessarily fewer extreme values than for the observed realizations of
# that random variable. Additionally, the normal distribution used in ``Ridge``
-# and ``HistGradientBoostingRegressor`` has a constant variance, while for the
-# Poisson distribution used in ``PoissonRegressor``, the variance is
+# has a constant variance, while for the Poisson distribution used in
+# ``PoissonRegressor`` and ``HistGradientBoostingRegressor``, the variance is
# proportional to the predicted expected value.
#
-# Thus, among the considered estimators, ``PoissonRegressor`` is a-priori
-# better suited for modeling the long tail distribution of the non-negative
-# data as compared to its ``Ridge`` counter-part.
+# Thus, among the considered estimators, ``PoissonRegressor`` and
+# ````HistGradientBoostingRegressor```` are a-priori better suited for modeling
+# the long tail distribution of the non-negative data as compared to the
+# ``Ridge`` model which makes a wrong assumption on the distribution of the
+# target variable.
+#
+# The ``HistGradientBoostingRegressor`` estimator has the most flexibility and
+# is able to predict higher expected values.
#
-# The ``HistGradientBoostingRegressor`` estimator has more flexibility and is
-# able to predict higher expected values while still assuming a normal
-# distribution with constant variance for the response variable.
+# Note that we could have used the least squares loss for the
+# ``HistGradientBoostingRegressor`` model. This would wrongly assume a normal
+# distribution of the response variable as for the `Ridge` model and possibly also
+# lead to slightly negative predictions. However the gradient boosted trees
+# would still perform relatively well and in particular better than
+# ``PoissonRegressor`` thanks to the flexibility of the trees combined with
+# the large number of training samples.
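# [Editorial sketch, not part of the patch] The least-squares variant discussed
# in the previous paragraph could be fitted for comparison as follows; this
# reuses ``tree_preprocessor``, ``df_train`` and ``df_test`` from the example
# script (``Pipeline`` and ``HistGradientBoostingRegressor`` imported as above):
ls_gbrt = Pipeline([
    ("preprocessor", tree_preprocessor),
    ("regressor", HistGradientBoostingRegressor(max_leaf_nodes=128)),
])
ls_gbrt.fit(df_train, df_train["Frequency"],
            regressor__sample_weight=df_train["Exposure"])
n_negative = (ls_gbrt.predict(df_test) < 0).sum()
print(f"{n_negative} invalid negative predictions with the least-squares loss")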
#
# Evaluation of the calibration of predictions
# --------------------------------------------
@@ -382,17 +398,17 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
plt.subplots_adjust(wspace=0.3)

-for axi, model in zip(ax.ravel(), [dummy, ridge, poisson, gbrt]):
+for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt,
+                                   dummy]):
    y_pred = model.predict(df_test)
    y_true = df_test["Frequency"].values
    exposure = df_test["Exposure"].values
    q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group(
        y_true, y_pred, sample_weight=exposure, n_bins=10)

-    # Name of the model after the class of the estimator used in the last step
-    # of the pipeline.
-    model_name = model.steps[-1][1].__class__.__name__
-    print(f"Predicted number of claims by {model_name}: "
+    # Name of the model after the estimator used in the last step of the
+    # pipeline.
+    print(f"Predicted number of claims by {model[-1]}: "
          f"{np.sum(y_pred * exposure):.1f}")

    axi.plot(q, y_pred_seg, marker='x', linestyle="--", label="predictions")
@@ -400,7 +416,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
    axi.set_xlim(0, 1.0)
    axi.set_ylim(0, 0.5)
    axi.set(
-        title=model[-1].__class__.__name__,
+        title=model[-1],
        xlabel='Fraction of samples sorted by y_pred',
        ylabel='Mean Frequency (y_pred)'
    )
@@ -409,8 +425,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,

###############################################################################
# The dummy regression model predicts a constant frequency. This model is not
-# attribute the same tied rank to all samples but is none-the-less well
-# calibrated.
+# attribute the same tied rank to all samples but is nonetheless globally
+# well calibrated (to estimate the mean frequency of the entire population).
#
# The ``Ridge`` regression model can predict very low expected frequencies that
# do not match the data. It can therefore severely under-estimate the risk for
@@ -460,13 +476,12 @@ def lorenz_curve(y_true, y_pred, exposure):

fig, ax = plt.subplots(figsize=(8, 8))

-for model in [dummy, ridge, poisson, gbrt]:
+for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]:
    y_pred = model.predict(df_test)
    cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred,
                                            df_test["Exposure"])
    gini = 1 - 2 * auc(cum_exposure, cum_claims)
-    label = "{} (Gini: {:.2f})".format(
-        model[-1].__class__.__name__, gini)
+    label = "{} (Gini: {:.2f})".format(model[-1], gini)
    ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)

# Oracle model: y_pred == y_test
@@ -499,7 +514,8 @@ def lorenz_curve(y_true, y_pred, exposure):
#
# This last point is expected due to the nature of the problem: the occurrence
# of accidents is mostly dominated by circumstantial causes that are not
-# captured in the columns of the dataset or that are indeed random.
+# captured in the columns of the dataset and can indeed be considered as purely
+# random.
#
# The linear models assume no interactions between the input variables which
# likely causes under-fitting. Inserting a polynomial feature extractor

From 65b1125ba67dbdf49ecfc91ef4f1c87939316cb8 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 28 Apr 2020 17:07:00 +0200
Subject: [PATCH 17/22] Apply suggestions from code review

Co-Authored-By: Nicolas Hug
---
 .../plot_poisson_regression_non_normal_loss.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index bf6b63a83469a..af4ff51a256f9 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -19,7 +19,7 @@
- The **exposure** is the duration of the insurance coverage of a given policy,
  in years.

-- the claim **frequency** is the number of claims divided by the exposure,
+- The claim **frequency** is the number of claims divided by the exposure,
  typically measured in number of claims per year.

In this dataset, each sample corresponds to an insurance policy. Available
@@ -124,7 +124,7 @@
#
# It is worth noting that more than 93% of policyholders have zero claims. If
# we were to convert this problem into a binary classification task, it would
-# be significantly imbalanced.
+# be significantly imbalanced, and even a simplistic model that would only predict the mean can achieve an accuracy of 93%.
#
# To evaluate the pertinence of the used metrics, we will consider as a
# baseline a "dummy" estimator that constantly predicts the mean frequency of
@@ -186,7 +186,7 @@
#
# We start by modeling the target variable with the (l2 penalized) least
# squares linear regression model, more commonly known as Ridge regression. We
-# use a low penalization as we expect such a linear model to under-fit on such
+# use a low penalization `alpha`, as we expect such a linear model to under-fit on such
# a large dataset.
@@ -212,7 +212,7 @@
##############################################################################
# Next we fit the Poisson regressor on the target variable. We set the
# regularization strength ``alpha`` to approximately 1e-6 over number of
-# samples in oder to mimic the Ridge regressor whose L2 penalty term scales
+# samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty term scales
# differently with the number of samples.

from sklearn.linear_model import PoissonRegressor
@@ -233,7 +233,7 @@ def score_estimator(estimator, df_test):
# Finally, we will consider a non-linear model, namely Gradient Boosting
# Regression Trees. Tree-based models do not require the categorical data to be
# one-hot encoded: instead, we can encode each category label with an arbitrary
-# integer using :class:`preprocessing.OrdinalEncoder`. With this encoding, the
+# integer using :class:`~sklearn.preprocessing.OrdinalEncoder`. With this encoding, the
# trees will treat the categorical features as ordered features, which might
# not always be a desired behavior. However this effect is limited for deep
# enough trees which are able to recover the categorical nature of the
# features. The main advantage of the :class:`preprocessing.OrdinalEncoder`
# over the :class:`preprocessing.OneHotEncoder` is that it will make training
# faster.
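# [Editorial sketch, not part of the patch] A tiny illustration of the two
# encodings on a single categorical column; the ``VehBrand`` values are taken
# from the dataset, and the integer ids assigned by ``OrdinalEncoder`` simply
# follow the sorted order of the observed categories:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

brands = pd.DataFrame({"VehBrand": ["B1", "B2", "B12", "B1"]})
print(OrdinalEncoder().fit_transform(brands))             # one integer column
print(OneHotEncoder(sparse=False).fit_transform(brands))  # one column per category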
#
-# Gradient Boosting also give the possibility to fit the trees with a Poisson
+# Gradient Boosting also gives the possibility to fit the trees with a Poisson
# loss (with an implicit log-link function) instead of the default
# least-squares loss. Here we only fit trees with the Poisson loss to keep this
# example concise.
@@ -333,7 +333,7 @@ def score_estimator(estimator, df_test):
#
# Note that we could have used the least squares loss for the
# ``HistGradientBoostingRegressor`` model. This would wrongly assume a normal
-# distribution of the response variable as for the `Ridge` model and possibly also
+# distribution of the response variable as for the `Ridge` model, and possibly also
# lead to slightly negative predictions. However the gradient boosted trees
# would still perform relatively well and in particular better than
# ``PoissonRegressor`` thanks to the flexibility of the trees combined with
# the large number of training samples.
@@ -519,7 +519,7 @@ def lorenz_curve(y_true, y_pred, exposure):
#
# The linear models assume no interactions between the input variables which
# likely causes under-fitting. Inserting a polynomial feature extractor
-# (func:`sklearn.preprocessing.PolynomialFeatures`) indeed increases their
+# (:func:`~sklearn.preprocessing.PolynomialFeatures`) indeed increases their
# discriminative power by 2 points of Gini index. In particular it improves the
# ability of the models to identify the top 5% riskiest profiles.
#

From deba073df4a46aa020d6709064e7f1347f3667f7 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 28 Apr 2020 17:34:52 +0200
Subject: [PATCH 18/22] Phrasing, formatting, fixes

---
 ...plot_poisson_regression_non_normal_loss.py | 38 +++++++++----------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index af4ff51a256f9..5730e6c260db3 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -5,8 +5,9 @@

This example illustrates the use of log-linear Poisson regression on the
`French Motor Third-Party Liability Claims dataset
-<https://www.openml.org/d/41214>`_ from [1]_ and compares it with models
-learned with least squared error.
+<https://www.openml.org/d/41214>`_ from [1]_ and compares it with a linear
+model fitted with the usual least squares error and a non-linear GBRT model
+fitted with the Poisson loss (and a log-link).

A few definitions:

@@ -124,7 +125,8 @@
#
# It is worth noting that more than 93% of policyholders have zero claims. If
# we were to convert this problem into a binary classification task, it would
-# be significantly imbalanced, and even a simplistic model that would only predict the mean can achieve an accuracy of 93%.
+# be significantly imbalanced, and even a simplistic model that would only
+# predict the mean can achieve an accuracy of 93%.
#
# To evaluate the pertinence of the used metrics, we will consider as a
# baseline a "dummy" estimator that constantly predicts the mean frequency of
@@ -186,8 +188,8 @@
#
# We start by modeling the target variable with the (l2 penalized) least
# squares linear regression model, more commonly known as Ridge regression. We
-# use a low penalization `alpha`, as we expect such a linear model to under-fit on such
-# a large dataset.
+# use a low penalization `alpha`, as we expect such a linear model to under-fit
+# on such a large dataset.
from sklearn.linear_model import Ridge

@@ -212,8 +214,8 @@ def score_estimator(estimator, df_test):
##############################################################################
# Next we fit the Poisson regressor on the target variable. We set the
# regularization strength ``alpha`` to approximately 1e-6 over number of
-# samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty term scales
-# differently with the number of samples.
+# samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty
+# term scales differently with the number of samples.

from sklearn.linear_model import PoissonRegressor

@@ -233,10 +235,10 @@ def score_estimator(estimator, df_test):
# Finally, we will consider a non-linear model, namely Gradient Boosting
# Regression Trees. Tree-based models do not require the categorical data to be
# one-hot encoded: instead, we can encode each category label with an arbitrary
-# integer using :class:`~sklearn.preprocessing.OrdinalEncoder`. With this encoding, the
-# trees will treat the categorical features as ordered features, which might
-# not always be a desired behavior. However this effect is limited for deep
-# enough trees which are able to recover the categorical nature of the
+# integer using :class:`~sklearn.preprocessing.OrdinalEncoder`. With this
+# encoding, the trees will treat the categorical features as ordered features,
+# which might not always be a desired behavior. However this effect is limited
+# for deep enough trees which are able to recover the categorical nature of the
# features. The main advantage of the :class:`preprocessing.OrdinalEncoder`
# over the :class:`preprocessing.OneHotEncoder` is that it will make training
# faster.
@@ -333,11 +335,11 @@ def score_estimator(estimator, df_test):
#
# Note that we could have used the least squares loss for the
# ``HistGradientBoostingRegressor`` model. This would wrongly assume a normal
-# distribution of the response variable as for the `Ridge` model, and possibly also
-# lead to slightly negative predictions. However the gradient boosted trees
-# would still perform relatively well and in particular better than
-# ``PoissonRegressor`` thanks to the flexibility of the trees combined with
-# the large number of training samples.
+# distribution of the response variable as for the `Ridge` model, and possibly
+# also lead to slightly negative predictions. However the gradient boosted
+# trees would still perform relatively well and in particular better than
+# ``PoissonRegressor`` thanks to the flexibility of the trees combined with the
+# large number of training samples.
#
# Evaluation of the calibration of predictions
# --------------------------------------------
@@ -549,10 +551,6 @@ def lorenz_curve(y_true, y_pred, exposure):
# models is comparable and well below the ranking power of the Gradient
# Boosting Regression Trees.
#
-# - The non-linear Gradient Boosting Regression Trees model does not seem to
-# suffer from significant mis-calibration issues (despite the use of a least
-# squares loss).
-#
# - The Poisson deviance computed as an evaluation metric reflects both the
# calibration and the ranking power of the model. It also makes a linear
# assumption on the ideal relationship between the expected value and the
# variance of the response variable. For the sake of conciseness we did not
# check whether this assumption holds.

From 53e402f62b0723b01d9df659c8d0da80893ed988 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 28 Apr 2020 17:37:21 +0200
Subject: [PATCH 19/22] Add link to OpenML page

---
 .../linear_model/plot_poisson_regression_non_normal_loss.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index 5730e6c260db3..f61f586068277 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -49,8 +49,8 @@
# The French Motor Third-Party Liability Claims dataset
# -----------------------------------------------------
#
-# Let's load the motor claim dataset. We ignore the severity data for this
-# study for the sake of simplicitly:
+# Let's load the motor claim dataset from OpenML:
+# https://www.openml.org/d/41214

from sklearn.datasets import fetch_openml

From 02cd5999268c8989acb59b2391751e01e7641ce2 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 28 Apr 2020 17:56:05 +0200
Subject: [PATCH 20/22] mask instead of clip, fix refs, fix backticks, note on
 prediction histograms

---
 ...plot_poisson_regression_non_normal_loss.py | 46 ++++++++++---------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index f61f586068277..2abf81a506ce3 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -165,18 +165,19 @@ def score_estimator(estimator, df_test):
          mean_absolute_error(df_test["Frequency"], y_pred,
                              sample_weight=df_test["Exposure"]))

-    # ignore non-positive predictions, as they are invalid for
-    # the Poisson deviance
+    # Ignore non-positive predictions, as they are invalid for
+    # the Poisson deviance.
    mask = y_pred > 0
    if (~mask).any():
-        n_clipped, n_samples = (~mask).sum(), mask.shape[0]
-        print(f"WARNING: Estimator yields non-positive predictions for "
-              f"{n_clipped} samples out of {n_samples}. These predictions "
-              f"will be clipped while computing the Poisson deviance.")
+        n_masked, n_samples = (~mask).sum(), mask.shape[0]
+        print(f"WARNING: Estimator yields invalid, non-positive predictions "
+              f"for {n_masked} samples out of {n_samples}. These predictions "
+              f"are ignored when computing the Poisson deviance.")

    print("mean Poisson deviance: %.3f" %
-          mean_poisson_deviance(df_test["Frequency"], y_pred.clip(min=1e-6),
-                                sample_weight=df_test["Exposure"]))
+          mean_poisson_deviance(df_test["Frequency"][mask],
+                                y_pred[mask],
+                                sample_weight=df_test["Exposure"][mask]))


print("Constant mean frequency evaluation:")

##############################################################################
# The Poisson deviance cannot be computed on non-positive values predicted by
# the model. For models that do return a few non-positive predictions (e.g.
# :class:`~sklearn.linear_model.Ridge`) we ignore the corresponding samples,
# meaning that the obtained Poisson deviance is approximate. An alternative
-# approach could be to use :class:`compose.TransformedTargetRegressor`
+# approach could be to use the :class:`~sklearn.compose.TransformedTargetRegressor`
# meta-estimator to map ``y_pred`` to a strictly positive domain.

print("Ridge evaluation:")
@@ -239,8 +240,9 @@ def score_estimator(estimator, df_test):
# encoding, the trees will treat the categorical features as ordered features,
# which might not always be a desired behavior. However this effect is limited
# for deep enough trees which are able to recover the categorical nature of the
-# features. The main advantage of the :class:`preprocessing.OrdinalEncoder`
-# over the :class:`preprocessing.OneHotEncoder` is that it will make training
+# features. The main advantage of the
+# :class:`~sklearn.preprocessing.OrdinalEncoder` over the
+# :class:`~sklearn.preprocessing.OneHotEncoder` is that it will make training
# faster.
#
# Gradient Boosting also gives the possibility to fit the trees with a Poisson
@@ -319,16 +321,18 @@ def score_estimator(estimator, df_test):
# The experimental data presents a long tail distribution for ``y``. In all
# models, we predict the expected frequency of a random variable, so we will
# have necessarily fewer extreme values than for the observed realizations of
-# that random variable. Additionally, the normal distribution used in ``Ridge``
-# has a constant variance, while for the Poisson distribution used in
-# ``PoissonRegressor`` and ``HistGradientBoostingRegressor``, the variance is
-# proportional to the predicted expected value.
+# that random variable. This explains why the mode of the histograms of model
+# predictions doesn't necessarily correspond to the smallest value.
+# Additionally, the normal distribution used in ``Ridge`` has a constant
+# variance, while for the Poisson distribution used in ``PoissonRegressor`` and
+# ``HistGradientBoostingRegressor``, the variance is proportional to the
+# predicted expected value.
#
# Thus, among the considered estimators, ``PoissonRegressor`` and
-# ````HistGradientBoostingRegressor```` are a-priori better suited for modeling
-# the long tail distribution of the non-negative data as compared to the
-# ``Ridge`` model which makes a wrong assumption on the distribution of the
-# target variable.
+# ``HistGradientBoostingRegressor`` are a-priori better suited for modeling the
+# long tail distribution of the non-negative data as compared to the ``Ridge``
+# model which makes a wrong assumption on the distribution of the target
+# variable.
#
# The ``HistGradientBoostingRegressor`` estimator has the most flexibility and
# is able to predict higher expected values.

From e9725271660281f5250506dbfd9aeda7e2a760c5 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 28 Apr 2020 18:00:38 +0200
Subject: [PATCH 21/22] Typo

---
 .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index 2abf81a506ce3..8d0b38b42415d 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -430,7 +430,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
plt.tight_layout()

###############################################################################
-# The dummy regression model predicts a constant frequency. This model is not
+# The dummy regression model predicts a constant frequency. This model does not
# attribute the same tied rank to all samples but is nonetheless globally
# well calibrated (to estimate the mean frequency of the entire population).
#

From 8f80a4df756c8e7322962d211778f12eac82d9b9 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 28 Apr 2020 22:51:15 +0200
Subject: [PATCH 22/22] Phrasing for the Lorenz curves

---
 .../plot_poisson_regression_non_normal_loss.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index 8d0b38b42415d..4fc3bea7bda51 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -455,9 +455,10 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
# absolute value of the prediction. In this case, the model evaluation would
# cast the problem as a ranking problem rather than a regression problem.
#
-# To compare the 3 models from this perspective, one can plot the fraction of
-# the number of claims vs the fraction of exposure for test samples ordered by
-# the model predictions, from safest to riskiest according to each model.
+# To compare the 3 models from this perspective, one can plot the cumulative
+# proportion of claims vs the cumulative proportion of exposure for the test
+# samples ordered by the model predictions, from safest to riskiest according to
+# each model.
#
# This plot is called a Lorenz curve and can be summarized by the Gini index:
@@ -502,9 +503,9 @@ def lorenz_curve(y_true, y_pred, exposure):
ax.plot([0, 1], [0, 1], linestyle="--", color="black",
        label="Random baseline")
ax.set(
-    title="Cumulated number of claims by model",
-    xlabel='Fraction of exposure (from safest to riskiest)',
-    ylabel='Fraction of number of claims'
+    title="Lorenz curves by model",
+    xlabel='Cumulative proportion of exposure (from safest to riskiest)',
+    ylabel='Cumulative proportion of claims'
)
ax.legend(loc="upper left")
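# [Editorial sketch, not part of the patch series] A toy check of the Gini
# index that summarizes the Lorenz curve, using the same formula as the
# example (``gini = 1 - 2 * auc(cum_exposure, cum_claims)``): a perfect
# ("oracle") ranking yields the largest attainable Gini for the data, while an
# uninformative constant prediction yields a Gini close to zero.
import numpy as np
from sklearn.metrics import auc


def gini_index(y_true, y_pred, exposure):
    ranking = np.argsort(y_pred)  # from safest to riskiest according to y_pred
    ranked_exposure = exposure[ranking]
    ranked_claims = y_true[ranking] * exposure[ranking]
    cum_exposure = np.cumsum(ranked_exposure) / ranked_exposure.sum()
    cum_claims = np.cumsum(ranked_claims) / ranked_claims.sum()
    return 1 - 2 * auc(cum_exposure, cum_claims)


rng = np.random.RandomState(0)
y = rng.poisson(0.1, size=10000).astype(float)
exposure = np.ones_like(y)
print(gini_index(y, y, exposure))                 # oracle ranking: largest Gini
print(gini_index(y, np.zeros_like(y), exposure))  # constant prediction: ~0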