From 42400749b97a7f4d4ac750a295a86d9b5d9fd7cb Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 2 Aug 2023 11:19:50 +0200 Subject: [PATCH 01/52] DOC Add example showcasing HGBT regression --- doc/modules/ensemble.rst | 3 +- examples/ensemble/plot_hgbt_regression.py | 448 ++++++++++++++++++ .../plot_release_highlights_1_1_0.py | 2 + 3 files changed, 452 insertions(+), 1 deletion(-) create mode 100644 examples/ensemble/plot_hgbt_regression.py diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 36eed98da0f6b..0585f8289ed55 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -129,6 +129,8 @@ Note that for technical reasons, using a scorer is significantly slower than using the loss. By default, early-stopping is performed if there are at least 10,000 samples in the training set, and uses the validation loss. +.. _nan_support_hgbt: + Missing values support ^^^^^^^^^^^^^^^^^^^^^^ @@ -1634,4 +1636,3 @@ minimum required number of samples to consider a split ``min_samples_split``). .. [HTF] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning Ed. 2", Springer, 2009. - diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py new file mode 100644 index 0000000000000..ba651563a9d33 --- /dev/null +++ b/examples/ensemble/plot_hgbt_regression.py @@ -0,0 +1,448 @@ +""" +=========================================================== +Decision Tree Regression with HistGradientBoostingRegressor +=========================================================== + +:ref:`histogram_based_gradient_boosting` (HGBT) models can be a competitive +alternative to random forests, especially when the number of samples is larger +than tens of thousands of samples (see +:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`). + +HGBT models have additional advantages such as: + +- :ref:`categorical_support_gbdt` (see + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`) +- :ref:`nan_support_hgbt`, which avoids the need for an imputer. +- :ref:`Quantile loss support ` +- :ref:`monotonic_cst_gbdt` + +This example aims at showcasing the last three points in a real setting. +""" + +# %% +# Author: Arturo Amor +# +# License: BSD 3 clause +# +# Preparing the data +# ================== +# The `electricity dataset `_ consists of data +# collected from the Australian New South Wales Electricity Market. In this +# market, prices are not fixed and are affected by supply and demand. They are +# set every five minutes. Electricity transfers to/from the neighboring state of +# Victoria were done to alleviate fluctuations. +# +# The dataset (originally named ELEC2) contains 45,312 instances dated from 7 +# May 1996 to 5 December 1998. Each example of the dataset refers to a period of +# 30 minutes, i.e. there are 48 instances for each time period of one day. Each +# example on the dataset has 5 fields, the day of week, the time stamp, the New +# South Wales electricity demand, the Victoria electricity demand. It is +# originally a classification task, but here we use it as a regression where the +# target is the scheduled electricity transfer between states. 
+ +from sklearn.datasets import fetch_openml + +electricity = fetch_openml( + name="electricity", version=1, as_frame=True, parser="pandas" +) +df = electricity.frame +X = df.drop(columns=["transfer", "class"]) +y = df["transfer"] +X + +# %% +# Let us explore the hourly electricity transfer over different days of the week: + +import matplotlib.pyplot as plt +import seaborn as sns + +colors = sns.color_palette("colorblind") + +fig, ax = plt.subplots(figsize=(15, 10)) +pointplot = sns.pointplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax) +handles, lables = ax.get_legend_handles_labels() +ax.set( + title="Hourly energy transfer for different days of the week", + xticks=[i * 2 for i in range(24)], + xticklabels=list(range(24)), + xlabel="Time of the day", + ylabel="Normalized energy transfer", +) +_ = ax.legend(handles, ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]) + +# %% +# Notice energy transfer increases systematically during weekends. +# +# Effect of number of trees in HistGradientBoostingRegressor +# ========================================================== +# For the sake of illustrating the effect of the (maximum) number of trees, we +# train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the +# daily electricity transfer using the whole dataset. Then we visualize its +# predictions depending on the `max_iter` parameter. + +from sklearn.ensemble import HistGradientBoostingRegressor + +max_iter_list = [10, 50] + +fig, ax = plt.subplots(figsize=(12, 4)) +average_week_demand = df.groupby(["day", "period"])["transfer"].mean() +average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax) + +for idx, max_iter in enumerate(max_iter_list): + hgbt = HistGradientBoostingRegressor(max_iter=max_iter) + hgbt.fit(X, y) + y_pred = hgbt.predict(X) + prediction_df = df.copy() + prediction_df["y_pred"] = y_pred + average_pred = prediction_df.groupby(["day", "period"])["y_pred"].mean() + average_pred.plot( + color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax + ) +ax.set( + title="Average daily energy transfer during the week", + xticks=[(i + 0.2) * 48 for i in range(7)], + xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"], + xlabel="Time of the week", + ylabel="Normalized energy transfer", +) +_ = ax.legend() + +# %% +# With just a few iterations, HGBT models can achieve convergence (see +# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`). +# +# Support for missing values +# ========================== +# HGBT models have native support of missing values. During training, the tree +# grower decides where samples with missing values should go (left or right +# child) at each split, based on the potential gain. When predicting, these +# samples are sent to either child accordingly. If a feature had no missing +# values during training, samples with missing values for that feature are sent +# to the child with the most samples. +# +# Missing Completely At Random (MCAR) +# ----------------------------------- +# +# The missingness does not depend on the observed data or the unobserved data. +# It's completely random. We can simulate such scenario by randomly replacing +# values from randomly selected features with `Nan` values. 
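# Before running that simulation, here is a quick self-contained sketch (toy
# data, not part of the electricity example) showing that an HGBT model fits
# and predicts on NaN-containing input without any imputation step:

import numpy as np

from sklearn.ensemble import HistGradientBoostingRegressor

X_toy = np.array([[0.0], [1.0], [2.0], [np.nan], [4.0], [5.0]])
y_toy = np.array([0.0, 1.0, 2.0, 2.5, 4.0, 5.0])
hgbt_toy = HistGradientBoostingRegressor(min_samples_leaf=1, max_iter=10)
hgbt_toy.fit(X_toy, y_toy)  # no error despite the NaN in X_toy
hgbt_toy.predict([[np.nan]])  # NaN samples follow the split side learned as best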
+ +import numpy as np + +from sklearn.model_selection import TimeSeriesSplit, cross_validate + +np.random.seed(42) + +ts_cv = TimeSeriesSplit(n_splits=5, gap=48, max_train_size=10000, test_size=1000) +train_0, test_0 = next(ts_cv.split(df)) +last_days = slice(-192, None) +total_cells = X.shape[0] * X.shape[1] +missing_fraction_list = [0, 0.01, 0.03] + +fig, ax = plt.subplots(figsize=(12, 6)) +ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer") +hgbt = HistGradientBoostingRegressor() + +for missing_fraction in missing_fraction_list: + num_missing_cells = int(total_cells * missing_fraction) + row_indices = np.random.choice(X.shape[0], num_missing_cells, replace=True) + col_indices = np.random.choice(X.shape[1], num_missing_cells, replace=True) + X = df.drop(columns=["transfer", "class"]) + X.iloc[row_indices, col_indices] = np.nan + + hgbt.fit(X.iloc[train_0], y.iloc[train_0]) + hgbt_predictions = hgbt.predict(X.iloc[test_0]) + cv_results = cross_validate( + hgbt, + X, + y, + cv=ts_cv, + scoring="neg_root_mean_squared_error", + ) + rmse = -cv_results["test_score"] + ax.plot( + hgbt_predictions[last_days], + label=( + f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-" + f" {rmse.std():.2f}" + ), + alpha=0.5, + ) +ax.set( + title="Daily energy transfer predictions on data with MCAR values", + xticks=[(i + 0.25) * 48 for i in range(4)], + xticklabels=["Tue", "Wed", "Thu", "Fri"], + xlabel="Time of the week", + ylabel="Normalized energy transfer", +) +_ = ax.legend() + +# %% +# Missing At Random (MAR) +# ----------------------- +# +# The missingness depends on the observed data but never on unobserved data. +# Here, the missingness in "vicdemand" is set to depend on the value of the +# observed feature "nswprice". + +missing_fraction_list = [0, 0.5, 1.0] + +fig, ax = plt.subplots(figsize=(12, 6)) +ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer") + +for missing_fraction in missing_fraction_list: + X = df.drop(columns=["transfer", "class"]) + mask = X["nswprice"] < X["nswprice"].quantile(missing_fraction) + X["vicprice"] = X["vicprice"].mask(mask, np.nan) + X["vicdemand"] = X["vicdemand"].mask(mask, np.nan) + + hgbt = HistGradientBoostingRegressor() + hgbt.fit(X.iloc[train_0], y.iloc[train_0]) + hgbt_predictions = hgbt.predict(X.iloc[test_0]) + cv_results = cross_validate( + hgbt, + X, + y, + cv=ts_cv, + scoring="neg_root_mean_squared_error", + ) + rmse = -cv_results["test_score"] + ax.plot( + hgbt_predictions[last_days], + label=( + f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-" + f" {rmse.std():.2f}" + ), + alpha=0.5, + ) +ax.set( + title="Daily energy transfer predictions on data with MAR values", + xticks=[(i + 0.25) * 48 for i in range(4)], + xticklabels=["Tue", "Wed", "Thu", "Fri"], + xlabel="Time of the week", + ylabel="Normalized energy transfer", +) +_ = ax.legend() + +# %% +# In this case the features are highly correlated and therefore MAR values +# do not degrade the predictivity of the model even when completely removing +# the feature "vicprice". +# +# Missing Not At Random (MNAR) +# ---------------------------- +# +# The missingness depends on the unobserved data. In particular, if the +# probability of a value being missing in a variable is dependent on the values +# of that variable itself. Here, we set the missingness to depend on the +# unobserved feature "class". 
+ +import pandas as pd + +fig, ax = plt.subplots(figsize=(12, 6)) +ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer") + +for missing_fraction in missing_fraction_list: + X = df.drop(columns=["transfer", "class"]) + mask = df["class"] == "DOWN" + true_indices = mask[mask].index + n_keep = int(len(true_indices) * missing_fraction) + keep_indices = np.random.choice(true_indices, size=n_keep, replace=False) + mask = pd.Series(False, index=mask.index) + + # Set the randomly selected true indices to True in the new mask + mask.loc[keep_indices] = True + X["vicprice"] = X["vicprice"].mask(mask, np.nan) + X["vicdemand"] = X["vicdemand"].mask(mask, np.nan) + + hgbt = HistGradientBoostingRegressor() + hgbt.fit(X.iloc[train_0], y.iloc[train_0]) + hgbt_predictions = hgbt.predict(X.iloc[test_0]) + cv_results = cross_validate( + hgbt, + X, + y, + cv=ts_cv, + scoring="neg_root_mean_squared_error", + ) + rmse = -cv_results["test_score"] + ax.plot( + hgbt_predictions[last_days], + label=( + f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-" + f" {rmse.std():.2f}" + ), + alpha=0.5, + ) +ax.set( + title="Daily energy transfer predictions on data with MNAR values", + xticks=[(i + 0.25) * 48 for i in range(4)], + xticklabels=["Tue", "Wed", "Thu", "Fri"], + xlabel="Time of the week", + ylabel="Normalized energy transfer", +) +_ = ax.legend() + +# %% +# Support for quantile loss +# ========================= +# +# The quantile loss in regression enables a view of the potential variability in +# predictions. For instance, predicting the 5th and 95th percentiles can provide +# a 90% prediction interval, i.e. the range within which we expect the true +# value to fall with 90% probability. + +from sklearn.metrics import make_scorer, mean_pinball_loss + +quantiles = [0.95, 0.05] +predictions = [] +X = df.drop(columns=["transfer", "class"]) + +fig, ax = plt.subplots(figsize=(12, 6)) +ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer") + +for quantile in quantiles: + hgbt = HistGradientBoostingRegressor(loss="quantile", quantile=quantile) + hgbt.fit(X.iloc[train_0], y.iloc[train_0]) + hgbt_predictions = hgbt.predict(X.iloc[test_0]) + + predictions.append(hgbt_predictions) + cv_results = cross_validate( + hgbt, + X, + y, + cv=ts_cv, + scoring=make_scorer(mean_pinball_loss, alpha=quantile), + ) + score = cv_results["test_score"] + ax.plot( + hgbt_predictions[last_days], + label=( + f"quantile={quantile}, pinball loss={score.mean():.3f} +/-" + f" {score.std():.3f}" + ), + alpha=0.5, + ) + +ax.fill_between( + range(len(predictions[0][last_days])), + predictions[0][last_days], + predictions[1][last_days], + color=colors[0], + alpha=0.1, +) +ax.set( + title="Daily energy transfer predictions with quantile loss", + xticks=[(i + 0.25) * 48 for i in range(4)], + xticklabels=["Tue", "Wed", "Thu", "Fri"], + xlabel="Time of the week", + ylabel="Normalized energy transfer", +) +_ = ax.legend() + +# %% +# Keep in mind that one can still improve the calibration of our model by: +# +# - collecting more data-points (in case the model is overfitting); +# - better tuning of the model hyper-parameters (for instance you could try +# max_iter=300, max_leaf_nodes=64) and make sure the model is not over-fitting +# too much (e.g. by plotting the validation losses per boosting iteration and +# using early stopping); +# - engineering more predictive features from the same data. 
This is especially +# useful for linear quantile regression (not covered in this tutorial); +# - try other kinds of quantile regression models, for instance Quantile +# Forests. +# +# Monotonic Constraints +# --------------------- +# +# Given specific domain knowledge that requires the relationship between a +# feature and the target to be monotonically increasing or decreasing, one can +# enforce such behaviour in the predictions of a HGBT model using monotonic +# constraints. This makes the model more interpretable and prevents overfitting. +# Monotonic constraints can also be used to enforce specific regulatory +# requirements, ensure compliance and align with ethical considerations. +# +# In the present example, the policy of transfering energy from Victoria to New +# South Wales is meant to alleviate price fluctuations, meaning that the model +# predictions have to enforce such goal, i.e. transfer should increase with +# price and demand in New South Wales, but also decrease with price and demand +# in Victoria, in order to benefit both populations. +# +# To create the monotonic constraints, we use :class:`numpy.select` to assign +# `1` to the positions corresponding to columns "nswdemand" and "nswprice", `-1` +# to the positions corresponding to columns "vicdemand" and "vicprice", and `0` +# elsewhere. We then visualize the partial dependence on said features: + +from sklearn.inspection import PartialDependenceDisplay + +conditions = [ + (X.columns == "nswdemand") | (X.columns == "nswprice"), + (X.columns == "vicdemand") | (X.columns == "vicprice"), +] +choices = [1, -1] + +monotonic_cst = np.select(conditions, choices, default=0) + + +gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y) +gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y) + +fig, ax = plt.subplots(nrows=2, figsize=(15, 10)) +disp = PartialDependenceDisplay.from_estimator( + gbdt_no_cst, + X, + features=["nswdemand", "nswprice"], + line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"}, + ax=ax[0], +) + +PartialDependenceDisplay.from_estimator( + gbdt_cst, + X, + features=["nswdemand", "nswprice"], + line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"}, + ax=disp.axes_, +) +disp = PartialDependenceDisplay.from_estimator( + gbdt_no_cst, + X, + features=["vicdemand", "vicprice"], + line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"}, + ax=ax[1], +) + +PartialDependenceDisplay.from_estimator( + gbdt_cst, + X, + features=["vicdemand", "vicprice"], + line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"}, + ax=disp.axes_, +) + +plt.legend() +plt.show() + +# %% +# Indeed, we can verify that the predictive quality of the model is not degraded +# by introducing the monotonic constraints: + +cv_results = cross_validate( + gbdt_no_cst, + X, + y, + cv=ts_cv, + scoring="neg_root_mean_squared_error", +) +rmse = -cv_results["test_score"] +print(f"RMSE without constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}") + +cv_results = cross_validate( + gbdt_cst, + X, + y, + cv=ts_cv, + scoring="neg_root_mean_squared_error", +) +rmse = -cv_results["test_score"] +print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}") diff --git a/examples/release_highlights/plot_release_highlights_1_1_0.py b/examples/release_highlights/plot_release_highlights_1_1_0.py index f6432cf15037c..088919565315a 100644 --- a/examples/release_highlights/plot_release_highlights_1_1_0.py +++ 
b/examples/release_highlights/plot_release_highlights_1_1_0.py @@ -22,6 +22,8 @@ """ # %% +# .. _quantile_support_hgbdt: +# # Quantile loss in :class:`ensemble.HistGradientBoostingRegressor` # ---------------------------------------------------------------- # :class:`~ensemble.HistGradientBoostingRegressor` can model quantiles with From 9728566a790e2fee3eac2f7e027a9f02b14c1377 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 2 Aug 2023 11:48:51 +0200 Subject: [PATCH 02/52] Replace the landing-page figure --- doc/templates/index.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/templates/index.html b/doc/templates/index.html index fc0362f4e379f..1a83f29f69e9f 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -70,8 +70,8 @@
[index.html hunk body lost in extraction: swaps the landing-page carousel figure per the commit subject; only stray caption text ("Machine Learning in", "and more...") survives]
Examples From 7842e6d8b7c6b5e9a01452a262ed590a46756941 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 2 Aug 2023 16:20:04 +0200 Subject: [PATCH 03/52] Several tweaks --- examples/ensemble/plot_hgbt_regression.py | 32 +++++++++-------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index ba651563a9d33..5f5a2fb6814e6 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -11,10 +11,10 @@ HGBT models have additional advantages such as: - :ref:`categorical_support_gbdt` (see - :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`) + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`). - :ref:`nan_support_hgbt`, which avoids the need for an imputer. -- :ref:`Quantile loss support ` -- :ref:`monotonic_cst_gbdt` +- :ref:`Quantile loss support `. +- :ref:`monotonic_cst_gbdt`. This example aims at showcasing the last three points in a real setting. """ @@ -84,7 +84,7 @@ max_iter_list = [10, 50] -fig, ax = plt.subplots(figsize=(12, 4)) +fig, ax = plt.subplots(figsize=(10, 5)) average_week_demand = df.groupby(["day", "period"])["transfer"].mean() average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax) @@ -243,7 +243,7 @@ ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer") for missing_fraction in missing_fraction_list: - X = df.drop(columns=["transfer", "class"]) + X = df.drop(columns=["transfer", "class"]) # reset X mask = df["class"] == "DOWN" true_indices = mask[mask].index n_keep = int(len(true_indices) * missing_fraction) @@ -296,7 +296,7 @@ quantiles = [0.95, 0.05] predictions = [] -X = df.drop(columns=["transfer", "class"]) +X = df.drop(columns=["transfer", "class"]) # reset X fig, ax = plt.subplots(figsize=(12, 6)) ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer") @@ -344,17 +344,15 @@ # Keep in mind that one can still improve the calibration of our model by: # # - collecting more data-points (in case the model is overfitting); -# - better tuning of the model hyper-parameters (for instance you could try -# max_iter=300, max_leaf_nodes=64) and make sure the model is not over-fitting -# too much (e.g. by plotting the validation losses per boosting iteration and -# using early stopping); +# - better tuning of the model hyper-parameters (see +# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`) +# and make sure the model is not over-fitting; # - engineering more predictive features from the same data. This is especially # useful for linear quantile regression (not covered in this tutorial); -# - try other kinds of quantile regression models, for instance Quantile -# Forests. +# - try other kinds of quantile regression models, such as Quantile Forests. 
# # Monotonic Constraints -# --------------------- +# ===================== # # Given specific domain knowledge that requires the relationship between a # feature and the target to be monotonically increasing or decreasing, one can @@ -381,10 +379,8 @@ (X.columns == "vicdemand") | (X.columns == "vicprice"), ] choices = [1, -1] - monotonic_cst = np.select(conditions, choices, default=0) - gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y) gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y) @@ -396,7 +392,6 @@ line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"}, ax=ax[0], ) - PartialDependenceDisplay.from_estimator( gbdt_cst, X, @@ -411,7 +406,6 @@ line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"}, ax=ax[1], ) - PartialDependenceDisplay.from_estimator( gbdt_cst, X, @@ -419,9 +413,7 @@ line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"}, ax=disp.axes_, ) - -plt.legend() -plt.show() +_ = plt.legend() # %% # Indeed, we can verify that the predictive quality of the model is not degraded From f5ac584476e7843fca4eb52e419e7305d7f91e93 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 2 Aug 2023 16:26:35 +0200 Subject: [PATCH 04/52] Wording --- examples/ensemble/plot_hgbt_regression.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 5f5a2fb6814e6..c8aaa86f71d0b 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -344,12 +344,9 @@ # Keep in mind that one can still improve the calibration of our model by: # # - collecting more data-points (in case the model is overfitting); -# - better tuning of the model hyper-parameters (see -# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`) -# and make sure the model is not over-fitting; -# - engineering more predictive features from the same data. This is especially -# useful for linear quantile regression (not covered in this tutorial); -# - try other kinds of quantile regression models, such as Quantile Forests. +# - better tuning of the model hyperparameters (see +# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`); +# - engineering more predictive features from the same data. # # Monotonic Constraints # ===================== From 353329db25288c97d8cd3c5d2ecb08995f12d830 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 2 Aug 2023 18:02:41 +0200 Subject: [PATCH 05/52] Add cross-links from other examples --- examples/ensemble/plot_adaboost_regression.py | 4 ++++ .../ensemble/plot_forest_hist_grad_boosting_comparison.py | 4 +++- examples/ensemble/plot_gradient_boosting_categorical.py | 4 ++++ examples/ensemble/plot_gradient_boosting_quantile.py | 4 +++- examples/ensemble/plot_gradient_boosting_regression.py | 5 ++++- examples/ensemble/plot_hgbt_regression.py | 3 ++- .../release_highlights/plot_release_highlights_0_23_0.py | 3 ++- examples/release_highlights/plot_release_highlights_1_1_0.py | 3 +++ examples/release_highlights/plot_release_highlights_1_3_0.py | 4 +++- 9 files changed, 28 insertions(+), 6 deletions(-) diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py index c2aa7e558c07d..98d3699ab161c 100644 --- a/examples/ensemble/plot_adaboost_regression.py +++ b/examples/ensemble/plot_adaboost_regression.py @@ -9,6 +9,10 @@ regressor. 
As the number of boosts is increased the regressor can fit more detail. +See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an +example showcasing the benefits of using more robust regressions such as +:class:`~ensemble.HistGradientBoostingRegressor`. + .. [1] `H. Drucker, "Improving Regressors using Boosting Techniques", 1997. `_ diff --git a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py index 0dde24116065d..853caec241491 100644 --- a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py +++ b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py @@ -22,7 +22,9 @@ the predicted value. RFs, on the other hand, are based on bagging and use a majority vote to predict the outcome. -For more information on ensemble models, see the :ref:`User Guide `. +See the :ref:`User Guide ` for more information on ensemble models or +see :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an +example showcasing some other features of HGBT models. """ # Author: Arturo Amor diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py index 0dd0a84243b4d..d9566f19a8214 100644 --- a/examples/ensemble/plot_gradient_boosting_categorical.py +++ b/examples/ensemble/plot_gradient_boosting_categorical.py @@ -21,6 +21,10 @@ We will work with the Ames Lowa Housing dataset which consists of numerical and categorical features, where the houses' sales prices is the target. +See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an +example showcasing some other features of +:class:`~ensemble.HistGradientBoostingRegressor`. + """ # %% diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py index d1464ba92c572..41378db704600 100644 --- a/examples/ensemble/plot_gradient_boosting_quantile.py +++ b/examples/ensemble/plot_gradient_boosting_quantile.py @@ -4,7 +4,9 @@ ===================================================== This example shows how quantile regression can be used to create prediction -intervals. +intervals. See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` +for an example showcasing some other features of +:class:`~ensemble.HistGradientBoostingRegressor`. """ diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index 94705ccfeca24..76437680708be 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -11,7 +11,10 @@ and 500 regression trees of depth 4. Note: For larger datasets (n_samples >= 10000), please refer to -:class:`~sklearn.ensemble.HistGradientBoostingRegressor`. +:class:`~sklearn.ensemble.HistGradientBoostingRegressor`. See +:ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an example +showcasing some other advantages of +:class:`~ensemble.HistGradientBoostingRegressor`. 
""" diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index c8aaa86f71d0b..0040f261ebbbd 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -346,7 +346,8 @@ # - collecting more data-points (in case the model is overfitting); # - better tuning of the model hyperparameters (see # :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`); -# - engineering more predictive features from the same data. +# - engineering more predictive features from the same data (see +# :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`). # # Monotonic Constraints # ===================== diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py index 7c6836632e3f0..7753f8653799e 100644 --- a/examples/release_highlights/plot_release_highlights_0_23_0.py +++ b/examples/release_highlights/plot_release_highlights_0_23_0.py @@ -122,7 +122,8 @@ # specific features. In the following example, we construct a target that is # generally positively correlated with the first feature, with some noise. # Applying monotoinc constraints allows the prediction to capture the global -# effect of the first feature, instead of fitting the noise. +# effect of the first feature, instead of fitting the noise. For a usecase +# example, see :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`. import numpy as np from matplotlib import pyplot as plt from sklearn.model_selection import train_test_split diff --git a/examples/release_highlights/plot_release_highlights_1_1_0.py b/examples/release_highlights/plot_release_highlights_1_1_0.py index 088919565315a..63c22d2f22fe5 100644 --- a/examples/release_highlights/plot_release_highlights_1_1_0.py +++ b/examples/release_highlights/plot_release_highlights_1_1_0.py @@ -53,6 +53,9 @@ ax.plot(X_1d, hist.predict(X), label=quantile) _ = ax.legend(loc="lower left") +# %% +# For a usecase example, see +# :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` # %% # `get_feature_names_out` Available in all Transformers diff --git a/examples/release_highlights/plot_release_highlights_1_3_0.py b/examples/release_highlights/plot_release_highlights_1_3_0.py index 8fa1ea057ac91..993a91d18100a 100644 --- a/examples/release_highlights/plot_release_highlights_1_3_0.py +++ b/examples/release_highlights/plot_release_highlights_1_3_0.py @@ -88,7 +88,9 @@ # :class:`tree.DecisionTreeRegressor` now support missing values. For each potential # threshold on the non-missing data, the splitter will evaluate the split with all the # missing values going to the left node or the right node. -# More details in the :ref:`User Guide `. +# See more details in the :ref:`User Guide ` or see +# :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for a usecase +# example of this feature in :class:`~ensemble.HistGradientBoostingRegressor`. 
import numpy as np from sklearn.tree import DecisionTreeClassifier From 1d56abdda9cc1c5fcdf88170f1dfb3ccc00eabb1 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 2 Aug 2023 18:03:17 +0200 Subject: [PATCH 06/52] Use dictionary to define monotonic_cst --- examples/ensemble/plot_hgbt_regression.py | 25 +++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 0040f261ebbbd..4e7deb0e6eb11 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -365,20 +365,23 @@ # price and demand in New South Wales, but also decrease with price and demand # in Victoria, in order to benefit both populations. # -# To create the monotonic constraints, we use :class:`numpy.select` to assign -# `1` to the positions corresponding to columns "nswdemand" and "nswprice", `-1` -# to the positions corresponding to columns "vicdemand" and "vicprice", and `0` -# elsewhere. We then visualize the partial dependence on said features: +# If the training data has feature names, it’s possible to specify the monotonic +# constraints by passing a dictionary with the convention: +# - 1: monotonic increase +# - 0: no constraint +# - -1: monotonic decrease from sklearn.inspection import PartialDependenceDisplay -conditions = [ - (X.columns == "nswdemand") | (X.columns == "nswprice"), - (X.columns == "vicdemand") | (X.columns == "vicprice"), -] -choices = [1, -1] -monotonic_cst = np.select(conditions, choices, default=0) - +monotonic_cst = { + "date": 0, + "day": 0, + "period": 0, + "nswdemand": 1, + "nswprice": 1, + "vicdemand": -1, + "vicprice": -1, +} gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y) gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y) From ff89b7c968f6e1d5e8e53fd048ea956bba303aa3 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 3 Aug 2023 11:17:32 +0200 Subject: [PATCH 07/52] Add cross-links in the documentation --- doc/modules/ensemble.rst | 8 +++++++- examples/ensemble/plot_hgbt_regression.py | 1 + .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 0585f8289ed55..711cbb6c1f891 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -80,7 +80,8 @@ are not yet supported, for instance some loss functions. .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` + * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` + * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py` Usage ^^^^^ @@ -169,6 +170,10 @@ If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples. +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` + .. _sw_hgbdt: Sample weight support @@ -317,6 +322,7 @@ Also, monotonic constraints are not supported for multiclass classification. .. topic:: Examples: * :ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py` + * :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` .. 
_interaction_cst_hgbt: diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 4e7deb0e6eb11..42e516fb96cbf 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -367,6 +367,7 @@ # # If the training data has feature names, it’s possible to specify the monotonic # constraints by passing a dictionary with the convention: +# # - 1: monotonic increase # - 0: no constraint # - -1: monotonic decrease diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5d030d3add5bb..9d6b22b6519f1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1200,6 +1200,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): assigned to the left or right child consequently. If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples. + See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for a + usecase example of this feature. This implementation is inspired by `LightGBM `_. From 543d2803d11c5851ccd9717f00b430561b78ad43 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 3 Aug 2023 11:27:04 +0200 Subject: [PATCH 08/52] Change title --- examples/ensemble/plot_hgbt_regression.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 42e516fb96cbf..1fedd06f9af21 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -1,7 +1,7 @@ """ -=========================================================== -Decision Tree Regression with HistGradientBoostingRegressor -=========================================================== +============================================================= +Usecase of advanced features in Histogram Boosting Regression +============================================================= :ref:`histogram_based_gradient_boosting` (HGBT) models can be a competitive alternative to random forests, especially when the number of samples is larger From b77ab5c152c669282a42e43b8c155a3c02841038 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:43:37 +0200 Subject: [PATCH 09/52] Apply suggestions from code review Co-authored-by: Christian Lorentzen --- examples/ensemble/plot_hgbt_regression.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 1fedd06f9af21..b07ae7dcca05f 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -3,20 +3,25 @@ Usecase of advanced features in Histogram Boosting Regression ============================================================= -:ref:`histogram_based_gradient_boosting` (HGBT) models can be a competitive -alternative to random forests, especially when the number of samples is larger +:ref:`histogram_based_gradient_boosting` (HGBT) may be the most useful supervised learning model in scikit-learn. It is a modern gradient boosting implementation +comparable to LightGBM and XGBoost. 
As such, it is more feature rich than and often +outperforms alternative models like random forests, especially when the number of samples is larger than tens of thousands of samples (see :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`). -HGBT models have additional advantages such as: +The top usability features of HGBT models are: - :ref:`categorical_support_gbdt` (see :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`). - :ref:`nan_support_hgbt`, which avoids the need for an imputer. - :ref:`Quantile loss support `. - :ref:`monotonic_cst_gbdt`. +- :ref:`_interaction_cst_hgbt`. +- early stopping -This example aims at showcasing the last three points in a real setting. +Note that random forests have none of those capabilities. + +This example aims at showcasing points 2-4 in a real life setting. """ # %% @@ -287,8 +292,8 @@ # Support for quantile loss # ========================= # -# The quantile loss in regression enables a view of the potential variability in -# predictions. For instance, predicting the 5th and 95th percentiles can provide +# The quantile loss in regression enables a view of the variability or uncertainty +# of the target variable. For instance, predicting the 5th and 95th percentiles can provide # a 90% prediction interval, i.e. the range within which we expect the true # value to fall with 90% probability. @@ -349,7 +354,7 @@ # - engineering more predictive features from the same data (see # :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`). # -# Monotonic Constraints +# Monotonic constraints # ===================== # # Given specific domain knowledge that requires the relationship between a From 4689b0f890a6d455e9a74757b88821f0f69ecf90 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 7 Sep 2023 11:52:24 +0200 Subject: [PATCH 10/52] Iter on suggestions from code-review --- examples/ensemble/plot_hgbt_regression.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index b07ae7dcca05f..afa1a5dac953b 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -3,10 +3,12 @@ Usecase of advanced features in Histogram Boosting Regression ============================================================= -:ref:`histogram_based_gradient_boosting` (HGBT) may be the most useful supervised learning model in scikit-learn. It is a modern gradient boosting implementation -comparable to LightGBM and XGBoost. As such, it is more feature rich than and often -outperforms alternative models like random forests, especially when the number of samples is larger -than tens of thousands of samples (see +:ref:`histogram_based_gradient_boosting` (HGBT) models may be the most useful +supervised learning models in scikit-learn. They are based on a modern gradient +boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models +are more feature rich than -and often outperforms- alternative models like +random forests, especially when the number of samples is larger than tens of +thousands of samples (see :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`). The top usability features of HGBT models are: @@ -292,10 +294,10 @@ # Support for quantile loss # ========================= # -# The quantile loss in regression enables a view of the variability or uncertainty -# of the target variable. 
For instance, predicting the 5th and 95th percentiles can provide -# a 90% prediction interval, i.e. the range within which we expect the true -# value to fall with 90% probability. +# The quantile loss in regression enables a view of the variability or +# uncertainty of the target variable. For instance, predicting the 5th and 95th +# percentiles can provide a 90% prediction interval, i.e. the range within which +# we expect the true value to fall with 90% probability. from sklearn.metrics import make_scorer, mean_pinball_loss From 86f8f6785161a6ceea99889d3fd40eb85648f5e5 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Tue, 3 Oct 2023 12:18:51 +0200 Subject: [PATCH 11/52] Remove comment that will no longer be true in v1.4 --- examples/ensemble/plot_hgbt_regression.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index afa1a5dac953b..551ddf1e5ff63 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -21,8 +21,6 @@ - :ref:`_interaction_cst_hgbt`. - early stopping -Note that random forests have none of those capabilities. - This example aims at showcasing points 2-4 in a real life setting. """ From 35c065ad56b8bccf1496c2842ff8cf84d55a8ef6 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Tue, 3 Oct 2023 12:26:55 +0200 Subject: [PATCH 12/52] Address comment from Christian on calibration --- examples/ensemble/plot_hgbt_regression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 551ddf1e5ff63..8e9935b9efa0e 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -346,7 +346,8 @@ _ = ax.legend() # %% -# Keep in mind that one can still improve the calibration of our model by: +# Keep in mind that the predicted percentiles are just estimations that depend +# on the model. One can still improve the quality of such estimations by: # # - collecting more data-points (in case the model is overfitting); # - better tuning of the model hyperparameters (see From c3e01fc768da1aa2a775c45e3655399fcae74878 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Tue, 3 Oct 2023 12:39:03 +0200 Subject: [PATCH 13/52] Address comment from Christian on bias --- examples/ensemble/plot_hgbt_regression.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 8e9935b9efa0e..85eacafd09f37 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -361,8 +361,9 @@ # Given specific domain knowledge that requires the relationship between a # feature and the target to be monotonically increasing or decreasing, one can # enforce such behaviour in the predictions of a HGBT model using monotonic -# constraints. This makes the model more interpretable and prevents overfitting. -# Monotonic constraints can also be used to enforce specific regulatory +# constraints. This makes the model more interpretable and can reduce its +# variance (and potentially mitigate overfitting) at the risk of increasing +# bias. Monotonic constraints can also be used to enforce specific regulatory # requirements, ensure compliance and align with ethical considerations. 
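# As a minimal illustration of that guarantee (synthetic data, illustrative
# values), passing `monotonic_cst=[1]` forces the predictions to be
# non-decreasing in the single constrained feature:

import numpy as np

from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X_toy = rng.uniform(size=(500, 1))
y_toy = 5 * X_toy[:, 0] + rng.normal(scale=0.5, size=500)
model = HistGradientBoostingRegressor(monotonic_cst=[1]).fit(X_toy, y_toy)
grid = np.linspace(0, 1, num=100).reshape(-1, 1)
assert np.all(np.diff(model.predict(grid)) >= 0)  # monotone by construction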
# # In the present example, the policy of transfering energy from Victoria to New From 093b8dd903132e4201f7b8ed3c4f82fba654c888 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:13:14 +0200 Subject: [PATCH 14/52] Apply suggestions from code review Co-authored-by: Christian Lorentzen --- examples/ensemble/plot_hgbt_regression.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 85eacafd09f37..73aacb2a0356f 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -16,7 +16,7 @@ - :ref:`categorical_support_gbdt` (see :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`). - :ref:`nan_support_hgbt`, which avoids the need for an imputer. -- :ref:`Quantile loss support `. +- Support for several losses such as the :ref:`Quantile loss `. - :ref:`monotonic_cst_gbdt`. - :ref:`_interaction_cst_hgbt`. - early stopping @@ -346,10 +346,10 @@ _ = ax.legend() # %% -# Keep in mind that the predicted percentiles are just estimations that depend -# on the model. One can still improve the quality of such estimations by: +# Keep in mind that those predicted percentiles are just estimations from a +# model. One can still improve the quality of such estimations by: # -# - collecting more data-points (in case the model is overfitting); +# - collecting more data-points; # - better tuning of the model hyperparameters (see # :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`); # - engineering more predictive features from the same data (see From ff2888f02ac7dfa4034ffa77204c9f230da80bb2 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 4 Oct 2023 14:15:48 +0200 Subject: [PATCH 15/52] Iter on suggestions --- examples/ensemble/plot_hgbt_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 73aacb2a0356f..df4cf837f8ef4 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -18,7 +18,7 @@ - :ref:`nan_support_hgbt`, which avoids the need for an imputer. - Support for several losses such as the :ref:`Quantile loss `. - :ref:`monotonic_cst_gbdt`. -- :ref:`_interaction_cst_hgbt`. +- :ref:`interaction_cst_hgbt`. - early stopping This example aims at showcasing points 2-4 in a real life setting. 
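The interaction constraints referenced in the feature list above are not otherwise demonstrated in this example, so here is a minimal sketch (synthetic data, illustrative grouping; requires scikit-learn >= 1.2): each inner set passed to `interaction_cst` lists the features that are allowed to interact within a single tree.

import numpy as np

from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X_toy = rng.normal(size=(1_000, 3))
y_toy = X_toy[:, 0] * X_toy[:, 1] + X_toy[:, 2]
# features 0 and 1 may share splits within a tree; feature 2 stays additive
hgbt_ic = HistGradientBoostingRegressor(interaction_cst=[{0, 1}, {2}])
hgbt_ic.fit(X_toy, y_toy)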
From 74719599c77be61883e56331336de624edd975c3 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 6 Oct 2023 14:36:19 +0200 Subject: [PATCH 16/52] Silence warning from DataFrame.groupby --- examples/ensemble/plot_hgbt_regression.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index df4cf837f8ef4..c9c031ea11125 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -90,7 +90,7 @@ max_iter_list = [10, 50] fig, ax = plt.subplots(figsize=(10, 5)) -average_week_demand = df.groupby(["day", "period"])["transfer"].mean() +average_week_demand = df.groupby(["day", "period"], observed=False)["transfer"].mean() average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax) for idx, max_iter in enumerate(max_iter_list): @@ -99,7 +99,9 @@ y_pred = hgbt.predict(X) prediction_df = df.copy() prediction_df["y_pred"] = y_pred - average_pred = prediction_df.groupby(["day", "period"])["y_pred"].mean() + average_pred = prediction_df.groupby(["day", "period"], observed=False)[ + "y_pred" + ].mean() average_pred.plot( color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax ) From 9a486b896a5661ddcc77db31601318303b131310 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 6 Oct 2023 16:04:50 +0200 Subject: [PATCH 17/52] Add discussion on early stopping --- examples/ensemble/plot_hgbt_regression.py | 67 +++++++++++++++++++---- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index c9c031ea11125..7553eb631ede2 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -15,13 +15,13 @@ - :ref:`categorical_support_gbdt` (see :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`). +- Early stopping. - :ref:`nan_support_hgbt`, which avoids the need for an imputer. - Support for several losses such as the :ref:`Quantile loss `. - :ref:`monotonic_cst_gbdt`. - :ref:`interaction_cst_hgbt`. -- early stopping -This example aims at showcasing points 2-4 in a real life setting. +This example aims at showcasing points 2-5 in a real life setting. """ # %% @@ -78,8 +78,8 @@ # %% # Notice energy transfer increases systematically during weekends. # -# Effect of number of trees in HistGradientBoostingRegressor -# ========================================================== +# Effect of number of trees and early stopping +# ============================================ # For the sake of illustrating the effect of the (maximum) number of trees, we # train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the # daily electricity transfer using the whole dataset. Then we visualize its @@ -118,6 +118,52 @@ # With just a few iterations, HGBT models can achieve convergence (see # :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`). # +# Instead of relying solely on `max_iter` to determine when to stop, the HGBT +# implementations in scikit-learn support early stopping. With it, the model +# uses a fraction of the training data as a validation set +# (`validation_fraction`) and stops training if the validation score does not +# improve (or degrades) after `n_iter_no_change` iterations up to a certain +# `tol`. 
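# Those knobs map onto constructor parameters, as in this minimal sketch (the
# values are illustrative, close to the defaults, and not tuned for this
# dataset):

from sklearn.ensemble import HistGradientBoostingRegressor

hgbt_es = HistGradientBoostingRegressor(
    early_stopping=True,  # force early stopping instead of the "auto" heuristic
    validation_fraction=0.2,  # hold out 20% of the training data for validation
    n_iter_no_change=10,  # patience, in boosting iterations
    tol=1e-7,  # minimum score improvement that still counts as progress
)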
+# +# Notice that there is a trade-off between `learning_rate` and `max_iter`: +# Generally, smaller learning rates require more iterations to converge to the +# minimum loss, while larger learning rates might converge faster but are at +# risk of overfitting. +# +# Indeed, a good practice is to tune the learning rate along with any other +# hyperparameters, fit the HBGT on the training set with a large enough value +# for `max_iter` and determine the best `max_iter` via early stopping and some +# explicit `validation_fraction`. + +common_params = { + "max_iter": 1_000, + "learning_rate": 0.3, + "validation_fraction": 0.2, + "random_state": 42, + "scoring": "neg_root_mean_squared_error", +} + +hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params) +hgbt.fit(X, y) +plt.plot(-hgbt.validation_score_) +plt.xlabel("number of iterations") +plt.ylabel("root mean squared error") +_ = plt.title(f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})") + +# %% +# We can then overwrite the value for `max_iter` to a razonable value and avoid +# the extra computational cost of the inner validation. In this case, rounding +# up the number of iterations to 600 may account for variability of the training +# set: + +common_params["max_iter"] = 600 +common_params["early_stopping"] = False +hgbt = HistGradientBoostingRegressor(**common_params) + +# %% +# .. note:: The inner validation done during early stopping is not optimal for +# time series with the implementation as of scikit-learn v1.3. +# # Support for missing values # ========================== # HGBT models have native support of missing values. During training, the tree @@ -148,7 +194,6 @@ fig, ax = plt.subplots(figsize=(12, 6)) ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer") -hgbt = HistGradientBoostingRegressor() for missing_fraction in missing_fraction_list: num_missing_cells = int(total_cells * missing_fraction) @@ -203,7 +248,6 @@ X["vicprice"] = X["vicprice"].mask(mask, np.nan) X["vicdemand"] = X["vicdemand"].mask(mask, np.nan) - hgbt = HistGradientBoostingRegressor() hgbt.fit(X.iloc[train_0], y.iloc[train_0]) hgbt_predictions = hgbt.predict(X.iloc[test_0]) cv_results = cross_validate( @@ -262,7 +306,6 @@ X["vicprice"] = X["vicprice"].mask(mask, np.nan) X["vicdemand"] = X["vicdemand"].mask(mask, np.nan) - hgbt = HistGradientBoostingRegressor() hgbt.fit(X.iloc[train_0], y.iloc[train_0]) hgbt_predictions = hgbt.predict(X.iloc[test_0]) cv_results = cross_validate( @@ -309,13 +352,15 @@ ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer") for quantile in quantiles: - hgbt = HistGradientBoostingRegressor(loss="quantile", quantile=quantile) - hgbt.fit(X.iloc[train_0], y.iloc[train_0]) - hgbt_predictions = hgbt.predict(X.iloc[test_0]) + hgbt_quantile = HistGradientBoostingRegressor( + loss="quantile", quantile=quantile, **common_params + ) + hgbt_quantile.fit(X.iloc[train_0], y.iloc[train_0]) + hgbt_predictions = hgbt_quantile.predict(X.iloc[test_0]) predictions.append(hgbt_predictions) cv_results = cross_validate( - hgbt, + hgbt_quantile, X, y, cv=ts_cv, From 822f3db0a0b4ae538823215242890b8910708bf9 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 6 Oct 2023 16:05:20 +0200 Subject: [PATCH 18/52] Wording --- examples/ensemble/plot_hgbt_regression.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 7553eb631ede2..706532edd5d86 100644 --- 
a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -425,6 +425,8 @@ # - 1: monotonic increase # - 0: no constraint # - -1: monotonic decrease +# +# Else, one can pass an array-like encoding the above convention by position. from sklearn.inspection import PartialDependenceDisplay @@ -472,8 +474,8 @@ _ = plt.legend() # %% -# Indeed, we can verify that the predictive quality of the model is not degraded -# by introducing the monotonic constraints: +# Indeed, we can verify that the predictive quality of the model is not +# significantly degraded by introducing the monotonic constraints: cv_results = cross_validate( gbdt_no_cst, @@ -494,3 +496,8 @@ ) rmse = -cv_results["test_score"] print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}") + +# %% +# That being said, notice the comparison is between to different models that may +# be optimized by a different combination of hyperparameters. That is the reason +# why we do no use the `common_params` in this section as done before. From 97cf6426a566a241dc72b28477c19a63f4ed7360 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 6 Oct 2023 16:10:13 +0200 Subject: [PATCH 19/52] Rename instances of hgbt --- examples/ensemble/plot_hgbt_regression.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index 706532edd5d86..a8d50a5a4f8a7 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -439,33 +439,33 @@ "vicdemand": -1, "vicprice": -1, } -gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y) -gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y) +hgbt_no_cst = HistGradientBoostingRegressor().fit(X, y) +hgbt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y) fig, ax = plt.subplots(nrows=2, figsize=(15, 10)) disp = PartialDependenceDisplay.from_estimator( - gbdt_no_cst, + hgbt_no_cst, X, features=["nswdemand", "nswprice"], line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"}, ax=ax[0], ) PartialDependenceDisplay.from_estimator( - gbdt_cst, + hgbt_cst, X, features=["nswdemand", "nswprice"], line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"}, ax=disp.axes_, ) disp = PartialDependenceDisplay.from_estimator( - gbdt_no_cst, + hgbt_no_cst, X, features=["vicdemand", "vicprice"], line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"}, ax=ax[1], ) PartialDependenceDisplay.from_estimator( - gbdt_cst, + hgbt_cst, X, features=["vicdemand", "vicprice"], line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"}, @@ -478,7 +478,7 @@ # significantly degraded by introducing the monotonic constraints: cv_results = cross_validate( - gbdt_no_cst, + hgbt_no_cst, X, y, cv=ts_cv, @@ -488,7 +488,7 @@ print(f"RMSE without constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}") cv_results = cross_validate( - gbdt_cst, + hgbt_cst, X, y, cv=ts_cv, From 60d8f6118b87cfbd3e336e773a28f59de05714b4 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 6 Oct 2023 16:17:08 +0200 Subject: [PATCH 20/52] Remove distinction on type of missingness --- examples/ensemble/plot_hgbt_regression.py | 114 +--------------------- 1 file changed, 4 insertions(+), 110 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index a8d50a5a4f8a7..cde47747de12d 100644 --- 
From 60d8f6118b87cfbd3e336e773a28f59de05714b4 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 16:17:08 +0200
Subject: [PATCH 20/52] Remove distinction on type of missingness

---
 examples/ensemble/plot_hgbt_regression.py | 114 +---------------------
 1 file changed, 4 insertions(+), 110 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index a8d50a5a4f8a7..cde47747de12d 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -173,12 +173,10 @@
 # values during training, samples with missing values for that feature are sent
 # to the child with the most samples.
 #
-# Missing Completely At Random (MCAR)
-# -----------------------------------
-#
-# The missingness does not depend on the observed data or the unobserved data.
-# It's completely random. We can simulate such scenario by randomly replacing
-# values from randomly selected features with `Nan` values.
+# The present example shows how HGBT regressions deal with values missing
+# completely at random (MCAR), i.e. the missingness does not depend on the
+# observed data or the unobserved data. We can simulate such scenario by
+# randomly replacing values from randomly selected features with `Nan` values.
 
 import numpy as np
 
@@ -229,110 +227,6 @@
 )
 _ = ax.legend()
 
-# %%
-# Missing At Random (MAR)
-# -----------------------
-#
-# The missingness depends on the observed data but never on unobserved data.
-# Here, the missingness in "vicdemand" is set to depend on the value of the
-# observed feature "nswprice".
-
-missing_fraction_list = [0, 0.5, 1.0]
-
-fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
-
-for missing_fraction in missing_fraction_list:
-    X = df.drop(columns=["transfer", "class"])
-    mask = X["nswprice"] < X["nswprice"].quantile(missing_fraction)
-    X["vicprice"] = X["vicprice"].mask(mask, np.nan)
-    X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
-
-    hgbt.fit(X.iloc[train_0], y.iloc[train_0])
-    hgbt_predictions = hgbt.predict(X.iloc[test_0])
-    cv_results = cross_validate(
-        hgbt,
-        X,
-        y,
-        cv=ts_cv,
-        scoring="neg_root_mean_squared_error",
-    )
-    rmse = -cv_results["test_score"]
-    ax.plot(
-        hgbt_predictions[last_days],
-        label=(
-            f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
-            f" {rmse.std():.2f}"
-        ),
-        alpha=0.5,
-    )
-ax.set(
-    title="Daily energy transfer predictions on data with MAR values",
-    xticks=[(i + 0.25) * 48 for i in range(4)],
-    xticklabels=["Tue", "Wed", "Thu", "Fri"],
-    xlabel="Time of the week",
-    ylabel="Normalized energy transfer",
-)
-_ = ax.legend()
-
-# %%
-# In this case the features are highly correlated and therefore MAR values
-# do not degrade the predictivity of the model even when completely removing
-# the feature "vicprice".
-#
-# Missing Not At Random (MNAR)
-# ----------------------------
-#
-# The missingness depends on the unobserved data. In particular, if the
-# probability of a value being missing in a variable is dependent on the values
-# of that variable itself. Here, we set the missingness to depend on the
-# unobserved feature "class".
-
-import pandas as pd
-
-fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
-
-for missing_fraction in missing_fraction_list:
-    X = df.drop(columns=["transfer", "class"])  # reset X
-    mask = df["class"] == "DOWN"
-    true_indices = mask[mask].index
-    n_keep = int(len(true_indices) * missing_fraction)
-    keep_indices = np.random.choice(true_indices, size=n_keep, replace=False)
-    mask = pd.Series(False, index=mask.index)
-
-    # Set the randomly selected true indices to True in the new mask
-    mask.loc[keep_indices] = True
-    X["vicprice"] = X["vicprice"].mask(mask, np.nan)
-    X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
-
-    hgbt.fit(X.iloc[train_0], y.iloc[train_0])
-    hgbt_predictions = hgbt.predict(X.iloc[test_0])
-    cv_results = cross_validate(
-        hgbt,
-        X,
-        y,
-        cv=ts_cv,
-        scoring="neg_root_mean_squared_error",
-    )
-    rmse = -cv_results["test_score"]
-    ax.plot(
-        hgbt_predictions[last_days],
-        label=(
-            f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
-            f" {rmse.std():.2f}"
-        ),
-        alpha=0.5,
-    )
-ax.set(
-    title="Daily energy transfer predictions on data with MNAR values",
-    xticks=[(i + 0.25) * 48 for i in range(4)],
-    xticklabels=["Tue", "Wed", "Thu", "Fri"],
-    xlabel="Time of the week",
-    ylabel="Normalized energy transfer",
-)
-_ = ax.legend()
-
 # %%
 # Support for quantile loss
 # =========================

From 8799932ad3bd8e19827bf3215adb73aafb7ce994 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Thu, 2 Nov 2023 11:17:11 +0100
Subject: [PATCH 21/52] Apply suggestions from code review

Co-authored-by: Guillaume Lemaitre

---
 examples/ensemble/plot_hgbt_regression.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index cde47747de12d..6e548a54329ad 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -24,11 +24,11 @@
 This example aims at showcasing points 2-5 in a real life setting.
 """
 
-# %%
 # Author: Arturo Amor
-#
 # License: BSD 3 clause
-#
+
+# %%
 # Preparing the data
 # ==================
 # The `electricity dataset `_ consists of data
@@ -40,7 +40,7 @@
 # The dataset (originally named ELEC2) contains 45,312 instances dated from 7
 # May 1996 to 5 December 1998. Each example of the dataset refers to a period of
 # 30 minutes, i.e. there are 48 instances for each time period of one day. Each
-# example on the dataset has 5 fields, the day of week, the time stamp, the New
+# example on the dataset has 5 fields: the day of week, the time stamp, the New
 # South Wales electricity demand, the Victoria electricity demand. It is
 # originally a classification task, but here we use it as a regression where the
 # target is the scheduled electricity transfer between states.
@@ -151,7 +151,7 @@
 _ = plt.title(f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})")
 
 # %%
-# We can then overwrite the value for `max_iter` to a razonable value and avoid
+# We can then overwrite the value for `max_iter` to a reasonable value and avoid
 # the extra computational cost of the inner validation. In this case, rounding
 # up the number of iterations to 600 may account for variability of the training
 # set:

From c3c883cbb32d3309229f4c54a9239633d8cabac0 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 2 Nov 2023 11:20:06 +0100
Subject: [PATCH 22/52] Use numbered list

---
 examples/ensemble/plot_hgbt_regression.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 6e548a54329ad..70f063c73b9be 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -13,13 +13,14 @@
 
 The top usability features of HGBT models are:
 
-- :ref:`categorical_support_gbdt` (see
+1. :ref:`categorical_support_gbdt` (see
    :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
-- Early stopping.
-- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-- Support for several losses such as the :ref:`Quantile loss `.
-- :ref:`monotonic_cst_gbdt`.
-- :ref:`interaction_cst_hgbt`.
+1. Early stopping.
+1. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+1. Support for several losses such as the :ref:`Quantile loss
+   `.
+1. :ref:`monotonic_cst_gbdt`.
+1. :ref:`interaction_cst_hgbt`.
 
 This example aims at showcasing points 2-5 in a real life setting.
 """

From 26ddf3baff54384447621abab683dc34c711d71e Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 9 Nov 2023 16:29:27 +0100
Subject: [PATCH 23/52] Prefer lineplot instead of pairplot

---
 examples/ensemble/plot_hgbt_regression.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 70f063c73b9be..a9d4c35b8dc39 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -65,13 +65,11 @@
 colors = sns.color_palette("colorblind")
 
 fig, ax = plt.subplots(figsize=(15, 10))
-pointplot = sns.pointplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
+pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
 handles, lables = ax.get_legend_handles_labels()
 ax.set(
     title="Hourly energy transfer for different days of the week",
-    xticks=[i * 2 for i in range(24)],
-    xticklabels=list(range(24)),
-    xlabel="Time of the day",
+    xlabel="Normalized time of the day",
     ylabel="Normalized energy transfer",
 )
 _ = ax.legend(handles, ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])

From 4d700387724bc7d7d37645c994d52734227831a6 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 9 Nov 2023 16:31:34 +0100
Subject: [PATCH 24/52] Prefer sample over example

---
 examples/ensemble/plot_hgbt_regression.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index a9d4c35b8dc39..9d215892c744d 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -39,9 +39,9 @@
 # Victoria were done to alleviate fluctuations.
 #
 # The dataset (originally named ELEC2) contains 45,312 instances dated from 7
-# May 1996 to 5 December 1998. Each example of the dataset refers to a period of
+# May 1996 to 5 December 1998. Each sample of the dataset refers to a period of
 # 30 minutes, i.e. there are 48 instances for each time period of one day. Each
-# example on the dataset has 5 fields: the day of week, the time stamp, the New
+# sample on the dataset has 5 fields: the day of week, the time stamp, the New
 # South Wales electricity demand, the Victoria electricity demand. It is
 # originally a classification task, but here we use it as a regression where the
 # target is the scheduled electricity transfer between states.

From 5b0dcfd175b80d13a6203172a1cbd713f0c3c3ca Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 10 Nov 2023 11:51:20 +0100
Subject: [PATCH 25/52] Remove stepwise constant piece of dataset

---
 examples/ensemble/plot_hgbt_regression.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 9d215892c744d..e79f19ceb1335 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -57,13 +57,23 @@
 X
 
 # %%
-# Let us explore the hourly electricity transfer over different days of the week:
+# This particular dataset has a stepwise constant target for the first 17,760
+# samples:
+
+y[:17760].unique()
+
+# %%
+# Let us drop those entries and explore the hourly electricity transfer over
+# different days of the week:
 
 import matplotlib.pyplot as plt
 import seaborn as sns
 
 colors = sns.color_palette("colorblind")
 
+X = X.iloc[17760:]
+y = y.iloc[17760:]
+
 fig, ax = plt.subplots(figsize=(15, 10))
 pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
 handles, lables = ax.get_legend_handles_labels()

From 29146ae8e63dbed075b103847158653a1e24c62f Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 10 Nov 2023 11:52:25 +0100
Subject: [PATCH 26/52] Plot predictions on unseen data

---
 examples/ensemble/plot_hgbt_regression.py | 25 +++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index e79f19ceb1335..0babb87f56b4e 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -91,21 +91,29 @@
 # For the sake of illustrating the effect of the (maximum) number of trees, we
 # train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the
 # daily electricity transfer using the whole dataset. Then we visualize its
-# predictions depending on the `max_iter` parameter.
+# predictions depending on the `max_iter` parameter. Here we don't try to
+# evaluate the performance of the model and its capacity to generalize but
+# rather its capacity to learn from the training data.
 
 from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.model_selection import train_test_split
 
-max_iter_list = [10, 50]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
+
+max_iter_list = [5, 50]
 
 fig, ax = plt.subplots(figsize=(10, 5))
-average_week_demand = df.groupby(["day", "period"])["transfer"].mean()
-average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax)
+average_week_demand = (
+    df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
+)
+average_week_demand.plot(color=colors[0], label="ground truth", linewidth=2, ax=ax)
 
 for idx, max_iter in enumerate(max_iter_list):
     hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
-    hgbt.fit(X, y)
-    y_pred = hgbt.predict(X)
-    prediction_df = df.copy()
+    hgbt.fit(X_train, y_train)
+
+    y_pred = hgbt.predict(X_test)
+    prediction_df = df.loc[X_test.index].copy()
     prediction_df["y_pred"] = y_pred
     average_pred = prediction_df.groupby(["day", "period"], observed=False)[
         "y_pred"
     ].mean()
     average_pred.plot(
         color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax
     )
+
 ax.set(
-    title="Average daily energy transfer during the week",
+    title="Predicted average energy transfer during the week",
     xticks=[(i + 0.2) * 48 for i in range(7)],
     xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
     xlabel="Time of the week",
     ylabel="Normalized energy transfer",
 )
 _ = ax.legend()
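A side note on the `shuffle=False` introduced above: with time-ordered records, a shuffled split would leak future information into the training set. A tiny illustrative check (toy data, not part of the patch):

import pandas as pd
from sklearn.model_selection import train_test_split

s = pd.Series(range(10))  # stand-in for a time-indexed target
train, test = train_test_split(s, test_size=0.4, shuffle=False)
# True: every test sample lies strictly "in the future" of the training data.
print(train.index.max() < test.index.min())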
From 25978aeff96746c6a01fd1efdc8e68f18167aac3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 12:01:40 +0100
Subject: [PATCH 27/52] Refactor code

---
 examples/ensemble/plot_hgbt_regression.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 0babb87f56b4e..a01806845bc0f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -68,10 +68,9 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 
-colors = sns.color_palette("colorblind")
-
-X = X.iloc[17760:]
-y = y.iloc[17760:]
+df = electricity.frame.iloc[17760:]
+X = df.drop(columns=["transfer", "class"])
+y = df["transfer"]
 
 fig, ax = plt.subplots(figsize=(15, 10))
 pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
 handles, lables = ax.get_legend_handles_labels()
@@ -98,14 +97,14 @@
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.model_selection import train_test_split
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
-
-fig, ax = plt.subplots(figsize=(10, 5))
+max_iter_list = [5, 50]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
 average_week_demand = (
     df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
 )
+
+colors = sns.color_palette("colorblind")
+fig, ax = plt.subplots(figsize=(10, 5))
 average_week_demand.plot(color=colors[0], label="ground truth", linewidth=2, ax=ax)

From 16a19b124f8475c0ceaab24e919e8add3b9e8d21 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 12:02:17 +0100
Subject: [PATCH 28/52] Use train set for determining max_iter

---
 examples/ensemble/plot_hgbt_regression.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index a01806845bc0f..c4ddadd9fffdf 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -160,7 +160,7 @@
 }
 
 hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
-hgbt.fit(X, y)
+hgbt.fit(X_train, y_train)
 plt.plot(-hgbt.validation_score_)
 plt.xlabel("number of iterations")
 plt.ylabel("root mean squared error")
@@ -169,10 +169,10 @@
 # %%
 # We can then overwrite the value for `max_iter` to a reasonable value and avoid
 # the extra computational cost of the inner validation. In this case, rounding
-# up the number of iterations to 600 may account for variability of the training
+# up the number of iterations to 400 may account for variability of the training
 # set:
 
-common_params["max_iter"] = 600
+common_params["max_iter"] = 400
 common_params["early_stopping"] = False
 hgbt = HistGradientBoostingRegressor(**common_params)

From 70c021f2279b8d202be5b3cd6a68d88eee9dbf38 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 16:38:03 +0100
Subject: [PATCH 29/52] Use test set for plots and add generate_missing_values
 function

---
 examples/ensemble/plot_hgbt_regression.py | 101 +++++++++-------------
 1 file changed, 43 insertions(+), 58 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index c4ddadd9fffdf..607041397e38f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -196,52 +196,44 @@
 
 import numpy as np
 
-from sklearn.model_selection import TimeSeriesSplit, cross_validate
+from sklearn.metrics import root_mean_squared_error
+
+rng = np.random.RandomState(42)
+first_week = slice(0, 336)  # first week in the test set as 7 * 48 = 336
+missing_fraction_list = [0, 0.02, 0.05]
 
-np.random.seed(42)
-ts_cv = TimeSeriesSplit(n_splits=5, gap=48, max_train_size=10000, test_size=1000)
-train_0, test_0 = next(ts_cv.split(df))
-last_days = slice(-192, None)
-total_cells = X.shape[0] * X.shape[1]
-missing_fraction_list = [0, 0.01, 0.03]
 
+def generate_missing_values(X, missing_fraction):
+    total_cells = X.shape[0] * X.shape[1]
+    num_missing_cells = int(total_cells * missing_fraction)
+    row_indices = rng.choice(X.shape[0], num_missing_cells, replace=True)
+    col_indices = rng.choice(X.shape[1], num_missing_cells, replace=True)
+    X_missing = X.copy()
+    X_missing.iloc[row_indices, col_indices] = np.nan
+    return X_missing
+
 
 fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+ax.plot(y_test.values[first_week], label="Actual transfer")
 
 for missing_fraction in missing_fraction_list:
-    num_missing_cells = int(total_cells * missing_fraction)
-    row_indices = np.random.choice(X.shape[0], num_missing_cells, replace=True)
-    col_indices = np.random.choice(X.shape[1], num_missing_cells, replace=True)
-    X = df.drop(columns=["transfer", "class"])
-    X.iloc[row_indices, col_indices] = np.nan
-
-    hgbt.fit(X.iloc[train_0], y.iloc[train_0])
-    hgbt_predictions = hgbt.predict(X.iloc[test_0])
-    cv_results = cross_validate(
-        hgbt,
-        X,
-        y,
-        cv=ts_cv,
-        scoring="neg_root_mean_squared_error",
-    )
-    rmse = -cv_results["test_score"]
+    X_missing = generate_missing_values(X_train, missing_fraction)
+    hgbt.fit(X_missing, y_train)
+    y_pred = hgbt.predict(X_test[first_week])
+    rmse = root_mean_squared_error(y_test[first_week], y_pred)
     ax.plot(
-        hgbt_predictions[last_days],
-        label=(
-            f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
-            f" {rmse.std():.2f}"
-        ),
+        y_pred[first_week],
+        label=f"missing_fraction={missing_fraction}, RMSE={rmse:.2f}",
         alpha=0.5,
     )
 ax.set(
     title="Daily energy transfer predictions on data with MCAR values",
-    xticks=[(i + 0.25) * 48 for i in range(4)],
-    xticklabels=["Tue", "Wed", "Thu", "Fri"],
+    xticks=[(i + 0.2) * 48 for i in range(7)],
+    xticklabels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
     xlabel="Time of the week",
     ylabel="Normalized energy transfer",
 )
-_ = ax.legend()
+_ = ax.legend(loc="lower right")
 
 # %%
 # Support for quantile loss
@@ -252,55 +244,44 @@
 # percentiles can provide a 90% prediction interval, i.e. the range within which
 # we expect the true value to fall with 90% probability.
 
-from sklearn.metrics import make_scorer, mean_pinball_loss
+from sklearn.metrics import mean_pinball_loss
 
 quantiles = [0.95, 0.05]
 predictions = []
-X = df.drop(columns=["transfer", "class"])  # reset X
 
 fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+ax.plot(y_test.values[first_week], label="Actual transfer")
 
 for quantile in quantiles:
     hgbt_quantile = HistGradientBoostingRegressor(
         loss="quantile", quantile=quantile, **common_params
     )
-    hgbt_quantile.fit(X.iloc[train_0], y.iloc[train_0])
-    hgbt_predictions = hgbt_quantile.predict(X.iloc[test_0])
-
-    predictions.append(hgbt_predictions)
-    cv_results = cross_validate(
-        hgbt_quantile,
-        X,
-        y,
-        cv=ts_cv,
-        scoring=make_scorer(mean_pinball_loss, alpha=quantile),
-    )
-    score = cv_results["test_score"]
+    hgbt_quantile.fit(X_train, y_train)
+    y_pred = hgbt_quantile.predict(X_test[first_week])
+
+    predictions.append(y_pred)
+    score = mean_pinball_loss(y_test[first_week], y_pred)
     ax.plot(
-        hgbt_predictions[last_days],
-        label=(
-            f"quantile={quantile}, pinball loss={score.mean():.3f} +/-"
-            f" {score.std():.3f}"
-        ),
+        y_pred[first_week],
+        label=f"quantile={quantile}, pinball loss={score:.2f}",
         alpha=0.5,
     )
 
 ax.fill_between(
-    range(len(predictions[0][last_days])),
-    predictions[0][last_days],
-    predictions[1][last_days],
+    range(len(predictions[0][first_week])),
+    predictions[0][first_week],
+    predictions[1][first_week],
     color=colors[0],
     alpha=0.1,
 )
 ax.set(
     title="Daily energy transfer predictions with quantile loss",
-    xticks=[(i + 0.25) * 48 for i in range(4)],
-    xticklabels=["Tue", "Wed", "Thu", "Fri"],
+    xticks=[(i + 0.2) * 48 for i in range(7)],
+    xticklabels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
     xlabel="Time of the week",
     ylabel="Normalized energy transfer",
 )
-_ = ax.legend()
+_ = ax.legend(loc="lower right")
 
 # %%
 # Keep in mind that those predicted percentiles are just estimations from a
@@ -387,6 +368,10 @@
 # Indeed, we can verify that the predictive quality of the model is not
 # significantly degraded by introducing the monotonic constraints:
 
+from sklearn.model_selection import TimeSeriesSplit, cross_validate
+
+ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)  # a week has 336 samples
+
 cv_results = cross_validate(
     hgbt_no_cst,
     X,

From 5cf52c27b3a328a0e65638036e9bd5db470472a7 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 16:44:53 +0100
Subject: [PATCH 30/52] Reference the problem of coverage

---
 examples/ensemble/plot_gradient_boosting_quantile.py | 1 +
 examples/ensemble/plot_hgbt_regression.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py
index 41378db704600..a01f0d2d1e8b6 100644
--- a/examples/ensemble/plot_gradient_boosting_quantile.py
+++ b/examples/ensemble/plot_gradient_boosting_quantile.py
@@ -192,6 +192,7 @@ def highlight_min(x):
 # (underestimation for this asymmetric noise) but is also naturally robust to
 # outliers and overfits less.
 #
+# .. _calibration-section:
 # Calibration of the confidence interval
 # --------------------------------------
 #
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 607041397e38f..66ebc598c40b5 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -284,6 +284,9 @@ def generate_missing_values(X, missing_fraction):
 _ = ax.legend(loc="lower right")
 
 # %%
+# We observe a tendency to over-estimate the energy transfer. This could be
+# quantitatively confirmed by computing empirical coverage numbers as done in
+# the :ref:`calibration of confidence intervals section `.
 # Keep in mind that those predicted percentiles are just estimations from a
 # model. One can still improve the quality of such estimations by:

From 214a0838c2ae182b351dd1f9a25791a41ec9babe Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 16:46:33 +0100
Subject: [PATCH 31/52] Fix typo

---
 examples/ensemble/plot_hgbt_regression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 66ebc598c40b5..19f14cc3551f9 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -396,6 +396,6 @@ def generate_missing_values(X, missing_fraction):
 print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
 
 # %%
-# That being said, notice the comparison is between to different models that may
+# That being said, notice the comparison is between two different models that may
 # be optimized by a different combination of hyperparameters. That is the reason
 # why we do not use the `common_params` in this section as done before.
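The empirical coverage computation referenced in PATCH 30 can be sketched in a few lines. This is a hedged illustration, not code from the PR; `y_low` and `y_high` stand for the 5th and 95th percentile predictions on held-out data:

import numpy as np

def empirical_coverage(y_true, y_low, y_high):
    """Fraction of observations falling inside the predicted interval."""
    y_true = np.asarray(y_true)
    return np.mean((y_true >= np.asarray(y_low)) & (y_true <= np.asarray(y_high)))

# For a well-calibrated 90% interval (5th to 95th percentile), the returned
# value should be close to 0.9 on test data; a markedly larger or smaller
# value signals mis-calibration.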
From 64ff62960b53b36aea06a831ff76258b62258d94 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:45:34 +0100
Subject: [PATCH 32/52] Apply suggestions from code review

Co-authored-by: Guillaume Lemaitre

---
 examples/ensemble/plot_hgbt_regression.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 19f14cc3551f9..fc8b07cce518a 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -14,11 +14,10 @@
 The top usability features of HGBT models are:
 
 1. :ref:`categorical_support_gbdt` (see
-  :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+   :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
 1. Early stopping.
 1. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-1. Support for several losses such as the :ref:`Quantile loss
-   `.
+1. Support for several losses such as the :ref:`Quantile loss `.
 1. :ref:`monotonic_cst_gbdt`.
 1. :ref:`interaction_cst_hgbt`.
 
@@ -59,7 +58,7 @@
 # This particular dataset has a stepwise constant target for the first 17,760
 # samples:
 
-y[:17760].unique()
+y[:17_760].unique()
 
 # %%
 # Let us drop those entries and explore the hourly electricity transfer over
@@ -192,7 +191,7 @@
 # The present example shows how HGBT regressions deal with values missing
 # completely at random (MCAR), i.e. the missingness does not depend on the
 # observed data or the unobserved data. We can simulate such scenario by
-# randomly replacing values from randomly selected features with `Nan` values.
+# randomly replacing values from randomly selected features with `nan` values.

From 604283e0e7b1e6dd10a0db3376f0229f8a23def7 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 14 Nov 2023 17:13:20 +0100
Subject: [PATCH 33/52] Prefer ax instead of plt to plot

---
 examples/ensemble/plot_hgbt_regression.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index fc8b07cce518a..1354751f6ba83 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -67,7 +67,7 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 
-df = electricity.frame.iloc[17760:]
+df = electricity.frame.iloc[17_760:]
 X = df.drop(columns=["transfer", "class"])
 y = df["transfer"]
 
@@ -160,10 +160,15 @@
 
 hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
 hgbt.fit(X_train, y_train)
+
+_, ax = plt.subplots()
 plt.plot(-hgbt.validation_score_)
-plt.xlabel("number of iterations")
-plt.ylabel("root mean squared error")
-_ = plt.title(f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})")
+ax.set(
+    xlabel="number of iterations",
+    ylabel="root mean squared error",
+    title=f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})",
+)
+_ = ax.legend()

From 11d165c614a5fd34b8854c91e4cc94f5de44fe3e Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 14 Nov 2023 17:31:12 +0100
Subject: [PATCH 34/52] Add brief interpretation of plot

---
 examples/ensemble/plot_hgbt_regression.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 1354751f6ba83..54d5bf35c9f07 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -163,12 +163,11 @@
 
 _, ax = plt.subplots()
 plt.plot(-hgbt.validation_score_)
-ax.set(
+_ = ax.set(
     xlabel="number of iterations",
     ylabel="root mean squared error",
     title=f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})",
 )
-_ = ax.legend()
 
 # %%
@@ -203,7 +203,7 @@
 rng = np.random.RandomState(42)
 first_week = slice(0, 336)  # first week in the test set as 7 * 48 = 336
-missing_fraction_list = [0, 0.02, 0.05]
+missing_fraction_list = [0, 0.01, 0.03]
 
 
 def generate_missing_values(X, missing_fraction):
@@ -221,9 +220,10 @@ def generate_missing_values(X, missing_fraction):
 ax.plot(y_test.values[first_week], label="Actual transfer")
 
 for missing_fraction in missing_fraction_list:
-    X_missing = generate_missing_values(X_train, missing_fraction)
-    hgbt.fit(X_missing, y_train)
-    y_pred = hgbt.predict(X_test[first_week])
+    X_train_missing = generate_missing_values(X_train, missing_fraction)
+    X_test_missing = generate_missing_values(X_test, missing_fraction)
+    hgbt.fit(X_train_missing, y_train)
+    y_pred = hgbt.predict(X_test_missing[first_week])
     rmse = root_mean_squared_error(y_test[first_week], y_pred)
     ax.plot(
         y_pred[first_week],
@@ -240,6 +240,8 @@ def generate_missing_values(X, missing_fraction):
 _ = ax.legend(loc="lower right")
 
 # %%
+# As expected, the model degrades as the proportion of missing values increases.
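To underline the point of the MCAR hunks above, that no imputer is needed, here is a hedged toy check on synthetic data (illustrative only, not part of the patch):

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X_toy = rng.normal(size=(200, 3))
y_toy = X_toy[:, 0] + rng.normal(scale=0.1, size=200)
X_toy[rng.uniform(size=X_toy.shape) < 0.1] = np.nan  # roughly 10% MCAR cells

# Fits without any imputation step; each split learns which side NaNs go to.
print(HistGradientBoostingRegressor().fit(X_toy, y_toy).predict(X_toy[:3]))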
+#
 # Support for quantile loss
 # =========================

From 3abb0c4658159a0ec2746b2a74dc54ac38d1f73b Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 21 Nov 2023 17:19:46 +0100
Subject: [PATCH 35/52] Revert use of numbered list

---
 examples/ensemble/plot_hgbt_regression.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 54d5bf35c9f07..dd69757b5ed35 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -13,13 +13,13 @@
 
 The top usability features of HGBT models are:
 
-1. :ref:`categorical_support_gbdt` (see
-   :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
-1. Early stopping.
-1. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-1. Support for several losses such as the :ref:`Quantile loss `.
-1. :ref:`monotonic_cst_gbdt`.
-1. :ref:`interaction_cst_hgbt`.
+- :ref:`categorical_support_gbdt` (see
+  :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+- Early stopping.
+- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+- Support for several losses such as the :ref:`Quantile loss `.
+- :ref:`monotonic_cst_gbdt`.
+- :ref:`interaction_cst_hgbt`.
 
 This example aims at showcasing points 2-5 in a real life setting.
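Since the quantile-loss hunks above lean on `mean_pinball_loss`, a quick reminder of what it computes may help the reader; this is a hedged re-derivation, not code from the patch:

import numpy as np
from sklearn.metrics import mean_pinball_loss

y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.5, 1.5, 1.5])
alpha = 0.95

# Pinball loss penalizes under-prediction by alpha and over-prediction by
# (1 - alpha): loss = mean(max(alpha * d, (alpha - 1) * d)) with d = y - y_hat.
diff = y_true - y_pred
by_hand = np.mean(np.maximum(alpha * diff, (alpha - 1) * diff))
assert np.isclose(by_hand, mean_pinball_loss(y_true, y_pred, alpha=alpha))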
From 7c8406820dfaeeb11f39dae1b2226860d183e78a Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Sat, 6 Jan 2024 20:22:17 +0100
Subject: [PATCH 36/52] Apply suggestions from code review

Co-authored-by: Christian Lorentzen

---
 examples/ensemble/plot_hgbt_regression.py | 58 +++++++++++------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index dd69757b5ed35..af43d8ee2ae24 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -1,27 +1,27 @@
 """
-=============================================================
-Usecase of advanced features in Histogram Boosting Regression
-=============================================================
+===================================================================
+Use cases of advanced features in Histogram Gradient Boosting Trees
+===================================================================
 
-:ref:`histogram_based_gradient_boosting` (HGBT) models may be the most useful
+:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most useful
 supervised learning models in scikit-learn. They are based on a modern gradient
 boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models
-are more feature rich than -and often outperforms- alternative models like
-random forests, especially when the number of samples is larger than tens of
-thousands of samples (see
+are more feature rich than and often outperform alternative models like
+random forests, especially when the number of samples is larger than some ten
+thousands (see
 :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
 
 The top usability features of HGBT models are:
 
-- :ref:`categorical_support_gbdt` (see
+1. Several available loss functions for mean and quantile regression tasks, see :ref:`Quantile loss `.
+2. :ref:`categorical_support_gbdt` (see
   :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
-- Early stopping.
# # Support for missing values # ========================== @@ -227,7 +227,7 @@ def generate_missing_values(X, missing_fraction): rmse = root_mean_squared_error(y_test[first_week], y_pred) ax.plot( y_pred[first_week], - label=f"missing_fraction={missing_fraction}, RMSE={rmse:.2f}", + label=f"missing_fraction={missing_fraction}, RMSE={rmse:.3f}", alpha=0.5, ) ax.set( @@ -248,7 +248,7 @@ def generate_missing_values(X, missing_fraction): # The quantile loss in regression enables a view of the variability or # uncertainty of the target variable. For instance, predicting the 5th and 95th # percentiles can provide a 90% prediction interval, i.e. the range within which -# we expect the true value to fall with 90% probability. +# we expect a new observed value to fall with 90% probability. from sklearn.metrics import mean_pinball_loss @@ -389,7 +389,7 @@ def generate_missing_values(X, missing_fraction): scoring="neg_root_mean_squared_error", ) rmse = -cv_results["test_score"] -print(f"RMSE without constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}") +print(f"RMSE without constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}") cv_results = cross_validate( hgbt_cst, @@ -399,7 +399,7 @@ def generate_missing_values(X, missing_fraction): scoring="neg_root_mean_squared_error", ) rmse = -cv_results["test_score"] -print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}") +print(f"RMSE with constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}") # %% # That being said, notice the comparison is between two different models that may From dcdf851d959884454739573dd2becc24dd9793ce Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Sat, 6 Jan 2024 20:48:04 +0100 Subject: [PATCH 37/52] Lint --- examples/ensemble/plot_hgbt_regression.py | 29 +++++++++++++---------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py index af43d8ee2ae24..5be4014bcb95f 100644 --- a/examples/ensemble/plot_hgbt_regression.py +++ b/examples/ensemble/plot_hgbt_regression.py @@ -3,17 +3,18 @@ Use cases of advanced features in Histogram Gradient Boosting Trees =================================================================== -:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most useful -supervised learning models in scikit-learn. They are based on a modern gradient -boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models -are more feature rich than and often outperform alternative models like -random forests, especially when the number of samples is larger than some ten -thousands (see +:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most +useful supervised learning models in scikit-learn. They are based on a modern +gradient boosting implementation comparable to LightGBM and XGBoost. As such, +HGBT models are more feature rich than and often outperform alternative models +like random forests, especially when the number of samples is larger than some +ten thousands (see :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`). The top usability features of HGBT models are: -1. Several available loss function for mean and quantile regression tasks, see :ref:`Quantile loss `. +1. Several available loss function for mean and quantile regression tasks, see + :ref:`Quantile loss `. 2. :ref:`categorical_support_gbdt` (see :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`). 3. Early stopping. 
From dcdf851d959884454739573dd2becc24dd9793ce Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Sat, 6 Jan 2024 20:48:04 +0100
Subject: [PATCH 37/52] Lint

---
 examples/ensemble/plot_hgbt_regression.py | 29 +++++++++++++----------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index af43d8ee2ae24..5be4014bcb95f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -3,17 +3,18 @@
 Use cases of advanced features in Histogram Gradient Boosting Trees
 ===================================================================
 
-:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most useful
-supervised learning models in scikit-learn. They are based on a modern gradient
-boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models
-are more feature rich than and often outperform alternative models like
-random forests, especially when the number of samples is larger than some ten
-thousands (see
+:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most
+useful supervised learning models in scikit-learn. They are based on a modern
+gradient boosting implementation comparable to LightGBM and XGBoost. As such,
+HGBT models are more feature rich than and often outperform alternative models
+like random forests, especially when the number of samples is larger than some
+ten thousands (see
 :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
 
 The top usability features of HGBT models are:
 
-1. Several available loss functions for mean and quantile regression tasks, see :ref:`Quantile loss `.
+1. Several available loss functions for mean and quantile regression tasks, see
+   :ref:`Quantile loss `.
 2. :ref:`categorical_support_gbdt` (see
    :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
 3. Early stopping.
@@ -21,7 +22,8 @@
 5. :ref:`monotonic_cst_gbdt`.
 6. :ref:`interaction_cst_hgbt`.
 
-This example aims at showcasing all points except 2 and 6 in a real life setting.
+This example aims at showcasing all points except 2 and 6 in a real life
+setting.
 """
@@ -144,8 +144,9 @@
 # Notice that there is a trade-off between `learning_rate` and `max_iter`:
-# Generally, smaller learning rates are preferable but require more iterations to converge to the
-# minimum loss, while larger learning rates converge faster (less iterations/trees needed) but at the cost of a larger minimum loss.
+# Generally, smaller learning rates are preferable but require more iterations
+# to converge to the minimum loss, while larger learning rates converge faster
+# (less iterations/trees needed) but at the cost of a larger minimum loss.
@@ -402,6 +405,6 @@
-# That being said, notice the comparison is between two different models that may
-# be optimized by a different combination of hyperparameters. That is the reason
-# why we do not use the `common_params` in this section as done before.
+# That being said, notice the comparison is between two different models that
+# may be optimized by a different combination of hyperparameters. That is the
+# reason why we do not use the `common_params` in this section as done before.

From ab0e21a44d4fbe87ab331d30061d0ebd9455abf5 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 16 Jan 2024 18:12:01 +0100
Subject: [PATCH 38/52] Fix FutureWarning

---
 examples/ensemble/plot_hgbt_regression.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 5be4014bcb95f..65448b23d9681 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -109,7 +109,7 @@
 average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
 
 for idx, max_iter in enumerate(max_iter_list):
-    hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
+    hgbt = HistGradientBoostingRegressor(max_iter=max_iter, categorical_features=None)
     hgbt.fit(X_train, y_train)
 
     y_pred = hgbt.predict(X_test)
@@ -158,6 +158,7 @@
     "learning_rate": 0.3,
     "validation_fraction": 0.2,
     "random_state": 42,
+    "categorical_features": None,
     "scoring": "neg_root_mean_squared_error",
 }
@@ -342,8 +343,10 @@
     "vicdemand": -1,
     "vicprice": -1,
 }
-hgbt_no_cst = HistGradientBoostingRegressor().fit(X, y)
-hgbt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
+hgbt_no_cst = HistGradientBoostingRegressor(categorical_features=None).fit(X, y)
+hgbt_cst = HistGradientBoostingRegressor(
+    monotonic_cst=monotonic_cst, categorical_features=None
+).fit(X, y)
From c4d1b3b7938ba576eda7e860ee362a34ef9cae8b Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 16 Jan 2024 18:26:37 +0100
Subject: [PATCH 39/52] List of features as suggested by Christian

---
 examples/ensemble/plot_hgbt_regression.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 65448b23d9681..ca2a68f018f01 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -41,10 +41,15 @@
 # The dataset (originally named ELEC2) contains 45,312 instances dated from 7
 # May 1996 to 5 December 1998. Each sample of the dataset refers to a period of
 # 30 minutes, i.e. there are 48 instances for each time period of one day. Each
-# sample on the dataset has 5 fields: the day of week, the time stamp, the New
-# South Wales electricity demand, the Victoria electricity demand. It is
-# originally a classification task, but here we use it for the regression task
-# to predict the scheduled electricity transfer between states.
+# sample on the dataset has 7 columns:
+# - date: between 7 May 1996 and 5 December 1998. Normalized between 0 and 1;
+# - day: day of week (1-7);
+# - period: half hour intervals over 24 hours. Normalized between 0 and 1;
+# - nswprice/nswdemand: electricity price/demand of New South Wales;
+# - vicprice/vicdemand: electricity price/demand of Victoria.
+#
+# It is originally a classification task, but here we use it for the regression
+# task to predict the scheduled electricity transfer between states.

From 49587ab0b826b12acb1e8717f78b2d9d51c50ffa Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 17 Jan 2024 11:22:02 +0100
Subject: [PATCH 40/52] Simplify code

Co-authored-by: Christian Lorentzen

---
 examples/ensemble/plot_hgbt_regression.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index ca2a68f018f01..230c1a20b62ab 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -57,15 +57,12 @@
 electricity = fetch_openml(
     name="electricity", version=1, as_frame=True, parser="pandas"
 )
 df = electricity.frame
-X = df.drop(columns=["transfer", "class"])
-y = df["transfer"]
-X
 
 # %%
 # This particular dataset has a stepwise constant target for the first 17,760
 # samples:
 
-y[:17_760].unique()
+df["transfer"][:17_760,].unique()

From 42c17427acccf6ffc67fc6d95d49566c3f5f48f3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 11:50:02 +0100
Subject: [PATCH 41/52] Print simple stats

---
 examples/ensemble/plot_hgbt_regression.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 230c1a20b62ab..67b15d8be360f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -62,7 +62,7 @@
 # This particular dataset has a stepwise constant target for the first 17,760
 # samples:
 
-df["transfer"][:17_760,].unique()
+df["transfer"][:17_760].unique()
@@ -100,12 +100,17 @@
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.model_selection import train_test_split
 
-max_iter_list = [5, 50]
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
+
+print(f"Training sample size: {X_train.shape[0]}")
+print(f"Test sample size: {X_test.shape[0]}")
+print(f"Number of features: {X_train.shape[1]}")
+
+# %%
+max_iter_list = [5, 50]
 average_week_demand = (
     df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
 )
 colors = sns.color_palette("colorblind")
 fig, ax = plt.subplots(figsize=(10, 5))
 average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)

From 37bb831e558c8f16bd1b0557f641340e7cd77c6c Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:10:03 +0100
Subject: [PATCH 42/52] Fix indentation

---
 examples/ensemble/plot_hgbt_regression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 67b15d8be360f..adce5c6b36e1a 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -16,7 +16,7 @@
 1. Several available loss functions for mean and quantile regression tasks, see
    :ref:`Quantile loss `.
 2. :ref:`categorical_support_gbdt` (see
-  :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+   :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
 3. Early stopping.
 4. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
 5. :ref:`monotonic_cst_gbdt`.

From d1b809a5e8ba56c4dd5a7126039b935818446c55 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:10:57 +0100
Subject: [PATCH 43/52] Use programmatic way to round up n_iter

---
 examples/ensemble/plot_hgbt_regression.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index adce5c6b36e1a..92efcb53e365a 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -182,11 +182,12 @@
 # %%
 # We can then overwrite the value for `max_iter` to a reasonable value and avoid
-# the extra computational cost of the inner validation. In this case, rounding
-# up the number of iterations to 400 may account for variability of the training
-# set:
+# the extra computational cost of the inner validation. Rounding up the number
+# of iterations may account for variability of the training set:
 
-common_params["max_iter"] = 400
+import math
+
+common_params["max_iter"] = math.ceil(hgbt.n_iter_ / 100) * 100
 common_params["early_stopping"] = False
 hgbt = HistGradientBoostingRegressor(**common_params)

From 5b1875528d51194a450029e5782896b4058c47d5 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:11:25 +0100
Subject: [PATCH 44/52] Set random state for deterministic results

---
 examples/ensemble/plot_hgbt_regression.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 92efcb53e365a..b6825660a8b56 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -351,9 +351,11 @@
     "vicdemand": -1,
     "vicprice": -1,
 }
-hgbt_no_cst = HistGradientBoostingRegressor(categorical_features=None).fit(X, y)
+hgbt_no_cst = HistGradientBoostingRegressor(
+    categorical_features=None, random_state=42
+).fit(X, y)
 hgbt_cst = HistGradientBoostingRegressor(
-    monotonic_cst=monotonic_cst, categorical_features=None
+    monotonic_cst=monotonic_cst, categorical_features=None, random_state=42
 ).fit(X, y)

From 9499e611a7b2159c0685a0195cdaad37902f55ac Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:24:09 +0100
Subject: [PATCH 45/52] Add explanation on time-aware cross validation

---
 examples/ensemble/plot_hgbt_regression.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index b6825660a8b56..71a1109db7b84 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -391,11 +391,15 @@
 # %%
 # Indeed, we can verify that the predictive quality of the model is not
-# significantly degraded by introducing the monotonic constraints:
+# significantly degraded by introducing the monotonic constraints. For such
+# purpose we use :class:`~sklearn.model_selection.TimeSeriesSplit`
+# cross-validation to estimate the variance of the test score. By doing so we
+# guarantee that the training data does not succeed the testing data, which is
+# crucial when dealing with data that have a temporal relationship.
 
 from sklearn.model_selection import TimeSeriesSplit, cross_validate
 
-ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)
+ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)  # a week has 336 samples
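The `math.ceil(hgbt.n_iter_ / 100) * 100` idiom introduced in PATCH 43 rounds the early-stopped iteration count up to the next hundred. A tiny worked check (illustrative values only):

import math

for n_iter in (142, 200, 351):
    print(n_iter, "->", math.ceil(n_iter / 100) * 100)
# 142 -> 200, 200 -> 200, 351 -> 400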
From 3b1789e4547db60f0f05a6483e3ad268e4037285 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:28:07 +0100
Subject: [PATCH 46/52] Add comment on overconstraining feature

---
 examples/ensemble/plot_hgbt_regression.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 71a1109db7b84..8a6e7d4128cd0 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -390,7 +390,10 @@
 _ = plt.legend()
 
 # %%
-# Indeed, we can verify that the predictive quality of the model is not
+# Observe that `nswdemand` seems already monotonic without constraint. This is a
+# good example to show that the model is being "overconstrained".
+#
+# Additionally, we can verify that the predictive quality of the model is not
 # significantly degraded by introducing the monotonic constraints. For such
 # purpose we use :class:`~sklearn.model_selection.TimeSeriesSplit`
 # cross-validation to estimate the variance of the test score. By doing so we

From d972fae3c6653ef14aa121f6e995503f6fec738 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 23 Jan 2024 10:50:10 +0100
Subject: [PATCH 47/52] Apply suggestion from Guillaume

Co-authored-by: Guillaume Lemaitre

---
 examples/ensemble/plot_hgbt_regression.py | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 8a6e7d4128cd0..26531bc8fd9a6 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -400,28 +400,18 @@
 # guarantee that the training data does not succeed the testing data, which is
 # crucial when dealing with data that have a temporal relationship.
 
+from sklearn.metrics import make_scorer, root_mean_squared_error
 from sklearn.model_selection import TimeSeriesSplit, cross_validate
 
 ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)  # a week has 336 samples
+scorer = make_scorer(root_mean_squared_error)
 
-cv_results = cross_validate(
-    hgbt_no_cst,
-    X,
-    y,
-    cv=ts_cv,
-    scoring="neg_root_mean_squared_error",
-)
-rmse = -cv_results["test_score"]
+cv_results = cross_validate(hgbt_no_cst, X, y, cv=ts_cv, scoring=scorer)
+rmse = cv_results["test_score"]
 print(f"RMSE without constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
 
-cv_results = cross_validate(
-    hgbt_cst,
-    X,
-    y,
-    cv=ts_cv,
-    scoring="neg_root_mean_squared_error",
-)
-rmse = -cv_results["test_score"]
+cv_results = cross_validate(hgbt_cst, X, y, cv=ts_cv, scoring=scorer)
+rmse = cv_results["test_score"]
 print(f"RMSE with constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")

From 9f49ad5affa408b503067018782522019b9c4f9e Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 23 Jan 2024 10:55:27 +0100
Subject: [PATCH 48/52] Update examples/ensemble/plot_adaboost_regression.py

Co-authored-by: Christian Lorentzen

---
 examples/ensemble/plot_adaboost_regression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index 98d3699ab161c..916d17addff18 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -10,7 +10,7 @@
 detail.
 
 See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
-example showcasing the benefits of using more robust regressions such as
+example showcasing the benefits of using more efficient regression models such as
 :class:`~ensemble.HistGradientBoostingRegressor`.
 
 .. [1] `H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
        `_
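One hedged way to spot "overconstrained" features like `nswdemand` before imposing constraints is to look at the marginal association between each feature and the target; Spearman rank correlation is a simple proxy (a sketch with `X`/`y` as in the example, and with the caveat that a marginal correlation is not the same thing as a partial dependence):

from scipy.stats import spearmanr

for feature in ["nswprice", "nswdemand", "vicprice", "vicdemand"]:
    rho, _ = spearmanr(X[feature], y)
    print(f"{feature}: Spearman rho = {rho:+.2f}")
# A sign that disagrees with the intended constraint is worth investigating
# before constraining the model.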
From d333c6d8dd276e935fe7ce96d68e17e809dc4d14 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 23 Jan 2024 10:56:10 +0100
Subject: [PATCH 49/52] Format

---
 examples/ensemble/plot_adaboost_regression.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index 916d17addff18..8ba01df63b561 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -10,8 +10,8 @@
 detail.
 
 See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
-example showcasing the benefits of using more efficient regression models such as
-:class:`~ensemble.HistGradientBoostingRegressor`.
+example showcasing the benefits of using more efficient regression models such
+as :class:`~ensemble.HistGradientBoostingRegressor`.
 
 .. [1] `H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
        `_
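A caveat worth noting next to the `make_scorer(root_mean_squared_error)` refactor in PATCH 47: `make_scorer` defaults to `greater_is_better=True`, which is fine for simply reporting `cv_results["test_score"]` as a positive RMSE, but such a scorer should not be reused as-is for hyperparameter search, where higher scores win. A hedged sketch of the search-safe variant:

from sklearn.metrics import make_scorer, root_mean_squared_error

# For model selection (e.g. GridSearchCV), flip the sign so that "greater is
# better" actually holds; the reported scores then come out negated.
rmse_scorer_for_search = make_scorer(root_mean_squared_error, greater_is_better=False)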
From c4a79e64f88e47e708e81f117f55af70deb2010c Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Mon, 19 Feb 2024 14:46:20 +0100
Subject: [PATCH 50/52] Update examples/ensemble/plot_hgbt_regression.py

Co-authored-by: Guillaume Lemaitre

---
 examples/ensemble/plot_hgbt_regression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 26531bc8fd9a6..796d2d17a76b2 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -325,7 +325,7 @@
 # bias. Monotonic constraints can also be used to enforce specific regulatory
 # requirements, ensure compliance and align with ethical considerations.
 #
-# In the present example, the policy of transfering energy from Victoria to New
+# In the present example, the policy of transferring energy from Victoria to New
 # South Wales is meant to alleviate price fluctuations, meaning that the model
 # predictions have to enforce such goal, i.e. transfer should increase with
 # price and demand in New South Wales, but also decrease with price and demand

From 1010eccf26314fc7f51e4f217a93c0efa2f60e12 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 19 Feb 2024 14:54:30 +0100
Subject: [PATCH 51/52] Fix random_state

---
 examples/ensemble/plot_hgbt_regression.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 26531bc8fd9a6..38cad137e35b0 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -116,7 +116,9 @@
 average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
 
 for idx, max_iter in enumerate(max_iter_list):
-    hgbt = HistGradientBoostingRegressor(max_iter=max_iter, categorical_features=None)
+    hgbt = HistGradientBoostingRegressor(
+        max_iter=max_iter, categorical_features=None, random_state=42
+    )
     hgbt.fit(X_train, y_train)
 
     y_pred = hgbt.predict(X_test)

From 31db489ec87041b3974528eea3264cdf569562c6 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 19 Feb 2024 14:54:42 +0100
Subject: [PATCH 52/52] Wording as suggested by Guillaume

---
 examples/ensemble/plot_hgbt_regression.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 38cad137e35b0..3d18064e7e489 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -143,7 +143,9 @@
 # %%
 # With just a few iterations, HGBT models can achieve convergence (see
 # :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`),
-# meaning that adding more trees does not improve the model anymore.
+# meaning that adding more trees does not improve the model anymore. In the
+# figure above, 5 iterations are not enough to produce good predictions. With 50
+# iterations, we are already able to do a good job.
 #
 # Instead of relying on `max_iter` alone to determine when to stop, the HGBT
 # implementation in scikit-learn supports early stopping. With it, the model