From 42400749b97a7f4d4ac750a295a86d9b5d9fd7cb Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 11:19:50 +0200
Subject: [PATCH 01/52] DOC Add example showcasing HGBT regression
---
doc/modules/ensemble.rst | 3 +-
examples/ensemble/plot_hgbt_regression.py | 448 ++++++++++++++++++
.../plot_release_highlights_1_1_0.py | 2 +
3 files changed, 452 insertions(+), 1 deletion(-)
create mode 100644 examples/ensemble/plot_hgbt_regression.py
diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 36eed98da0f6b..0585f8289ed55 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -129,6 +129,8 @@ Note that for technical reasons, using a scorer is significantly slower than
using the loss. By default, early-stopping is performed if there are at least
10,000 samples in the training set, and uses the validation loss.
+.. _nan_support_hgbt:
+
Missing values support
^^^^^^^^^^^^^^^^^^^^^^
@@ -1634,4 +1636,3 @@ minimum required number of samples to consider a split ``min_samples_split``).
.. [HTF] T. Hastie, R. Tibshirani and J. Friedman, "Elements of
Statistical Learning Ed. 2", Springer, 2009.
-
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
new file mode 100644
index 0000000000000..ba651563a9d33
--- /dev/null
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -0,0 +1,448 @@
+"""
+===========================================================
+Decision Tree Regression with HistGradientBoostingRegressor
+===========================================================
+
+:ref:`histogram_based_gradient_boosting` (HGBT) models can be a competitive
+alternative to random forests, especially when the number of samples is larger
+than tens of thousands of samples (see
+:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
+
+HGBT models have additional advantages such as:
+
+- :ref:`categorical_support_gbdt` (see
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`)
+- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+- :ref:`Quantile loss support <quantile_support_hgbdt>`
+- :ref:`monotonic_cst_gbdt`
+
+This example aims at showcasing the last three points in a real setting.
+"""
+
+# %%
+# Author: Arturo Amor
+#
+# License: BSD 3 clause
+#
+# Preparing the data
+# ==================
+# The `electricity dataset <http://www.openml.org/d/151>`_ consists of data
+# collected from the Australian New South Wales Electricity Market. In this
+# market, prices are not fixed and are affected by supply and demand. They are
+# set every five minutes. Electricity transfers to/from the neighboring state of
+# Victoria were done to alleviate fluctuations.
+#
+# The dataset (originally named ELEC2) contains 45,312 instances dated from 7
+# May 1996 to 5 December 1998. Each example of the dataset refers to a period of
+# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
+# example of the dataset has fields such as the day of the week, the time stamp,
+# the New South Wales electricity price and demand, and the Victoria electricity
+# price and demand. It is originally a classification task, but here we use it
+# as a regression problem where the target is the scheduled electricity transfer
+# between states.
+
+from sklearn.datasets import fetch_openml
+
+electricity = fetch_openml(
+ name="electricity", version=1, as_frame=True, parser="pandas"
+)
+df = electricity.frame
+X = df.drop(columns=["transfer", "class"])
+y = df["transfer"]
+X
+
+# %%
+# Let us explore the hourly electricity transfer over different days of the week:
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+colors = sns.color_palette("colorblind")
+
+fig, ax = plt.subplots(figsize=(15, 10))
+pointplot = sns.pointplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
+handles, labels = ax.get_legend_handles_labels()
+ax.set(
+ title="Hourly energy transfer for different days of the week",
+ xticks=[i * 2 for i in range(24)],
+ xticklabels=list(range(24)),
+ xlabel="Time of the day",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend(handles, ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])
+
+# %%
+# Notice that the energy transfer increases systematically during weekends.
+#
+# Effect of number of trees in HistGradientBoostingRegressor
+# ==========================================================
+# For the sake of illustrating the effect of the (maximum) number of trees, we
+# train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the
+# daily electricity transfer using the whole dataset. Then we visualize its
+# predictions depending on the `max_iter` parameter.
+
+from sklearn.ensemble import HistGradientBoostingRegressor
+
+max_iter_list = [10, 50]
+
+fig, ax = plt.subplots(figsize=(12, 4))
+average_week_demand = df.groupby(["day", "period"])["transfer"].mean()
+average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax)
+
+for idx, max_iter in enumerate(max_iter_list):
+ hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
+ hgbt.fit(X, y)
+ y_pred = hgbt.predict(X)
+ prediction_df = df.copy()
+ prediction_df["y_pred"] = y_pred
+ average_pred = prediction_df.groupby(["day", "period"])["y_pred"].mean()
+ average_pred.plot(
+ color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax
+ )
+ax.set(
+ title="Average daily energy transfer during the week",
+ xticks=[(i + 0.2) * 48 for i in range(7)],
+ xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# With just a few iterations, HGBT models can achieve convergence (see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
+#
+# Support for missing values
+# ==========================
+# HGBT models have native support for missing values. During training, the tree
+# grower decides where samples with missing values should go (left or right
+# child) at each split, based on the potential gain. When predicting, these
+# samples are sent to either child accordingly. If a feature had no missing
+# values during training, samples with missing values for that feature are sent
+# to the child with the most samples.
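+#
+# As a minimal sketch of this capability (the toy arrays below are hypothetical
+# and not part of the electricity dataset), an HGBT model can be fit directly
+# on inputs containing `NaN` values, without any imputation step::
+#
+#     import numpy as np
+#     from sklearn.ensemble import HistGradientBoostingRegressor
+#
+#     X_toy = np.array([[1.0], [2.0], [np.nan], [4.0]])
+#     y_toy = np.array([1.0, 2.0, 3.0, 4.0])
+#     HistGradientBoostingRegressor(max_iter=10).fit(X_toy, y_toy)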
+#
+# Missing Completely At Random (MCAR)
+# -----------------------------------
+#
+# The missingness does not depend on the observed data or the unobserved data,
+# i.e. it is completely random. We can simulate such a scenario by randomly
+# replacing values from randomly selected features with `NaN` values.
+
+import numpy as np
+
+from sklearn.model_selection import TimeSeriesSplit, cross_validate
+
+np.random.seed(42)
+
+ts_cv = TimeSeriesSplit(n_splits=5, gap=48, max_train_size=10000, test_size=1000)
+train_0, test_0 = next(ts_cv.split(df))
+last_days = slice(-192, None)
+total_cells = X.shape[0] * X.shape[1]
+missing_fraction_list = [0, 0.01, 0.03]
+
+fig, ax = plt.subplots(figsize=(12, 6))
+ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+hgbt = HistGradientBoostingRegressor()
+
+for missing_fraction in missing_fraction_list:
+ num_missing_cells = int(total_cells * missing_fraction)
+ row_indices = np.random.choice(X.shape[0], num_missing_cells, replace=True)
+ col_indices = np.random.choice(X.shape[1], num_missing_cells, replace=True)
+ X = df.drop(columns=["transfer", "class"])
+ X.iloc[row_indices, col_indices] = np.nan
+
+ hgbt.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt.predict(X.iloc[test_0])
+ cv_results = cross_validate(
+ hgbt,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+ )
+ rmse = -cv_results["test_score"]
+ ax.plot(
+ hgbt_predictions[last_days],
+ label=(
+ f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
+ f" {rmse.std():.2f}"
+ ),
+ alpha=0.5,
+ )
+ax.set(
+ title="Daily energy transfer predictions on data with MCAR values",
+ xticks=[(i + 0.25) * 48 for i in range(4)],
+ xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# Missing At Random (MAR)
+# -----------------------
+#
+# The missingness depends on the observed data but never on unobserved data.
+# Here, the missingness in "vicdemand" and "vicprice" is set to depend on the
+# value of the observed feature "nswprice".
+
+missing_fraction_list = [0, 0.5, 1.0]
+
+fig, ax = plt.subplots(figsize=(12, 6))
+ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+
+for missing_fraction in missing_fraction_list:
+ X = df.drop(columns=["transfer", "class"])
+ mask = X["nswprice"] < X["nswprice"].quantile(missing_fraction)
+ X["vicprice"] = X["vicprice"].mask(mask, np.nan)
+ X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
+
+ hgbt = HistGradientBoostingRegressor()
+ hgbt.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt.predict(X.iloc[test_0])
+ cv_results = cross_validate(
+ hgbt,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+ )
+ rmse = -cv_results["test_score"]
+ ax.plot(
+ hgbt_predictions[last_days],
+ label=(
+ f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
+ f" {rmse.std():.2f}"
+ ),
+ alpha=0.5,
+ )
+ax.set(
+ title="Daily energy transfer predictions on data with MAR values",
+ xticks=[(i + 0.25) * 48 for i in range(4)],
+ xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# In this case the features are highly correlated and therefore MAR values
+# do not degrade the predictive power of the model even when completely
+# removing the feature "vicprice".
+#
+# Missing Not At Random (MNAR)
+# ----------------------------
+#
+# The missingness depends on the unobserved data, i.e. the probability of a
+# value being missing in a variable depends on the values of that variable
+# itself. Here, we set the missingness to depend on the unobserved feature
+# "class".
+
+import pandas as pd
+
+fig, ax = plt.subplots(figsize=(12, 6))
+ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+
+for missing_fraction in missing_fraction_list:
+ X = df.drop(columns=["transfer", "class"])
+ mask = df["class"] == "DOWN"
+ true_indices = mask[mask].index
+ n_keep = int(len(true_indices) * missing_fraction)
+ keep_indices = np.random.choice(true_indices, size=n_keep, replace=False)
+ mask = pd.Series(False, index=mask.index)
+
+ # Set the randomly selected true indices to True in the new mask
+ mask.loc[keep_indices] = True
+ X["vicprice"] = X["vicprice"].mask(mask, np.nan)
+ X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
+
+ hgbt = HistGradientBoostingRegressor()
+ hgbt.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt.predict(X.iloc[test_0])
+ cv_results = cross_validate(
+ hgbt,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+ )
+ rmse = -cv_results["test_score"]
+ ax.plot(
+ hgbt_predictions[last_days],
+ label=(
+ f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
+ f" {rmse.std():.2f}"
+ ),
+ alpha=0.5,
+ )
+ax.set(
+ title="Daily energy transfer predictions on data with MNAR values",
+ xticks=[(i + 0.25) * 48 for i in range(4)],
+ xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# Support for quantile loss
+# =========================
+#
+# The quantile loss in regression enables a view of the potential variability in
+# predictions. For instance, predicting the 5th and 95th percentiles can provide
+# a 90% prediction interval, i.e. the range within which we expect the true
+# value to fall with 90% probability.
+
+from sklearn.metrics import make_scorer, mean_pinball_loss
+
+quantiles = [0.95, 0.05]
+predictions = []
+X = df.drop(columns=["transfer", "class"])
+
+fig, ax = plt.subplots(figsize=(12, 6))
+ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+
+for quantile in quantiles:
+ hgbt = HistGradientBoostingRegressor(loss="quantile", quantile=quantile)
+ hgbt.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt.predict(X.iloc[test_0])
+
+ predictions.append(hgbt_predictions)
+ cv_results = cross_validate(
+ hgbt,
+ X,
+ y,
+ cv=ts_cv,
+ scoring=make_scorer(mean_pinball_loss, alpha=quantile),
+ )
+ score = cv_results["test_score"]
+ ax.plot(
+ hgbt_predictions[last_days],
+ label=(
+ f"quantile={quantile}, pinball loss={score.mean():.3f} +/-"
+ f" {score.std():.3f}"
+ ),
+ alpha=0.5,
+ )
+
+ax.fill_between(
+ range(len(predictions[0][last_days])),
+ predictions[0][last_days],
+ predictions[1][last_days],
+ color=colors[0],
+ alpha=0.1,
+)
+ax.set(
+ title="Daily energy transfer predictions with quantile loss",
+ xticks=[(i + 0.25) * 48 for i in range(4)],
+ xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# Keep in mind that one can still improve the calibration of our model by:
+#
+# - collecting more data-points (in case the model is overfitting);
+# - better tuning of the model hyper-parameters (for instance you could try
+# max_iter=300, max_leaf_nodes=64) and make sure the model is not over-fitting
+# too much (e.g. by plotting the validation losses per boosting iteration and
+# using early stopping);
+# - engineering more predictive features from the same data. This is especially
+# useful for linear quantile regression (not covered in this tutorial);
+# - try other kinds of quantile regression models, for instance Quantile
+# Forests.
+#
+# Monotonic Constraints
+# ---------------------
+#
+# Given specific domain knowledge that requires the relationship between a
+# feature and the target to be monotonically increasing or decreasing, one can
+# enforce such behaviour in the predictions of an HGBT model using monotonic
+# constraints. This makes the model more interpretable and prevents overfitting.
+# Monotonic constraints can also be used to enforce specific regulatory
+# requirements, ensure compliance and align with ethical considerations.
+#
+# In the present example, the policy of transferring energy from Victoria to New
+# South Wales is meant to alleviate price fluctuations, meaning that the model
+# predictions have to enforce such a goal, i.e. transfer should increase with
+# price and demand in New South Wales, but also decrease with price and demand
+# in Victoria, in order to benefit both populations.
+#
+# To create the monotonic constraints, we use :func:`numpy.select` to assign
+# `1` to the positions corresponding to columns "nswdemand" and "nswprice", `-1`
+# to the positions corresponding to columns "vicdemand" and "vicprice", and `0`
+# elsewhere. We then visualize the partial dependence on said features:
+
+from sklearn.inspection import PartialDependenceDisplay
+
+conditions = [
+ (X.columns == "nswdemand") | (X.columns == "nswprice"),
+ (X.columns == "vicdemand") | (X.columns == "vicprice"),
+]
+choices = [1, -1]
+
+monotonic_cst = np.select(conditions, choices, default=0)
+
+
+gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
+gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
+
+fig, ax = plt.subplots(nrows=2, figsize=(15, 10))
+disp = PartialDependenceDisplay.from_estimator(
+ gbdt_no_cst,
+ X,
+ features=["nswdemand", "nswprice"],
+ line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
+ ax=ax[0],
+)
+
+PartialDependenceDisplay.from_estimator(
+ gbdt_cst,
+ X,
+ features=["nswdemand", "nswprice"],
+ line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
+ ax=disp.axes_,
+)
+disp = PartialDependenceDisplay.from_estimator(
+ gbdt_no_cst,
+ X,
+ features=["vicdemand", "vicprice"],
+ line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
+ ax=ax[1],
+)
+
+PartialDependenceDisplay.from_estimator(
+ gbdt_cst,
+ X,
+ features=["vicdemand", "vicprice"],
+ line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
+ ax=disp.axes_,
+)
+
+plt.legend()
+plt.show()
+
+# %%
+# Indeed, we can verify that the predictive quality of the model is not degraded
+# by introducing the monotonic constraints:
+
+cv_results = cross_validate(
+ gbdt_no_cst,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+)
+rmse = -cv_results["test_score"]
+print(f"RMSE without constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
+
+cv_results = cross_validate(
+ gbdt_cst,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+)
+rmse = -cv_results["test_score"]
+print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
diff --git a/examples/release_highlights/plot_release_highlights_1_1_0.py b/examples/release_highlights/plot_release_highlights_1_1_0.py
index f6432cf15037c..088919565315a 100644
--- a/examples/release_highlights/plot_release_highlights_1_1_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_1_0.py
@@ -22,6 +22,8 @@
"""
# %%
+# .. _quantile_support_hgbdt:
+#
# Quantile loss in :class:`ensemble.HistGradientBoostingRegressor`
# ----------------------------------------------------------------
# :class:`~ensemble.HistGradientBoostingRegressor` can model quantiles with
From 9728566a790e2fee3eac2f7e027a9f02b14c1377 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 11:48:51 +0200
Subject: [PATCH 02/52] Replace the landing-page figure
---
doc/templates/index.html | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/doc/templates/index.html b/doc/templates/index.html
index fc0362f4e379f..1a83f29f69e9f 100644
--- a/doc/templates/index.html
+++ b/doc/templates/index.html
@@ -70,8 +70,8 @@ Machine Learning in
and more...
Examples
From 7842e6d8b7c6b5e9a01452a262ed590a46756941 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 16:20:04 +0200
Subject: [PATCH 03/52] Several tweaks
---
examples/ensemble/plot_hgbt_regression.py | 32 +++++++++--------------
1 file changed, 12 insertions(+), 20 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index ba651563a9d33..5f5a2fb6814e6 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -11,10 +11,10 @@
HGBT models have additional advantages such as:
- :ref:`categorical_support_gbdt` (see
- :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`)
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-- :ref:`Quantile loss support <quantile_support_hgbdt>`
-- :ref:`monotonic_cst_gbdt`
+- :ref:`Quantile loss support <quantile_support_hgbdt>`.
+- :ref:`monotonic_cst_gbdt`.
This example aims at showcasing the last three points in a real setting.
"""
@@ -84,7 +84,7 @@
max_iter_list = [10, 50]
-fig, ax = plt.subplots(figsize=(12, 4))
+fig, ax = plt.subplots(figsize=(10, 5))
average_week_demand = df.groupby(["day", "period"])["transfer"].mean()
average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax)
@@ -243,7 +243,7 @@
ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
for missing_fraction in missing_fraction_list:
- X = df.drop(columns=["transfer", "class"])
+ X = df.drop(columns=["transfer", "class"]) # reset X
mask = df["class"] == "DOWN"
true_indices = mask[mask].index
n_keep = int(len(true_indices) * missing_fraction)
@@ -296,7 +296,7 @@
quantiles = [0.95, 0.05]
predictions = []
-X = df.drop(columns=["transfer", "class"])
+X = df.drop(columns=["transfer", "class"]) # reset X
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
@@ -344,17 +344,15 @@
# Keep in mind that one can still improve the calibration of our model by:
#
# - collecting more data-points (in case the model is overfitting);
-# - better tuning of the model hyper-parameters (for instance you could try
-# max_iter=300, max_leaf_nodes=64) and make sure the model is not over-fitting
-# too much (e.g. by plotting the validation losses per boosting iteration and
-# using early stopping);
+# - better tuning of the model hyper-parameters (see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`)
+# and make sure the model is not over-fitting;
# - engineering more predictive features from the same data. This is especially
# useful for linear quantile regression (not covered in this tutorial);
-# - try other kinds of quantile regression models, for instance Quantile
-# Forests.
+# - try other kinds of quantile regression models, such as Quantile Forests.
#
# Monotonic Constraints
-# ---------------------
+# =====================
#
# Given specific domain knowledge that requires the relationship between a
# feature and the target to be monotonically increasing or decreasing, one can
@@ -381,10 +379,8 @@
(X.columns == "vicdemand") | (X.columns == "vicprice"),
]
choices = [1, -1]
-
monotonic_cst = np.select(conditions, choices, default=0)
-
gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
@@ -396,7 +392,6 @@
line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
ax=ax[0],
)
-
PartialDependenceDisplay.from_estimator(
gbdt_cst,
X,
@@ -411,7 +406,6 @@
line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
ax=ax[1],
)
-
PartialDependenceDisplay.from_estimator(
gbdt_cst,
X,
@@ -419,9 +413,7 @@
line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
ax=disp.axes_,
)
-
-plt.legend()
-plt.show()
+_ = plt.legend()
# %%
# Indeed, we can verify that the predictive quality of the model is not degraded
From f5ac584476e7843fca4eb52e419e7305d7f91e93 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 16:26:35 +0200
Subject: [PATCH 04/52] Wording
---
examples/ensemble/plot_hgbt_regression.py | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 5f5a2fb6814e6..c8aaa86f71d0b 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -344,12 +344,9 @@
# Keep in mind that one can still improve the calibration of our model by:
#
# - collecting more data-points (in case the model is overfitting);
-# - better tuning of the model hyper-parameters (see
-# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`)
-# and make sure the model is not over-fitting;
-# - engineering more predictive features from the same data. This is especially
-# useful for linear quantile regression (not covered in this tutorial);
-# - try other kinds of quantile regression models, such as Quantile Forests.
+# - better tuning of the model hyperparameters (see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`);
+# - engineering more predictive features from the same data.
#
# Monotonic Constraints
# =====================
From 353329db25288c97d8cd3c5d2ecb08995f12d830 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 18:02:41 +0200
Subject: [PATCH 05/52] Add cross-links from other examples
---
examples/ensemble/plot_adaboost_regression.py | 4 ++++
.../ensemble/plot_forest_hist_grad_boosting_comparison.py | 4 +++-
examples/ensemble/plot_gradient_boosting_categorical.py | 4 ++++
examples/ensemble/plot_gradient_boosting_quantile.py | 4 +++-
examples/ensemble/plot_gradient_boosting_regression.py | 5 ++++-
examples/ensemble/plot_hgbt_regression.py | 3 ++-
.../release_highlights/plot_release_highlights_0_23_0.py | 3 ++-
examples/release_highlights/plot_release_highlights_1_1_0.py | 3 +++
examples/release_highlights/plot_release_highlights_1_3_0.py | 4 +++-
9 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index c2aa7e558c07d..98d3699ab161c 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -9,6 +9,10 @@
regressor. As the number of boosts is increased the regressor can fit more
detail.
+See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
+example showcasing the benefits of using more robust regression models such as
+:class:`~ensemble.HistGradientBoostingRegressor`.
+
.. [1] `H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
`_
diff --git a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py
index 0dde24116065d..853caec241491 100644
--- a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py
+++ b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py
@@ -22,7 +22,9 @@
the predicted value. RFs, on the other hand, are based on bagging and use a
majority vote to predict the outcome.
-For more information on ensemble models, see the :ref:`User Guide <ensemble>`.
+See the :ref:`User Guide <ensemble>` for more information on ensemble models or
+see :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
+example showcasing some other features of HGBT models.
"""
# Author: Arturo Amor
diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py
index 0dd0a84243b4d..d9566f19a8214 100644
--- a/examples/ensemble/plot_gradient_boosting_categorical.py
+++ b/examples/ensemble/plot_gradient_boosting_categorical.py
@@ -21,6 +21,10 @@
We will work with the Ames Iowa Housing dataset which consists of numerical
and categorical features, where the houses' sales prices are the target.
+See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
+example showcasing some other features of
+:class:`~ensemble.HistGradientBoostingRegressor`.
+
"""
# %%
diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py
index d1464ba92c572..41378db704600 100644
--- a/examples/ensemble/plot_gradient_boosting_quantile.py
+++ b/examples/ensemble/plot_gradient_boosting_quantile.py
@@ -4,7 +4,9 @@
=====================================================
This example shows how quantile regression can be used to create prediction
-intervals.
+intervals. See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`
+for an example showcasing some other features of
+:class:`~ensemble.HistGradientBoostingRegressor`.
"""
diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py
index 94705ccfeca24..76437680708be 100644
--- a/examples/ensemble/plot_gradient_boosting_regression.py
+++ b/examples/ensemble/plot_gradient_boosting_regression.py
@@ -11,7 +11,10 @@
and 500 regression trees of depth 4.
Note: For larger datasets (n_samples >= 10000), please refer to
-:class:`~sklearn.ensemble.HistGradientBoostingRegressor`.
+:class:`~sklearn.ensemble.HistGradientBoostingRegressor`. See
+:ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an example
+showcasing some other advantages of
+:class:`~ensemble.HistGradientBoostingRegressor`.
"""
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index c8aaa86f71d0b..0040f261ebbbd 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -346,7 +346,8 @@
# - collecting more data-points (in case the model is overfitting);
# - better tuning of the model hyperparameters (see
# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`);
-# - engineering more predictive features from the same data.
+# - engineering more predictive features from the same data (see
+# :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`).
#
# Monotonic Constraints
# =====================
diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py
index 7c6836632e3f0..7753f8653799e 100644
--- a/examples/release_highlights/plot_release_highlights_0_23_0.py
+++ b/examples/release_highlights/plot_release_highlights_0_23_0.py
@@ -122,7 +122,8 @@
# specific features. In the following example, we construct a target that is
# generally positively correlated with the first feature, with some noise.
# Applying monotonic constraints allows the prediction to capture the global
-# effect of the first feature, instead of fitting the noise.
+# effect of the first feature, instead of fitting the noise. For a use case
+# example, see :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`.
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
diff --git a/examples/release_highlights/plot_release_highlights_1_1_0.py b/examples/release_highlights/plot_release_highlights_1_1_0.py
index 088919565315a..63c22d2f22fe5 100644
--- a/examples/release_highlights/plot_release_highlights_1_1_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_1_0.py
@@ -53,6 +53,9 @@
ax.plot(X_1d, hist.predict(X), label=quantile)
_ = ax.legend(loc="lower left")
+# %%
+# For a use case example, see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`.
# %%
# `get_feature_names_out` Available in all Transformers
diff --git a/examples/release_highlights/plot_release_highlights_1_3_0.py b/examples/release_highlights/plot_release_highlights_1_3_0.py
index 8fa1ea057ac91..993a91d18100a 100644
--- a/examples/release_highlights/plot_release_highlights_1_3_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_3_0.py
@@ -88,7 +88,9 @@
# :class:`tree.DecisionTreeRegressor` now support missing values. For each potential
# threshold on the non-missing data, the splitter will evaluate the split with all the
# missing values going to the left node or the right node.
-# More details in the :ref:`User Guide <tree_missing_value_support>`.
+# See more details in the :ref:`User Guide <tree_missing_value_support>` or see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for a use case
+# example of this feature in :class:`~ensemble.HistGradientBoostingRegressor`.
import numpy as np
from sklearn.tree import DecisionTreeClassifier
From 1d56abdda9cc1c5fcdf88170f1dfb3ccc00eabb1 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 18:03:17 +0200
Subject: [PATCH 06/52] Use dictionary to define monotonic_cst
---
examples/ensemble/plot_hgbt_regression.py | 25 +++++++++++++----------
1 file changed, 14 insertions(+), 11 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 0040f261ebbbd..4e7deb0e6eb11 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -365,20 +365,23 @@
# price and demand in New South Wales, but also decrease with price and demand
# in Victoria, in order to benefit both populations.
#
-# To create the monotonic constraints, we use :func:`numpy.select` to assign
-# `1` to the positions corresponding to columns "nswdemand" and "nswprice", `-1`
-# to the positions corresponding to columns "vicdemand" and "vicprice", and `0`
-# elsewhere. We then visualize the partial dependence on said features:
+# If the training data has feature names, it’s possible to specify the monotonic
+# constraints by passing a dictionary with the convention:
+# - 1: monotonic increase
+# - 0: no constraint
+# - -1: monotonic decrease
from sklearn.inspection import PartialDependenceDisplay
-conditions = [
- (X.columns == "nswdemand") | (X.columns == "nswprice"),
- (X.columns == "vicdemand") | (X.columns == "vicprice"),
-]
-choices = [1, -1]
-monotonic_cst = np.select(conditions, choices, default=0)
-
+monotonic_cst = {
+ "date": 0,
+ "day": 0,
+ "period": 0,
+ "nswdemand": 1,
+ "nswprice": 1,
+ "vicdemand": -1,
+ "vicprice": -1,
+}
gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
From ff89b7c968f6e1d5e8e53fd048ea956bba303aa3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 3 Aug 2023 11:17:32 +0200
Subject: [PATCH 07/52] Add cross-links in the documentation
---
doc/modules/ensemble.rst | 8 +++++++-
examples/ensemble/plot_hgbt_regression.py | 1 +
.../ensemble/_hist_gradient_boosting/gradient_boosting.py | 2 ++
3 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 0585f8289ed55..711cbb6c1f891 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -80,7 +80,8 @@ are not yet supported, for instance some loss functions.
.. topic:: Examples:
- * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py`
+ * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py`
+ * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`
Usage
^^^^^
@@ -169,6 +170,10 @@ If no missing values were encountered for a given feature during training,
then samples with missing values are mapped to whichever child has the most
samples.
+.. topic:: Examples:
+
+ * :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`
+
.. _sw_hgbdt:
Sample weight support
@@ -317,6 +322,7 @@ Also, monotonic constraints are not supported for multiclass classification.
.. topic:: Examples:
* :ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py`
+ * :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`
.. _interaction_cst_hgbt:
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 4e7deb0e6eb11..42e516fb96cbf 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -367,6 +367,7 @@
#
# If the training data has feature names, it’s possible to specify the monotonic
# constraints by passing a dictionary with the convention:
+#
# - 1: monotonic increase
# - 0: no constraint
# - -1: monotonic decrease
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index 5d030d3add5bb..9d6b22b6519f1 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -1200,6 +1200,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
assigned to the left or right child consequently. If no missing values
were encountered for a given feature during training, then samples with
missing values are mapped to whichever child has the most samples.
+ See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for a
+ use case example of this feature.
This implementation is inspired by
`LightGBM `_.
From 543d2803d11c5851ccd9717f00b430561b78ad43 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 3 Aug 2023 11:27:04 +0200
Subject: [PATCH 08/52] Change title
---
examples/ensemble/plot_hgbt_regression.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 42e516fb96cbf..1fedd06f9af21 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -1,7 +1,7 @@
"""
-===========================================================
-Decision Tree Regression with HistGradientBoostingRegressor
-===========================================================
+==============================================================
+Use case of advanced features in Histogram Boosting Regression
+==============================================================
:ref:`histogram_based_gradient_boosting` (HGBT) models can be a competitive
alternative to random forests, especially when the number of samples is larger
From b77ab5c152c669282a42e43b8c155a3c02841038 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Thu, 7 Sep 2023 11:43:37 +0200
Subject: [PATCH 09/52] Apply suggestions from code review
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_hgbt_regression.py | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 1fedd06f9af21..b07ae7dcca05f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -3,20 +3,25 @@
Use case of advanced features in Histogram Boosting Regression
==============================================================
-:ref:`histogram_based_gradient_boosting` (HGBT) models can be a competitive
-alternative to random forests, especially when the number of samples is larger
+:ref:`histogram_based_gradient_boosting` (HGBT) may be the most useful supervised learning model in scikit-learn. It is a modern gradient boosting implementation
+comparable to LightGBM and XGBoost. As such, it is more feature rich than and often
+outperforms alternative models like random forests, especially when the number of samples is larger
than tens of thousands of samples (see
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
-HGBT models have additional advantages such as:
+The top usability features of HGBT models are:
- :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
- :ref:`Quantile loss support <quantile_support_hgbdt>`.
- :ref:`monotonic_cst_gbdt`.
+- :ref:`_interaction_cst_hgbt`.
+- early stopping
-This example aims at showcasing the last three points in a real setting.
+Note that random forests have none of those capabilities.
+
+This example aims at showcasing points 2-4 in a real life setting.
"""
# %%
@@ -287,8 +292,8 @@
# Support for quantile loss
# =========================
#
-# The quantile loss in regression enables a view of the potential variability in
-# predictions. For instance, predicting the 5th and 95th percentiles can provide
+# The quantile loss in regression enables a view of the variability or uncertainty
+# of the target variable. For instance, predicting the 5th and 95th percentiles can provide
# a 90% prediction interval, i.e. the range within which we expect the true
# value to fall with 90% probability.
@@ -349,7 +354,7 @@
# - engineering more predictive features from the same data (see
# :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`).
#
-# Monotonic Constraints
+# Monotonic constraints
# =====================
#
# Given specific domain knowledge that requires the relationship between a
From 4689b0f890a6d455e9a74757b88821f0f69ecf90 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 7 Sep 2023 11:52:24 +0200
Subject: [PATCH 10/52] Iter on suggestions from code-review
---
examples/ensemble/plot_hgbt_regression.py | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index b07ae7dcca05f..afa1a5dac953b 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -3,10 +3,12 @@
Use case of advanced features in Histogram Boosting Regression
==============================================================
-:ref:`histogram_based_gradient_boosting` (HGBT) may be the most useful supervised learning model in scikit-learn. It is a modern gradient boosting implementation
-comparable to LightGBM and XGBoost. As such, it is more feature rich than and often
-outperforms alternative models like random forests, especially when the number of samples is larger
-than tens of thousands of samples (see
+:ref:`histogram_based_gradient_boosting` (HGBT) models may be the most useful
+supervised learning models in scikit-learn. They are based on a modern gradient
+boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models
+are more feature rich than, and often outperform, alternative models like
+random forests, especially when the number of samples is larger than tens of
+thousands of samples (see
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
The top usability features of HGBT models are:
@@ -292,10 +294,10 @@
# Support for quantile loss
# =========================
#
-# The quantile loss in regression enables a view of the variability or uncertainty
-# of the target variable. For instance, predicting the 5th and 95th percentiles can provide
-# a 90% prediction interval, i.e. the range within which we expect the true
-# value to fall with 90% probability.
+# The quantile loss in regression enables a view of the variability or
+# uncertainty of the target variable. For instance, predicting the 5th and 95th
+# percentiles can provide a 90% prediction interval, i.e. the range within which
+# we expect the true value to fall with 90% probability.
from sklearn.metrics import make_scorer, mean_pinball_loss
From 86f8f6785161a6ceea99889d3fd40eb85648f5e5 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 3 Oct 2023 12:18:51 +0200
Subject: [PATCH 11/52] Remove comment that will no longer be true in v1.4
---
examples/ensemble/plot_hgbt_regression.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index afa1a5dac953b..551ddf1e5ff63 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -21,8 +21,6 @@
- :ref:`_interaction_cst_hgbt`.
- early stopping
-Note that random forests have none of those capabilities.
-
This example aims at showcasing points 2-4 in a real life setting.
"""
From 35c065ad56b8bccf1496c2842ff8cf84d55a8ef6 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 3 Oct 2023 12:26:55 +0200
Subject: [PATCH 12/52] Address comment from Christian on calibration
---
examples/ensemble/plot_hgbt_regression.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 551ddf1e5ff63..8e9935b9efa0e 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -346,7 +346,8 @@
_ = ax.legend()
# %%
-# Keep in mind that one can still improve the calibration of our model by:
+# Keep in mind that the predicted percentiles are just estimations that depend
+# on the model. One can still improve the quality of such estimations by:
#
# - collecting more data-points (in case the model is overfitting);
# - better tuning of the model hyperparameters (see
From c3e01fc768da1aa2a775c45e3655399fcae74878 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 3 Oct 2023 12:39:03 +0200
Subject: [PATCH 13/52] Address comment from Christian on bias
---
examples/ensemble/plot_hgbt_regression.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 8e9935b9efa0e..85eacafd09f37 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -361,8 +361,9 @@
# Given specific domain knowledge that requires the relationship between a
# feature and the target to be monotonically increasing or decreasing, one can
# enforce such behaviour in the predictions of an HGBT model using monotonic
-# constraints. This makes the model more interpretable and prevents overfitting.
-# Monotonic constraints can also be used to enforce specific regulatory
+# constraints. This makes the model more interpretable and can reduce its
+# variance (and potentially mitigate overfitting) at the risk of increasing
+# bias. Monotonic constraints can also be used to enforce specific regulatory
# requirements, ensure compliance and align with ethical considerations.
#
# In the present example, the policy of transferring energy from Victoria to New
From 093b8dd903132e4201f7b8ed3c4f82fba654c888 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 4 Oct 2023 14:13:14 +0200
Subject: [PATCH 14/52] Apply suggestions from code review
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_hgbt_regression.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 85eacafd09f37..73aacb2a0356f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -16,7 +16,7 @@
- :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-- :ref:`Quantile loss support <quantile_support_hgbdt>`.
+- Support for several losses such as the :ref:`Quantile loss <quantile_support_hgbdt>`.
- :ref:`monotonic_cst_gbdt`.
- :ref:`_interaction_cst_hgbt`.
- early stopping
@@ -346,10 +346,10 @@
_ = ax.legend()
# %%
-# Keep in mind that the predicted percentiles are just estimations that depend
-# on the model. One can still improve the quality of such estimations by:
+# Keep in mind that those predicted percentiles are just estimations from a
+# model. One can still improve the quality of such estimations by:
#
-# - collecting more data-points (in case the model is overfitting);
+# - collecting more data-points;
# - better tuning of the model hyperparameters (see
# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`);
# - engineering more predictive features from the same data (see
From ff2888f02ac7dfa4034ffa77204c9f230da80bb2 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 4 Oct 2023 14:15:48 +0200
Subject: [PATCH 15/52] Iter on suggestions
---
examples/ensemble/plot_hgbt_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 73aacb2a0356f..df4cf837f8ef4 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -18,7 +18,7 @@
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
- Support for several losses such as the :ref:`Quantile loss <quantile_support_hgbdt>`.
- :ref:`monotonic_cst_gbdt`.
-- :ref:`_interaction_cst_hgbt`.
+- :ref:`interaction_cst_hgbt`.
- early stopping
This example aims at showcasing points 2-4 in a real life setting.
From 74719599c77be61883e56331336de624edd975c3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 14:36:19 +0200
Subject: [PATCH 16/52] Silence warning from DataFrame.groupby
---
examples/ensemble/plot_hgbt_regression.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index df4cf837f8ef4..c9c031ea11125 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -90,7 +90,7 @@
max_iter_list = [10, 50]
fig, ax = plt.subplots(figsize=(10, 5))
-average_week_demand = df.groupby(["day", "period"])["transfer"].mean()
+average_week_demand = df.groupby(["day", "period"], observed=False)["transfer"].mean()
average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
@@ -99,7 +99,9 @@
y_pred = hgbt.predict(X)
prediction_df = df.copy()
prediction_df["y_pred"] = y_pred
- average_pred = prediction_df.groupby(["day", "period"])["y_pred"].mean()
+ average_pred = prediction_df.groupby(["day", "period"], observed=False)[
+ "y_pred"
+ ].mean()
average_pred.plot(
color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax
)
From 9a486b896a5661ddcc77db31601318303b131310 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 16:04:50 +0200
Subject: [PATCH 17/52] Add discussion on early stopping
---
examples/ensemble/plot_hgbt_regression.py | 67 +++++++++++++++++++----
1 file changed, 56 insertions(+), 11 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index c9c031ea11125..7553eb631ede2 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -15,13 +15,13 @@
- :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+- Early stopping.
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
- Support for several losses such as the :ref:`Quantile loss `.
- :ref:`monotonic_cst_gbdt`.
- :ref:`interaction_cst_hgbt`.
-- early stopping
-This example aims at showcasing points 2-4 in a real life setting.
+This example aims at showcasing points 2-5 in a real life setting.
"""
# %%
@@ -78,8 +78,8 @@
# %%
# Notice that the energy transfer increases systematically during weekends.
#
-# Effect of number of trees in HistGradientBoostingRegressor
-# ==========================================================
+# Effect of number of trees and early stopping
+# ============================================
# For the sake of illustrating the effect of the (maximum) number of trees, we
# train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the
# daily electricity transfer using the whole dataset. Then we visualize its
@@ -118,6 +118,52 @@
# With just a few iterations, HGBT models can achieve convergence (see
# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
#
+# Instead of relying solely on `max_iter` to determine when to stop, the HGBT
+# implementations in scikit-learn support early stopping. With it, the model
+# uses a fraction of the training data as a validation set
+# (`validation_fraction`) and stops training if the validation score does not
+# improve (or degrades) after `n_iter_no_change` iterations up to a certain
+# `tol`.
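+#
+# A minimal sketch of these knobs (the values below are illustrative
+# assumptions, not tuned for this dataset)::
+#
+#     HistGradientBoostingRegressor(
+#         early_stopping=True,
+#         validation_fraction=0.2,  # hold out 20% of the training data
+#         n_iter_no_change=10,      # stop after 10 non-improving iterations
+#         tol=1e-7,                 # minimum improvement to keep training
+#     )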
+#
+# Notice that there is a trade-off between `learning_rate` and `max_iter`:
+# Generally, smaller learning rates require more iterations to converge to the
+# minimum loss, while larger learning rates might converge faster but are at
+# risk of overfitting.
+#
+# Indeed, a good practice is to tune the learning rate along with any other
+# hyperparameters, fit the HGBT on the training set with a large enough value
+# for `max_iter` and determine the best `max_iter` via early stopping and some
+# explicit `validation_fraction`.
+
+common_params = {
+ "max_iter": 1_000,
+ "learning_rate": 0.3,
+ "validation_fraction": 0.2,
+ "random_state": 42,
+ "scoring": "neg_root_mean_squared_error",
+}
+
+hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
+hgbt.fit(X, y)
+plt.plot(-hgbt.validation_score_)
+plt.xlabel("number of iterations")
+plt.ylabel("root mean squared error")
+_ = plt.title(f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})")
+
+# %%
+# We can then overwrite the value for `max_iter` to a reasonable value and avoid
+# the extra computational cost of the inner validation. In this case, rounding
+# up the number of iterations to 600 may account for variability of the training
+# set:
+
+common_params["max_iter"] = 600
+common_params["early_stopping"] = False
+hgbt = HistGradientBoostingRegressor(**common_params)
+
+# %%
+# .. note:: The inner validation done during early stopping is not optimal for
+# time series with the implementation as of scikit-learn v1.3.
+#
# Support for missing values
# ==========================
# HGBT models have native support for missing values. During training, the tree
@@ -148,7 +194,6 @@
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
-hgbt = HistGradientBoostingRegressor()
for missing_fraction in missing_fraction_list:
num_missing_cells = int(total_cells * missing_fraction)
@@ -203,7 +248,6 @@
X["vicprice"] = X["vicprice"].mask(mask, np.nan)
X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
- hgbt = HistGradientBoostingRegressor()
hgbt.fit(X.iloc[train_0], y.iloc[train_0])
hgbt_predictions = hgbt.predict(X.iloc[test_0])
cv_results = cross_validate(
@@ -262,7 +306,6 @@
X["vicprice"] = X["vicprice"].mask(mask, np.nan)
X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
- hgbt = HistGradientBoostingRegressor()
hgbt.fit(X.iloc[train_0], y.iloc[train_0])
hgbt_predictions = hgbt.predict(X.iloc[test_0])
cv_results = cross_validate(
@@ -309,13 +352,15 @@
ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
for quantile in quantiles:
- hgbt = HistGradientBoostingRegressor(loss="quantile", quantile=quantile)
- hgbt.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt.predict(X.iloc[test_0])
+ hgbt_quantile = HistGradientBoostingRegressor(
+ loss="quantile", quantile=quantile, **common_params
+ )
+ hgbt_quantile.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt_quantile.predict(X.iloc[test_0])
predictions.append(hgbt_predictions)
cv_results = cross_validate(
- hgbt,
+ hgbt_quantile,
X,
y,
cv=ts_cv,
From 822f3db0a0b4ae538823215242890b8910708bf9 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 16:05:20 +0200
Subject: [PATCH 18/52] Wording
---
examples/ensemble/plot_hgbt_regression.py | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 7553eb631ede2..706532edd5d86 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -425,6 +425,8 @@
# - 1: monotonic increase
# - 0: no constraint
# - -1: monotonic decrease
+#
+# Otherwise, one can pass an array-like encoding the above convention by position.
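+#
+# As a sketch, assuming the feature order `date`, `day`, `period`, `nswprice`,
+# `nswdemand`, `vicprice`, `vicdemand`, the positional equivalent of the
+# dictionary used below would be::
+#
+#     monotonic_cst = [0, 0, 0, 1, 1, -1, -1]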
from sklearn.inspection import PartialDependenceDisplay
@@ -472,8 +474,8 @@
_ = plt.legend()
# %%
-# Indeed, we can verify that the predictive quality of the model is not degraded
-# by introducing the monotonic constraints:
+# Indeed, we can verify that the predictive quality of the model is not
+# significantly degraded by introducing the monotonic constraints:
cv_results = cross_validate(
gbdt_no_cst,
@@ -494,3 +496,8 @@
)
rmse = -cv_results["test_score"]
print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
+
+# %%
+# That being said, notice the comparison is between two different models that
+# may be optimized by a different combination of hyperparameters. That is the
+# reason why we do not use the `common_params` in this section as done before.
From 97cf6426a566a241dc72b28477c19a63f4ed7360 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 16:10:13 +0200
Subject: [PATCH 19/52] Rename instances of hgbt
---
examples/ensemble/plot_hgbt_regression.py | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 706532edd5d86..a8d50a5a4f8a7 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -439,33 +439,33 @@
"vicdemand": -1,
"vicprice": -1,
}
-gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
-gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
+hgbt_no_cst = HistGradientBoostingRegressor().fit(X, y)
+hgbt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
fig, ax = plt.subplots(nrows=2, figsize=(15, 10))
disp = PartialDependenceDisplay.from_estimator(
- gbdt_no_cst,
+ hgbt_no_cst,
X,
features=["nswdemand", "nswprice"],
line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
ax=ax[0],
)
PartialDependenceDisplay.from_estimator(
- gbdt_cst,
+ hgbt_cst,
X,
features=["nswdemand", "nswprice"],
line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
ax=disp.axes_,
)
disp = PartialDependenceDisplay.from_estimator(
- gbdt_no_cst,
+ hgbt_no_cst,
X,
features=["vicdemand", "vicprice"],
line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
ax=ax[1],
)
PartialDependenceDisplay.from_estimator(
- gbdt_cst,
+ hgbt_cst,
X,
features=["vicdemand", "vicprice"],
line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
@@ -478,7 +478,7 @@
# significantly degraded by introducing the monotonic constraints:
cv_results = cross_validate(
- gbdt_no_cst,
+ hgbt_no_cst,
X,
y,
cv=ts_cv,
@@ -488,7 +488,7 @@
print(f"RMSE without constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
cv_results = cross_validate(
- gbdt_cst,
+ hgbt_cst,
X,
y,
cv=ts_cv,
From 60d8f6118b87cfbd3e336e773a28f59de05714b4 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 16:17:08 +0200
Subject: [PATCH 20/52] Remove distinction on type of missingness
---
examples/ensemble/plot_hgbt_regression.py | 114 +---------------------
1 file changed, 4 insertions(+), 110 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index a8d50a5a4f8a7..cde47747de12d 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -173,12 +173,10 @@
# values during training, samples with missing values for that feature are sent
# to the child with the most samples.
#
-# Missing Completely At Random (MCAR)
-# -----------------------------------
-#
-# The missingness does not depend on the observed data or the unobserved data.
-# It's completely random. We can simulate such scenario by randomly replacing
-# values from randomly selected features with `Nan` values.
+# The present example shows how HGBT regressions deal with values missing
+# completely at random (MCAR), i.e. the missingness does not depend on the
+# observed data or the unobserved data. We can simulate such a scenario by
+# randomly replacing values from randomly selected features with `Nan` values.
import numpy as np
@@ -229,110 +227,6 @@
)
_ = ax.legend()
-# %%
-# Missing At Random (MAR)
-# -----------------------
-#
-# The missingness depends on the observed data but never on unobserved data.
-# Here, the missingness in "vicdemand" is set to depend on the value of the
-# observed feature "nswprice".
-
-missing_fraction_list = [0, 0.5, 1.0]
-
-fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
-
-for missing_fraction in missing_fraction_list:
- X = df.drop(columns=["transfer", "class"])
- mask = X["nswprice"] < X["nswprice"].quantile(missing_fraction)
- X["vicprice"] = X["vicprice"].mask(mask, np.nan)
- X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
-
- hgbt.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt.predict(X.iloc[test_0])
- cv_results = cross_validate(
- hgbt,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
- )
- rmse = -cv_results["test_score"]
- ax.plot(
- hgbt_predictions[last_days],
- label=(
- f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
- f" {rmse.std():.2f}"
- ),
- alpha=0.5,
- )
-ax.set(
- title="Daily energy transfer predictions on data with MAR values",
- xticks=[(i + 0.25) * 48 for i in range(4)],
- xticklabels=["Tue", "Wed", "Thu", "Fri"],
- xlabel="Time of the week",
- ylabel="Normalized energy transfer",
-)
-_ = ax.legend()
-
-# %%
-# In this case the features are highly correlated and therefore MAR values
-# do not degrade the predictivity of the model even when completely removing
-# the feature "vicprice".
-#
-# Missing Not At Random (MNAR)
-# ----------------------------
-#
-# The missingness depends on the unobserved data. In particular, if the
-# probability of a value being missing in a variable is dependent on the values
-# of that variable itself. Here, we set the missingness to depend on the
-# unobserved feature "class".
-
-import pandas as pd
-
-fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
-
-for missing_fraction in missing_fraction_list:
- X = df.drop(columns=["transfer", "class"]) # reset X
- mask = df["class"] == "DOWN"
- true_indices = mask[mask].index
- n_keep = int(len(true_indices) * missing_fraction)
- keep_indices = np.random.choice(true_indices, size=n_keep, replace=False)
- mask = pd.Series(False, index=mask.index)
-
- # Set the randomly selected true indices to True in the new mask
- mask.loc[keep_indices] = True
- X["vicprice"] = X["vicprice"].mask(mask, np.nan)
- X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
-
- hgbt.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt.predict(X.iloc[test_0])
- cv_results = cross_validate(
- hgbt,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
- )
- rmse = -cv_results["test_score"]
- ax.plot(
- hgbt_predictions[last_days],
- label=(
- f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
- f" {rmse.std():.2f}"
- ),
- alpha=0.5,
- )
-ax.set(
- title="Daily energy transfer predictions on data with MNAR values",
- xticks=[(i + 0.25) * 48 for i in range(4)],
- xticklabels=["Tue", "Wed", "Thu", "Fri"],
- xlabel="Time of the week",
- ylabel="Normalized energy transfer",
-)
-_ = ax.legend()
-
# %%
# Support for quantile loss
# =========================
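
The MCAR mechanism kept by this patch can be sketched as a small helper. `inject_mcar` and the toy frame below are illustrative assumptions (a purely numeric DataFrame), not code from the example itself.

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)


def inject_mcar(X, missing_fraction):
    """Replace a fraction of individual cells with NaN, chosen uniformly at
    random and hence independently of observed or unobserved values."""
    values = X.to_numpy(dtype=float, copy=True)
    n_missing = int(values.size * missing_fraction)
    rows = rng.choice(values.shape[0], n_missing)
    cols = rng.choice(values.shape[1], n_missing)
    values[rows, cols] = np.nan  # paired (row, col) cells
    return pd.DataFrame(values, index=X.index, columns=X.columns)


X_toy = pd.DataFrame(rng.uniform(size=(6, 3)), columns=["a", "b", "c"])
print(inject_mcar(X_toy, missing_fraction=0.2))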
From 8799932ad3bd8e19827bf3215adb73aafb7ce994 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Thu, 2 Nov 2023 11:17:11 +0100
Subject: [PATCH 21/52] Apply suggestions from code review
Co-authored-by: Guillaume Lemaitre
---
examples/ensemble/plot_hgbt_regression.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index cde47747de12d..6e548a54329ad 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -24,11 +24,11 @@
This example aims at showcasing points 2-5 in a real life setting.
"""
-# %%
+
# Author: Arturo Amor
-#
# License: BSD 3 clause
-#
+
+# %%
# Preparing the data
# ==================
# The `electricity dataset `_ consists of data
@@ -40,7 +40,7 @@
# The dataset (originally named ELEC2) contains 45,312 instances dated from 7
# May 1996 to 5 December 1998. Each example of the dataset refers to a period of
# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
-# example on the dataset has 5 fields, the day of week, the time stamp, the New
+# example on the dataset has 5 fields: the day of week, the time stamp, the New
# South Wales electricity demand, the Victoria electricity demand. It is
# originally a classification task, but here we use it as a regression where the
# target is the scheduled electricity transfer between states.
@@ -151,7 +151,7 @@
_ = plt.title(f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})")
# %%
-# We can then overwrite the value for `max_iter` to a razonable value and avoid
+# We can then overwrite the value for `max_iter` to a reasonable value and avoid
# the extra computational cost of the inner validation. In this case, rounding
# up the number of iterations to 600 may account for variability of the training
# set:
From c3c883cbb32d3309229f4c54a9239633d8cabac0 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 2 Nov 2023 11:20:06 +0100
Subject: [PATCH 22/52] Use numbered list
---
examples/ensemble/plot_hgbt_regression.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 6e548a54329ad..70f063c73b9be 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -13,13 +13,14 @@
The top usability features of HGBT models are:
-- :ref:`categorical_support_gbdt` (see
+1. :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
-- Early stopping.
-- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-- Support for several losses such as the :ref:`Quantile loss `.
-- :ref:`monotonic_cst_gbdt`.
-- :ref:`interaction_cst_hgbt`.
+1. Early stopping.
+1. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+1. Support for several losses such as the :ref:`Quantile loss
+ `.
+1. :ref:`monotonic_cst_gbdt`.
+1. :ref:`interaction_cst_hgbt`.
This example aims at showcasing points 2-5 in a real life setting.
"""
From 26ddf3baff54384447621abab683dc34c711d71e Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 9 Nov 2023 16:29:27 +0100
Subject: [PATCH 23/52] Prefer lineplot instead of pairplot
---
examples/ensemble/plot_hgbt_regression.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 70f063c73b9be..a9d4c35b8dc39 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -65,13 +65,11 @@
colors = sns.color_palette("colorblind")
fig, ax = plt.subplots(figsize=(15, 10))
-pointplot = sns.pointplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
+pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
handles, labels = ax.get_legend_handles_labels()
ax.set(
title="Hourly energy transfer for different days of the week",
- xticks=[i * 2 for i in range(24)],
- xticklabels=list(range(24)),
- xlabel="Time of the day",
+ xlabel="Normalized time of the day",
ylabel="Normalized energy transfer",
)
_ = ax.legend(handles, ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])
From 4d700387724bc7d7d37645c994d52734227831a6 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 9 Nov 2023 16:31:34 +0100
Subject: [PATCH 24/52] Prefer sample over example
---
examples/ensemble/plot_hgbt_regression.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index a9d4c35b8dc39..9d215892c744d 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -39,9 +39,9 @@
# Victoria were done to alleviate fluctuations.
#
# The dataset (originally named ELEC2) contains 45,312 instances dated from 7
-# May 1996 to 5 December 1998. Each example of the dataset refers to a period of
+# May 1996 to 5 December 1998. Each sample of the dataset refers to a period of
# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
-# example on the dataset has 5 fields: the day of week, the time stamp, the New
+# sample on the dataset has 5 fields: the day of week, the time stamp, the New
# South Wales electricity demand, the Victoria electricity demand. It is
# originally a classification task, but here we use it as a regression where the
# target is the scheduled electricity transfer between states.
From 5b0dcfd175b80d13a6203172a1cbd713f0c3c3ca Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 10 Nov 2023 11:51:20 +0100
Subject: [PATCH 25/52] Remove stepwise constant piece of dataset
---
examples/ensemble/plot_hgbt_regression.py | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 9d215892c744d..e79f19ceb1335 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -25,7 +25,6 @@
This example aims at showcasing points 2-5 in a real life setting.
"""
-
# Author: Arturo Amor
# License: BSD 3 clause
@@ -57,13 +56,23 @@
X
# %%
-# Let us explore the hourly electricity transfer over different days of the week:
+# This particular dataset has a stepwise constant target for the first 17,760
+# samples:
+
+y[:17760].unique()
+
+# %%
+# Let us drop those entries and explore the hourly electricity transfer over
+# different days of the week:
import matplotlib.pyplot as plt
import seaborn as sns
colors = sns.color_palette("colorblind")
+X = X.iloc[17760:]
+y = y.iloc[17760:]
+
fig, ax = plt.subplots(figsize=(15, 10))
pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
handles, labels = ax.get_legend_handles_labels()
From 29146ae8e63dbed075b103847158653a1e24c62f Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 10 Nov 2023 11:52:25 +0100
Subject: [PATCH 26/52] Plot predictions on unseen data
---
examples/ensemble/plot_hgbt_regression.py | 25 +++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index e79f19ceb1335..0babb87f56b4e 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -91,21 +91,29 @@
# For the sake of illustrating the effect of the (maximum) number of trees, we
# train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the
# daily electricity transfer using the whole dataset. Then we visualize its
-# predictions depending on the `max_iter` parameter.
+# predictions depending on the `max_iter` parameter. Here we don't try to
+# evaluate the performance of the model and its capacity to generalize but
+# rather its capacity to learn from the training data.
from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.model_selection import train_test_split
-max_iter_list = [10, 50]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
+
+max_iter_list = [5, 50]
fig, ax = plt.subplots(figsize=(10, 5))
-average_week_demand = df.groupby(["day", "period"], observed=False)["transfer"].mean()
-average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax)
+average_week_demand = (
+ df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
+)
+average_week_demand.plot(color=colors[0], label="ground truth", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
- hgbt.fit(X, y)
- y_pred = hgbt.predict(X)
- prediction_df = df.copy()
+ hgbt.fit(X_train, y_train)
+
+ y_pred = hgbt.predict(X_test)
+ prediction_df = df.loc[X_test.index].copy()
prediction_df["y_pred"] = y_pred
average_pred = prediction_df.groupby(["day", "period"], observed=False)[
"y_pred"
@@ -113,8 +121,9 @@
average_pred.plot(
color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax
)
+
ax.set(
- title="Average daily energy transfer during the week",
+ title="Predicted average energy transfer during the week",
xticks=[(i + 0.2) * 48 for i in range(7)],
xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
xlabel="Time of the week",
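
As a quick aside, the order-preserving behavior of `shuffle=False` used in this patch can be checked on a toy index array; this sketch is illustrative only.

import numpy as np
from sklearn.model_selection import train_test_split

# With shuffle=False the split is a chronological cut: the test set is
# simply the last 40% of the rows, as a temporal hold-out requires.
idx = np.arange(10)
idx_train, idx_test = train_test_split(idx, test_size=0.4, shuffle=False)
print(idx_train)  # [0 1 2 3 4 5]
print(idx_test)   # [6 7 8 9]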
From 25978aeff96746c6a01fd1efdc8e68f18167aac3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 12:01:40 +0100
Subject: [PATCH 27/52] Refactor code
---
examples/ensemble/plot_hgbt_regression.py | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 0babb87f56b4e..a01806845bc0f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -68,10 +68,9 @@
import matplotlib.pyplot as plt
import seaborn as sns
-colors = sns.color_palette("colorblind")
-
-X = X.iloc[17760:]
-y = y.iloc[17760:]
+df = electricity.frame.iloc[17760:]
+X = df.drop(columns=["transfer", "class"])
+y = df["transfer"]
fig, ax = plt.subplots(figsize=(15, 10))
pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
@@ -98,14 +97,14 @@
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
-
max_iter_list = [5, 50]
-
-fig, ax = plt.subplots(figsize=(10, 5))
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
average_week_demand = (
df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
)
+
+colors = sns.color_palette("colorblind")
+fig, ax = plt.subplots(figsize=(10, 5))
average_week_demand.plot(color=colors[0], label="ground truth", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
From 16a19b124f8475c0ceaab24e919e8add3b9e8d21 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 12:02:17 +0100
Subject: [PATCH 28/52] Use train set for determining max_iter
---
examples/ensemble/plot_hgbt_regression.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index a01806845bc0f..c4ddadd9fffdf 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -160,7 +160,7 @@
}
hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
-hgbt.fit(X, y)
+hgbt.fit(X_train, y_train)
plt.plot(-hgbt.validation_score_)
plt.xlabel("number of iterations")
plt.ylabel("root mean squared error")
@@ -169,10 +169,10 @@
# %%
# We can then overwrite the value for `max_iter` to a reasonable value and avoid
# the extra computational cost of the inner validation. In this case, rounding
-# up the number of iterations to 600 may account for variability of the training
+# up the number of iterations to 400 may account for variability of the training
# set:
-common_params["max_iter"] = 600
+common_params["max_iter"] = 400
common_params["early_stopping"] = False
hgbt = HistGradientBoostingRegressor(**common_params)
From 70c021f2279b8d202be5b3cd6a68d88eee9dbf38 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 16:38:03 +0100
Subject: [PATCH 29/52] Use test set for plots and add generate_missing_values
function
---
examples/ensemble/plot_hgbt_regression.py | 101 +++++++++-------------
1 file changed, 43 insertions(+), 58 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index c4ddadd9fffdf..607041397e38f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -196,52 +196,44 @@
import numpy as np
-from sklearn.model_selection import TimeSeriesSplit, cross_validate
+from sklearn.metrics import root_mean_squared_error
+
+rng = np.random.RandomState(42)
+first_week = slice(0, 336) # first week in the test set as 7 * 48 = 336
+missing_fraction_list = [0, 0.02, 0.05]
-np.random.seed(42)
-ts_cv = TimeSeriesSplit(n_splits=5, gap=48, max_train_size=10000, test_size=1000)
-train_0, test_0 = next(ts_cv.split(df))
-last_days = slice(-192, None)
-total_cells = X.shape[0] * X.shape[1]
-missing_fraction_list = [0, 0.01, 0.03]
+def generate_missing_values(X, missing_fraction):
+ total_cells = X.shape[0] * X.shape[1]
+ num_missing_cells = int(total_cells * missing_fraction)
+ row_indices = rng.choice(X.shape[0], num_missing_cells, replace=True)
+ col_indices = rng.choice(X.shape[1], num_missing_cells, replace=True)
+ X_missing = X.copy()
+ X_missing.iloc[row_indices, col_indices] = np.nan
+ return X_missing
+
fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+ax.plot(y_test.values[first_week], label="Actual transfer")
for missing_fraction in missing_fraction_list:
- num_missing_cells = int(total_cells * missing_fraction)
- row_indices = np.random.choice(X.shape[0], num_missing_cells, replace=True)
- col_indices = np.random.choice(X.shape[1], num_missing_cells, replace=True)
- X = df.drop(columns=["transfer", "class"])
- X.iloc[row_indices, col_indices] = np.nan
-
- hgbt.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt.predict(X.iloc[test_0])
- cv_results = cross_validate(
- hgbt,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
- )
- rmse = -cv_results["test_score"]
+ X_missing = generate_missing_values(X_train, missing_fraction)
+ hgbt.fit(X_missing, y_train)
+ y_pred = hgbt.predict(X_test[first_week])
+ rmse = root_mean_squared_error(y_test[first_week], y_pred)
ax.plot(
- hgbt_predictions[last_days],
- label=(
- f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
- f" {rmse.std():.2f}"
- ),
+ y_pred[first_week],
+ label=f"missing_fraction={missing_fraction}, RMSE={rmse:.2f}",
alpha=0.5,
)
ax.set(
title="Daily energy transfer predictions on data with MCAR values",
- xticks=[(i + 0.25) * 48 for i in range(4)],
- xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xticks=[(i + 0.2) * 48 for i in range(7)],
+ xticklabels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
xlabel="Time of the week",
ylabel="Normalized energy transfer",
)
-_ = ax.legend()
+_ = ax.legend(loc="lower right")
# %%
# Support for quantile loss
@@ -252,55 +244,44 @@
# percentiles can provide a 90% prediction interval, i.e. the range within which
# we expect the true value to fall with 90% probability.
-from sklearn.metrics import make_scorer, mean_pinball_loss
+from sklearn.metrics import mean_pinball_loss
quantiles = [0.95, 0.05]
predictions = []
-X = df.drop(columns=["transfer", "class"]) # reset X
fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+ax.plot(y_test.values[first_week], label="Actual transfer")
for quantile in quantiles:
hgbt_quantile = HistGradientBoostingRegressor(
loss="quantile", quantile=quantile, **common_params
)
- hgbt_quantile.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt_quantile.predict(X.iloc[test_0])
-
- predictions.append(hgbt_predictions)
- cv_results = cross_validate(
- hgbt_quantile,
- X,
- y,
- cv=ts_cv,
- scoring=make_scorer(mean_pinball_loss, alpha=quantile),
- )
- score = cv_results["test_score"]
+ hgbt_quantile.fit(X_train, y_train)
+ y_pred = hgbt_quantile.predict(X_test[first_week])
+
+ predictions.append(y_pred)
+ score = mean_pinball_loss(y_test[first_week], y_pred)
ax.plot(
- hgbt_predictions[last_days],
- label=(
- f"quantile={quantile}, pinball loss={score.mean():.3f} +/-"
- f" {score.std():.3f}"
- ),
+ y_pred[first_week],
+ label=f"quantile={quantile}, pinball loss={score:.2f}",
alpha=0.5,
)
ax.fill_between(
- range(len(predictions[0][last_days])),
- predictions[0][last_days],
- predictions[1][last_days],
+ range(len(predictions[0][first_week])),
+ predictions[0][first_week],
+ predictions[1][first_week],
color=colors[0],
alpha=0.1,
)
ax.set(
title="Daily energy transfer predictions with quantile loss",
- xticks=[(i + 0.25) * 48 for i in range(4)],
- xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xticks=[(i + 0.2) * 48 for i in range(7)],
+ xticklabels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
xlabel="Time of the week",
ylabel="Normalized energy transfer",
)
-_ = ax.legend()
+_ = ax.legend(loc="lower right")
# %%
# Keep in mind that those predicted percentiles are just estimations from a
@@ -387,6 +368,10 @@
# Indeed, we can verify that the predictive quality of the model is not
# significantly degraded by introducing the monotonic constraints:
+from sklearn.model_selection import TimeSeriesSplit, cross_validate
+
+ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)
+
cv_results = cross_validate(
hgbt_no_cst,
X,
From 5cf52c27b3a328a0e65638036e9bd5db470472a7 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 16:44:53 +0100
Subject: [PATCH 30/52] Reference the problem of coverage
---
examples/ensemble/plot_gradient_boosting_quantile.py | 1 +
examples/ensemble/plot_hgbt_regression.py | 3 +++
2 files changed, 4 insertions(+)
diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py
index 41378db704600..a01f0d2d1e8b6 100644
--- a/examples/ensemble/plot_gradient_boosting_quantile.py
+++ b/examples/ensemble/plot_gradient_boosting_quantile.py
@@ -192,6 +192,7 @@ def highlight_min(x):
# (underestimation for this asymmetric noise) but is also naturally robust to
# outliers and overfits less.
#
+# .. _calibration-section:
# Calibration of the confidence interval
# --------------------------------------
#
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 607041397e38f..66ebc598c40b5 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -284,6 +284,9 @@ def generate_missing_values(X, missing_fraction):
_ = ax.legend(loc="lower right")
# %%
+# We observe a tendency to over-estimate the energy transfer. This could be
+# quantitatively confirmed by computing empirical coverage numbers as done in
+# the :ref:`calibration of confidence intervals section `.
# Keep in mind that those predicted percentiles are just estimations from a
# model. One can still improve the quality of such estimations by:
#
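
The empirical coverage check referenced in this patch could be sketched as below. It assumes the variables from the quantile section of the example (`predictions` holding the 0.95 and 0.05 quantile predictions in that order, plus `y_test` and `first_week`), and illustrates the idea rather than reproducing code from the patch.

import numpy as np

# Fraction of observed transfers that fall inside the 90% interval
# formed by the 5th and 95th percentile predictions; values close to
# 0.9 would indicate well-calibrated intervals.
y_hi, y_lo = predictions  # appended with quantiles = [0.95, 0.05]
y_true = y_test.values[first_week]
coverage = np.mean((y_true >= y_lo) & (y_true <= y_hi))
print(f"Empirical coverage of the 90% interval: {coverage:.1%}")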
From 214a0838c2ae182b351dd1f9a25791a41ec9babe Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 16:46:33 +0100
Subject: [PATCH 31/52] Fix typo
---
examples/ensemble/plot_hgbt_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 66ebc598c40b5..19f14cc3551f9 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -396,6 +396,6 @@ def generate_missing_values(X, missing_fraction):
print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
# %%
-# That being said, notice the comparison is between to different models that may
+# That being said, notice the comparison is between two different models that may
# be optimized by a different combination of hyperparameters. That is the reason
# why we do no use the `common_params` in this section as done before.
From 64ff62960b53b36aea06a831ff76258b62258d94 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:45:34 +0100
Subject: [PATCH 32/52] Apply suggestions from code review
Co-authored-by: Guillaume Lemaitre
---
examples/ensemble/plot_hgbt_regression.py | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 19f14cc3551f9..fc8b07cce518a 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -14,11 +14,10 @@
The top usability features of HGBT models are:
1. :ref:`categorical_support_gbdt` (see
- :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
1. Early stopping.
1. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-1. Support for several losses such as the :ref:`Quantile loss
- `.
+1. Support for several losses such as the :ref:`Quantile loss `.
1. :ref:`monotonic_cst_gbdt`.
1. :ref:`interaction_cst_hgbt`.
@@ -59,7 +58,7 @@
# This particular dataset has a stepwise constant target for the first 17,760
# samples:
-y[:17760].unique()
+y[:17_760].unique()
# %%
# Let us drop those entries and explore the hourly electricity transfer over
@@ -192,7 +191,7 @@
# The present example shows how HGBT regressions deal with values missing
# completely at random (MCAR), i.e. the missingness does not depend on the
# observed data or the unobserved data. We can simulate such a scenario by
-# randomly replacing values from randomly selected features with `Nan` values.
+# randomly replacing values from randomly selected features with `nan` values.
import numpy as np
From 604283e0e7b1e6dd10a0db3376f0229f8a23def7 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 14 Nov 2023 17:13:20 +0100
Subject: [PATCH 33/52] Prefer ax instead of plt to plot
---
examples/ensemble/plot_hgbt_regression.py | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index fc8b07cce518a..1354751f6ba83 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -67,7 +67,7 @@
import matplotlib.pyplot as plt
import seaborn as sns
-df = electricity.frame.iloc[17760:]
+df = electricity.frame.iloc[17_760:]
X = df.drop(columns=["transfer", "class"])
y = df["transfer"]
@@ -160,10 +160,15 @@
hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
hgbt.fit(X_train, y_train)
+
+_, ax = plt.subplots()
plt.plot(-hgbt.validation_score_)
-plt.xlabel("number of iterations")
-plt.ylabel("root mean squared error")
-_ = plt.title(f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})")
+ax.set(
+ xlabel="number of iterations",
+ ylabel="root mean squared error",
+ title=f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})",
+)
+_ = ax.legend()
# %%
# We can then overwrite the value for `max_iter` to a reasonable value and avoid
From 11d165c614a5fd34b8854c91e4cc94f5de44fe3e Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 14 Nov 2023 17:31:12 +0100
Subject: [PATCH 34/52] Add brief interpretation of plot
---
examples/ensemble/plot_hgbt_regression.py | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 1354751f6ba83..54d5bf35c9f07 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -163,12 +163,11 @@
_, ax = plt.subplots()
plt.plot(-hgbt.validation_score_)
-ax.set(
+_ = ax.set(
xlabel="number of iterations",
ylabel="root mean squared error",
title=f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})",
)
-_ = ax.legend()
# %%
# We can then overwrite the value for `max_iter` to a reasonable value and avoid
@@ -204,7 +203,7 @@
rng = np.random.RandomState(42)
first_week = slice(0, 336) # first week in the test set as 7 * 48 = 336
-missing_fraction_list = [0, 0.02, 0.05]
+missing_fraction_list = [0, 0.01, 0.03]
def generate_missing_values(X, missing_fraction):
@@ -221,9 +220,10 @@ def generate_missing_values(X, missing_fraction):
ax.plot(y_test.values[first_week], label="Actual transfer")
for missing_fraction in missing_fraction_list:
- X_missing = generate_missing_values(X_train, missing_fraction)
- hgbt.fit(X_missing, y_train)
- y_pred = hgbt.predict(X_test[first_week])
+ X_train_missing = generate_missing_values(X_train, missing_fraction)
+ X_test_missing = generate_missing_values(X_test, missing_fraction)
+ hgbt.fit(X_train_missing, y_train)
+ y_pred = hgbt.predict(X_test_missing[first_week])
rmse = root_mean_squared_error(y_test[first_week], y_pred)
ax.plot(
y_pred[first_week],
@@ -240,6 +240,8 @@ def generate_missing_values(X, missing_fraction):
_ = ax.legend(loc="lower right")
# %%
+# As expected, the model degrades as the proportion of missing values increases.
+#
# Support for quantile loss
# =========================
#
From 3abb0c4658159a0ec2746b2a74dc54ac38d1f73b Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 21 Nov 2023 17:19:46 +0100
Subject: [PATCH 35/52] Revert use of numbered list
---
examples/ensemble/plot_hgbt_regression.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 54d5bf35c9f07..dd69757b5ed35 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -13,13 +13,13 @@
The top usability features of HGBT models are:
-1. :ref:`categorical_support_gbdt` (see
- :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
-1. Early stopping.
-1. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-1. Support for several losses such as the :ref:`Quantile loss `.
-1. :ref:`monotonic_cst_gbdt`.
-1. :ref:`interaction_cst_hgbt`.
+- :ref:`categorical_support_gbdt` (see
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+- Early stopping.
+- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+- Support for several losses such as the :ref:`Quantile loss `.
+- :ref:`monotonic_cst_gbdt`.
+- :ref:`interaction_cst_hgbt`.
This example aims at showcasing points 2-5 in a real life setting.
"""
From 7c8406820dfaeeb11f39dae1b2226860d183e78a Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Sat, 6 Jan 2024 20:22:17 +0100
Subject: [PATCH 36/52] Apply suggestions from code review
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_hgbt_regression.py | 58 +++++++++++------------
1 file changed, 29 insertions(+), 29 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index dd69757b5ed35..af43d8ee2ae24 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -1,27 +1,27 @@
"""
-=============================================================
-Usecase of advanced features in Histogram Boosting Regression
-=============================================================
+===================================================================
+Use cases of advanced features in Histogram Gradient Boosting Trees
+===================================================================
-:ref:`histogram_based_gradient_boosting` (HGBT) models may be the most useful
+:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most useful
supervised learning models in scikit-learn. They are based on a modern gradient
boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models
-are more feature rich than -and often outperforms- alternative models like
-random forests, especially when the number of samples is larger than tens of
-thousands of samples (see
+are more feature rich than and often outperform alternative models like
+random forests, especially when the number of samples is larger than some ten
+thousands (see
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
The top usability features of HGBT models are:
-- :ref:`categorical_support_gbdt` (see
+1. Several available loss function for mean and quantile regression tasks, see :ref:`Quantile loss `.
+2. :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
-- Early stopping.
-- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-- Support for several losses such as the :ref:`Quantile loss `.
-- :ref:`monotonic_cst_gbdt`.
-- :ref:`interaction_cst_hgbt`.
+3. Early stopping.
+4. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+5. :ref:`monotonic_cst_gbdt`.
+6. :ref:`interaction_cst_hgbt`.
-This example aims at showcasing points 2-5 in a real life setting.
+This example aims at showcasing all points except 2 and 6 in a real life setting.
"""
# Author: Arturo Amor
@@ -41,8 +41,8 @@
# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
# sample on the dataset has 5 fields: the day of week, the time stamp, the New
# South Wales electricity demand, the Victoria electricity demand. It is
-# originally a classification task, but here we use it as a regression where the
-# target is the scheduled electricity transfer between states.
+# originally a classification task, but here we use it for the regression task
+# to predict the scheduled electricity transfer between states.
from sklearn.datasets import fetch_openml
@@ -104,7 +104,7 @@
colors = sns.color_palette("colorblind")
fig, ax = plt.subplots(figsize=(10, 5))
-average_week_demand.plot(color=colors[0], label="ground truth", linewidth=2, ax=ax)
+average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
@@ -131,19 +131,19 @@
# %%
# With just a few iterations, HGBT models can achieve convergence (see
-# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
+# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`),
+# meaning that adding more trees does not improve the model anymore.
#
-# Instead of relying solely on `max_iter` to determine when to stop, the HGBT
-# implementations in scikit-learn support early stopping. With it, the model
-# uses a fraction of the training data as a validation set
+# Instead of relying on `max_iter` alone to determine when to stop, the HGBT
+# implementation in scikit-learn supports early stopping. With it, the model
+# uses a fraction of the training data as an internal validation set
# (`validation_fraction`) and stops training if the validation score does not
# improve (or degrades) after `n_iter_no_change` iterations up to a certain
# `tol`.
#
# Notice that there is a trade-off between `learning_rate` and `max_iter`:
-# Generally, smaller learning rates require more iterations to converge to the
-# minimum loss, while larger learning rates might converge faster but are at
-# risk of overfitting.
+# Generally, smaller learning rates are preferable but require more iterations to converge to the
+# minimum loss, while larger learning rates converge faster (less iterations/trees needed) but at the cost of a larger minimum loss.
#
# Indeed, a good practice is to tune the learning rate along with any other
# hyperparameters, fit the HGBT on the training set with a large enough value
@@ -181,7 +181,7 @@
# %%
# .. note:: The inner validation done during early stopping is not optimal for
-# time series with the implementation as of scikit-learn v1.3.
+# time series.
#
# Support for missing values
# ==========================
@@ -227,7 +227,7 @@ def generate_missing_values(X, missing_fraction):
rmse = root_mean_squared_error(y_test[first_week], y_pred)
ax.plot(
y_pred[first_week],
- label=f"missing_fraction={missing_fraction}, RMSE={rmse:.2f}",
+ label=f"missing_fraction={missing_fraction}, RMSE={rmse:.3f}",
alpha=0.5,
)
ax.set(
@@ -248,7 +248,7 @@ def generate_missing_values(X, missing_fraction):
# The quantile loss in regression enables a view of the variability or
# uncertainty of the target variable. For instance, predicting the 5th and 95th
# percentiles can provide a 90% prediction interval, i.e. the range within which
-# we expect the true value to fall with 90% probability.
+# we expect a new observed value to fall with 90% probability.
from sklearn.metrics import mean_pinball_loss
@@ -389,7 +389,7 @@ def generate_missing_values(X, missing_fraction):
scoring="neg_root_mean_squared_error",
)
rmse = -cv_results["test_score"]
-print(f"RMSE without constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
+print(f"RMSE without constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
cv_results = cross_validate(
hgbt_cst,
@@ -399,7 +399,7 @@ def generate_missing_values(X, missing_fraction):
scoring="neg_root_mean_squared_error",
)
rmse = -cv_results["test_score"]
-print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
+print(f"RMSE with constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
# %%
# That being said, notice the comparison is between two different models that may
From dcdf851d959884454739573dd2becc24dd9793ce Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Sat, 6 Jan 2024 20:48:04 +0100
Subject: [PATCH 37/52] Lint
---
examples/ensemble/plot_hgbt_regression.py | 29 +++++++++++++----------
1 file changed, 16 insertions(+), 13 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index af43d8ee2ae24..5be4014bcb95f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -3,17 +3,18 @@
Use cases of advanced features in Histogram Gradient Boosting Trees
===================================================================
-:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most useful
-supervised learning models in scikit-learn. They are based on a modern gradient
-boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models
-are more feature rich than and often outperform alternative models like
-random forests, especially when the number of samples is larger than some ten
-thousands (see
+:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most
+useful supervised learning models in scikit-learn. They are based on a modern
+gradient boosting implementation comparable to LightGBM and XGBoost. As such,
+HGBT models are more feature rich than and often outperform alternative models
+like random forests, especially when the number of samples is larger than some
+tens of thousands (see
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
The top usability features of HGBT models are:
-1. Several available loss function for mean and quantile regression tasks, see :ref:`Quantile loss `.
+1. Several available loss functions for mean and quantile regression tasks, see
+ :ref:`Quantile loss `.
2. :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
3. Early stopping.
@@ -21,7 +22,8 @@
5. :ref:`monotonic_cst_gbdt`.
6. :ref:`interaction_cst_hgbt`.
-This example aims at showcasing all points except 2 and 6 in a real life setting.
+This example aims at showcasing all points except 2 and 6 in a real life
+setting.
"""
# Author: Arturo Amor
@@ -142,8 +144,9 @@
# `tol`.
#
# Notice that there is a trade-off between `learning_rate` and `max_iter`:
-# Generally, smaller learning rates are preferable but require more iterations to converge to the
-# minimum loss, while larger learning rates converge faster (less iterations/trees needed) but at the cost of a larger minimum loss.
+# Generally, smaller learning rates are preferable but require more iterations
+# to converge to the minimum loss, while larger learning rates converge faster
+# (fewer iterations/trees needed) but at the cost of a larger minimum loss.
#
# Indeed, a good practice is to tune the learning rate along with any other
# hyperparameters, fit the HGBT on the training set with a large enough value
@@ -402,6 +405,6 @@ def generate_missing_values(X, missing_fraction):
print(f"RMSE with constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
# %%
-# That being said, notice the comparison is between two different models that may
-# be optimized by a different combination of hyperparameters. That is the reason
-# why we do no use the `common_params` in this section as done before.
+# That being said, notice the comparison is between two different models that
+# may be optimized by a different combination of hyperparameters. That is the
+# reason why we do not use the `common_params` in this section as done before.
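
The `learning_rate`/`max_iter` trade-off restated in this patch can be illustrated with a self-contained sketch on synthetic data; the exact tree counts printed depend on the random data and are merely indicative.

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X_syn = rng.uniform(size=(5_000, 4))
y_syn = np.sin(X_syn @ rng.normal(size=4)) + rng.normal(scale=0.1, size=5_000)

# With early stopping enabled, a smaller learning rate typically grows
# more trees before the validation loss stops improving.
for lr in (0.05, 0.3):
    model = HistGradientBoostingRegressor(
        learning_rate=lr, max_iter=1_000, early_stopping=True, random_state=0
    ).fit(X_syn, y_syn)
    print(f"learning_rate={lr}: stopped after {model.n_iter_} trees")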
From ab0e21a44d4fbe87ab331d30061d0ebd9455abf5 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 16 Jan 2024 18:12:01 +0100
Subject: [PATCH 38/52] Fix FutureWarning
---
examples/ensemble/plot_hgbt_regression.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 5be4014bcb95f..65448b23d9681 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -109,7 +109,7 @@
average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
- hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
+ hgbt = HistGradientBoostingRegressor(max_iter=max_iter, categorical_features=None)
hgbt.fit(X_train, y_train)
y_pred = hgbt.predict(X_test)
@@ -158,6 +158,7 @@
"learning_rate": 0.3,
"validation_fraction": 0.2,
"random_state": 42,
+ "categorical_features": None,
"scoring": "neg_root_mean_squared_error",
}
@@ -342,8 +343,10 @@ def generate_missing_values(X, missing_fraction):
"vicdemand": -1,
"vicprice": -1,
}
-hgbt_no_cst = HistGradientBoostingRegressor().fit(X, y)
-hgbt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
+hgbt_no_cst = HistGradientBoostingRegressor(categorical_features=None).fit(X, y)
+hgbt_cst = HistGradientBoostingRegressor(
+ monotonic_cst=monotonic_cst, categorical_features=None
+).fit(X, y)
fig, ax = plt.subplots(nrows=2, figsize=(15, 10))
disp = PartialDependenceDisplay.from_estimator(
From c4d1b3b7938ba576eda7e860ee362a34ef9cae8b Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 16 Jan 2024 18:26:37 +0100
Subject: [PATCH 39/52] List of features as suggested by Christian
---
examples/ensemble/plot_hgbt_regression.py | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 65448b23d9681..ca2a68f018f01 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -41,10 +41,15 @@
# The dataset (originally named ELEC2) contains 45,312 instances dated from 7
# May 1996 to 5 December 1998. Each sample of the dataset refers to a period of
# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
-# sample on the dataset has 5 fields: the day of week, the time stamp, the New
-# South Wales electricity demand, the Victoria electricity demand. It is
-# originally a classification task, but here we use it for the regression task
-# to predict the scheduled electricity transfer between states.
+# sample in the dataset has 7 columns:
+# - date: from 7 May 1996 to 5 December 1998. Normalized between 0 and 1;
+# - day: day of week (1-7);
+# - period: half hour intervals over 24 hours. Normalized between 0 and 1;
+# - nswprice/nswdemand: electricity price/demand of New South Wales;
+# - vicprice/vicdemand: electricity price/demand of Victoria.
+#
+# It is originally a classification task, but here we use it for the regression
+# task to predict the scheduled electricity transfer between states.
from sklearn.datasets import fetch_openml
From 49587ab0b826b12acb1e8717f78b2d9d51c50ffa Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 17 Jan 2024 11:22:02 +0100
Subject: [PATCH 40/52] Simplify code
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_hgbt_regression.py | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index ca2a68f018f01..230c1a20b62ab 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -57,15 +57,12 @@
name="electricity", version=1, as_frame=True, parser="pandas"
)
df = electricity.frame
-X = df.drop(columns=["transfer", "class"])
-y = df["transfer"]
-X
# %%
# This particular dataset has a stepwise constant target for the first 17,760
# samples:
-y[:17_760].unique()
+df["transfer"][:17_760,].unique()
# %%
# Let us drop those entries and explore the hourly electricity transfer over
From 42c17427acccf6ffc67fc6d95d49566c3f5f48f3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 11:50:02 +0100
Subject: [PATCH 41/52] Print simple stats
---
examples/ensemble/plot_hgbt_regression.py | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 230c1a20b62ab..67b15d8be360f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -62,7 +62,7 @@
# This particular dataset has a stepwise constant target for the first 17,760
# samples:
-df["transfer"][:17_760,].unique()
+df["transfer"][:17_760].unique()
# %%
# Let us drop those entries and explore the hourly electricity transfer over
@@ -100,12 +100,17 @@
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
-max_iter_list = [5, 50]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
+
+print(f"Training sample size: {X_train.shape[0]}")
+print(f"Test sample size: {X_test.shape[0]}")
+print(f"Number of features: {X_train.shape[1]}")
+
+# %%
+max_iter_list = [5, 50]
average_week_demand = (
df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
)
-
colors = sns.color_palette("colorblind")
fig, ax = plt.subplots(figsize=(10, 5))
average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
From 37bb831e558c8f16bd1b0557f641340e7cd77c6c Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:10:03 +0100
Subject: [PATCH 42/52] Fix indentation
---
examples/ensemble/plot_hgbt_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 67b15d8be360f..adce5c6b36e1a 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -16,7 +16,7 @@
1. Several available loss functions for mean and quantile regression tasks, see
:ref:`Quantile loss `.
2. :ref:`categorical_support_gbdt` (see
- :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
3. Early stopping.
4. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
5. :ref:`monotonic_cst_gbdt`.
From d1b809a5e8ba56c4dd5a7126039b935818446c55 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:10:57 +0100
Subject: [PATCH 43/52] Use programmatic way to round up n_iter
---
examples/ensemble/plot_hgbt_regression.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index adce5c6b36e1a..92efcb53e365a 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -182,11 +182,12 @@
# %%
# We can then overwrite the value for `max_iter` to a reasonable value and avoid
-# the extra computational cost of the inner validation. In this case, rounding
-# up the number of iterations to 400 may account for variability of the training
-# set:
+# the extra computational cost of the inner validation. Rounding up the number
+# of iterations may account for variability of the training set:
-common_params["max_iter"] = 400
+import math
+
+common_params["max_iter"] = math.ceil(hgbt.n_iter_ / 100) * 100
common_params["early_stopping"] = False
hgbt = HistGradientBoostingRegressor(**common_params)
From 5b1875528d51194a450029e5782896b4058c47d5 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:11:25 +0100
Subject: [PATCH 44/52] Set random state for deterministic results
---
examples/ensemble/plot_hgbt_regression.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 92efcb53e365a..b6825660a8b56 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -351,9 +351,11 @@ def generate_missing_values(X, missing_fraction):
"vicdemand": -1,
"vicprice": -1,
}
-hgbt_no_cst = HistGradientBoostingRegressor(categorical_features=None).fit(X, y)
+hgbt_no_cst = HistGradientBoostingRegressor(
+ categorical_features=None, random_state=42
+).fit(X, y)
hgbt_cst = HistGradientBoostingRegressor(
- monotonic_cst=monotonic_cst, categorical_features=None
+ monotonic_cst=monotonic_cst, categorical_features=None, random_state=42
).fit(X, y)
fig, ax = plt.subplots(nrows=2, figsize=(15, 10))
From 9499e611a7b2159c0685a0195cdaad37902f55ac Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:24:09 +0100
Subject: [PATCH 45/52] Add explanation on time-aware cross validation
---
examples/ensemble/plot_hgbt_regression.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index b6825660a8b56..71a1109db7b84 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -391,11 +391,15 @@ def generate_missing_values(X, missing_fraction):
# %%
# Indeed, we can verify that the predictive quality of the model is not
-# significantly degraded by introducing the monotonic constraints:
+# significantly degraded by introducing the monotonic constraints. For such
+# purpose we use :class:`~sklearn.model_selection.TimeSeriesSplit`
+# cross-validation to estimate the variance of the test score. By doing so we
+# guarantee that the training data does not come after the testing data, which is
+# crucial when dealing with data that have a temporal relationship.
from sklearn.model_selection import TimeSeriesSplit, cross_validate
-ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)
+ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336) # a week has 336 samples
cv_results = cross_validate(
hgbt_no_cst,
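
To make the temporal guarantee of this splitter concrete, the sketch below checks on a dummy array (sized to mimic the example's one-day gap of 48 samples and one-week test folds of 336 samples) that every training fold strictly precedes its test fold.

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X_dummy = np.zeros((2_000, 1))
ts_demo = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)

for fold, (train_idx, test_idx) in enumerate(ts_demo.split(X_dummy)):
    # The 48-sample gap (one day) separates training from testing.
    assert train_idx.max() + 48 < test_idx.min()
    print(
        f"fold {fold}: train ends at {train_idx.max()}, "
        f"test spans [{test_idx.min()}, {test_idx.max()}]"
    )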
From 3b1789e4547db60f0f05a6483e3ad268e4037285 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:28:07 +0100
Subject: [PATCH 46/52] Add comment on overconstraining feature
---
examples/ensemble/plot_hgbt_regression.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 71a1109db7b84..8a6e7d4128cd0 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -390,7 +390,10 @@ def generate_missing_values(X, missing_fraction):
_ = plt.legend()
# %%
-# Indeed, we can verify that the predictive quality of the model is not
+# Observe that `nswdemand` seems already monotonic without constraint. This is a
+# good example showing that the constraint on such a feature is "over-constraining".
+#
+# Additionally, we can verify that the predictive quality of the model is not
# significantly degraded by introducing the monotonic constraints. For such
# purpose we use :class:`~sklearn.model_selection.TimeSeriesSplit`
# cross-validation to estimate the variance of the test score. By doing so we
From d972fae3ec6653ef14aa121f6e995503f6fec738 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 23 Jan 2024 10:50:10 +0100
Subject: [PATCH 47/52] Apply suggestion from Guillaume
Co-authored-by: Guillaume Lemaitre
---
examples/ensemble/plot_hgbt_regression.py | 22 ++++++----------------
1 file changed, 6 insertions(+), 16 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 8a6e7d4128cd0..26531bc8fd9a6 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -400,28 +400,18 @@ def generate_missing_values(X, missing_fraction):
# guarantee that the training data does not come after the testing data, which is
# crucial when dealing with data that have a temporal relationship.
+from sklearn.metrics import make_scorer, root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, cross_validate
ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336) # a week has 336 samples
+scorer = make_scorer(root_mean_squared_error)
-cv_results = cross_validate(
- hgbt_no_cst,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
-)
-rmse = -cv_results["test_score"]
+cv_results = cross_validate(hgbt_no_cst, X, y, cv=ts_cv, scoring=scorer)
+rmse = cv_results["test_score"]
print(f"RMSE without constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
-cv_results = cross_validate(
- hgbt_cst,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
-)
-rmse = -cv_results["test_score"]
+cv_results = cross_validate(hgbt_cst, X, y, cv=ts_cv, scoring=scorer)
+rmse = cv_results["test_score"]
print(f"RMSE with constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
# %%
From 9f49ad5affa408b503067018782522019b9c4f9e Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 23 Jan 2024 10:55:27 +0100
Subject: [PATCH 48/52] Update examples/ensemble/plot_adaboost_regression.py
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_adaboost_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index 98d3699ab161c..916d17addff18 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -10,7 +10,7 @@
detail.
See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
-example showcasing the benefits of using more robust regressions such as
+example showcasing the benefits of using more efficient regression models such as
:class:`~ensemble.HistGradientBoostingRegressor`.
.. [1] `H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
From d333c6d8dd276e935fe7ce96d68e17e809dc4d14 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 23 Jan 2024 10:56:10 +0100
Subject: [PATCH 49/52] Format
---
examples/ensemble/plot_adaboost_regression.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index 916d17addff18..8ba01df63b561 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -10,8 +10,8 @@
detail.
See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
-example showcasing the benefits of using more efficient regression models such as
-:class:`~ensemble.HistGradientBoostingRegressor`.
+example showcasing the benefits of using more efficient regression models such
+as :class:`~ensemble.HistGradientBoostingRegressor`.
.. [1] `H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
`_
From c4a79e64f88e47e708e81f117f55af70deb2010c Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Mon, 19 Feb 2024 14:46:20 +0100
Subject: [PATCH 50/52] Update examples/ensemble/plot_hgbt_regression.py
Co-authored-by: Guillaume Lemaitre
---
examples/ensemble/plot_hgbt_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 26531bc8fd9a6..796d2d17a76b2 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -325,7 +325,7 @@ def generate_missing_values(X, missing_fraction):
# bias. Monotonic constraints can also be used to enforce specific regulatory
# requirements, ensure compliance and align with ethical considerations.
#
-# In the present example, the policy of transfering energy from Victoria to New
+# In the present example, the policy of transferring energy from Victoria to New
# South Wales is meant to alleviate price fluctuations, meaning that the model
# predictions have to enforce such goal, i.e. transfer should increase with
# price and demand in New South Wales, but also decrease with price and demand
From 1010eccf26314fc7f51e4f217a93c0efa2f60e12 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 19 Feb 2024 14:54:30 +0100
Subject: [PATCH 51/52] Fix random_state
---
examples/ensemble/plot_hgbt_regression.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 26531bc8fd9a6..38cad137e35b0 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -116,7 +116,9 @@
average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
- hgbt = HistGradientBoostingRegressor(max_iter=max_iter, categorical_features=None)
+ hgbt = HistGradientBoostingRegressor(
+ max_iter=max_iter, categorical_features=None, random_state=42
+ )
hgbt.fit(X_train, y_train)
y_pred = hgbt.predict(X_test)
From 31db489ec87041b3974528eea3264cdf569562c6 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 19 Feb 2024 14:54:42 +0100
Subject: [PATCH 52/52] Wording as suggested by Guillaume
---
examples/ensemble/plot_hgbt_regression.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 38cad137e35b0..3d18064e7e489 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -143,7 +143,9 @@
# %%
# With just a few iterations, HGBT models can achieve convergence (see
# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`),
-# meaning that adding more trees does not improve the model anymore.
+# meaning that adding more trees does not improve the model anymore. In the
+# figure above, 5 iterations are not enough to produce accurate predictions.
+# With 50 iterations, the model already does a good job.
#
# Instead of relying on `max_iter` alone to determine when to stop, the HGBT
# implementation in scikit-learn supports early stopping. With it, the model