From 42400749b97a7f4d4ac750a295a86d9b5d9fd7cb Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 11:19:50 +0200
Subject: [PATCH 01/52] DOC Add example showcasing HGBT regression
---
doc/modules/ensemble.rst | 3 +-
examples/ensemble/plot_hgbt_regression.py | 448 ++++++++++++++++++
.../plot_release_highlights_1_1_0.py | 2 +
3 files changed, 452 insertions(+), 1 deletion(-)
create mode 100644 examples/ensemble/plot_hgbt_regression.py
diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 36eed98da0f6b..0585f8289ed55 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -129,6 +129,8 @@ Note that for technical reasons, using a scorer is significantly slower than
using the loss. By default, early-stopping is performed if there are at least
10,000 samples in the training set, and uses the validation loss.
+.. _nan_support_hgbt:
+
Missing values support
^^^^^^^^^^^^^^^^^^^^^^
@@ -1634,4 +1636,3 @@ minimum required number of samples to consider a split ``min_samples_split``).
.. [HTF] T. Hastie, R. Tibshirani and J. Friedman, "Elements of
Statistical Learning Ed. 2", Springer, 2009.
-
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
new file mode 100644
index 0000000000000..ba651563a9d33
--- /dev/null
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -0,0 +1,448 @@
+"""
+===========================================================
+Decision Tree Regression with HistGradientBoostingRegressor
+===========================================================
+
+:ref:`histogram_based_gradient_boosting` (HGBT) models can be a competitive
+alternative to random forests, especially when the number of samples is larger
+than tens of thousands of samples (see
+:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
+
+HGBT models have additional advantages such as:
+
+- :ref:`categorical_support_gbdt` (see
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`)
+- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+- :ref:`Quantile loss support <quantile_support_hgbdt>`
+- :ref:`monotonic_cst_gbdt`
+
+This example aims at showcasing the last three points in a real setting.
+"""
+
+# %%
+# Author: Arturo Amor
+#
+# License: BSD 3 clause
+#
+# Preparing the data
+# ==================
+# The `electricity dataset <http://www.openml.org/d/151>`_ consists of data
+# collected from the Australian New South Wales Electricity Market. In this
+# market, prices are not fixed and are affected by supply and demand. They are
+# set every five minutes. Electricity transfers to/from the neighboring state of
+# Victoria were done to alleviate fluctuations.
+#
+# The dataset (originally named ELEC2) contains 45,312 instances dated from 7
+# May 1996 to 5 December 1998. Each example of the dataset refers to a period of
+# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
+# example of the dataset has fields such as the day of the week, the time stamp,
+# the New South Wales electricity price and demand, and the Victoria electricity
+# price and demand. It is originally a classification task, but here we use it
+# as a regression problem where the target is the scheduled electricity transfer
+# between states.
+
+from sklearn.datasets import fetch_openml
+
+electricity = fetch_openml(
+ name="electricity", version=1, as_frame=True, parser="pandas"
+)
+df = electricity.frame
+X = df.drop(columns=["transfer", "class"])
+y = df["transfer"]
+X
+
+# %%
+# Let us explore the hourly electricity transfer over different days of the week:
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+colors = sns.color_palette("colorblind")
+
+fig, ax = plt.subplots(figsize=(15, 10))
+pointplot = sns.pointplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
+handles, labels = ax.get_legend_handles_labels()
+ax.set(
+ title="Hourly energy transfer for different days of the week",
+ xticks=[i * 2 for i in range(24)],
+ xticklabels=list(range(24)),
+ xlabel="Time of the day",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend(handles, ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])
+
+# %%
+# Notice that the energy transfer increases systematically during weekends.
+#
+# Effect of number of trees in HistGradientBoostingRegressor
+# ==========================================================
+# For the sake of illustrating the effect of the (maximum) number of trees, we
+# train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the
+# daily electricity transfer using the whole dataset. Then we visualize its
+# predictions depending on the `max_iter` parameter.
+
+from sklearn.ensemble import HistGradientBoostingRegressor
+
+max_iter_list = [10, 50]
+
+fig, ax = plt.subplots(figsize=(12, 4))
+average_week_demand = df.groupby(["day", "period"])["transfer"].mean()
+average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax)
+
+for idx, max_iter in enumerate(max_iter_list):
+ hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
+ hgbt.fit(X, y)
+ y_pred = hgbt.predict(X)
+ prediction_df = df.copy()
+ prediction_df["y_pred"] = y_pred
+ average_pred = prediction_df.groupby(["day", "period"])["y_pred"].mean()
+ average_pred.plot(
+ color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax
+ )
+ax.set(
+ title="Average daily energy transfer during the week",
+ xticks=[(i + 0.2) * 48 for i in range(7)],
+ xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# With just a few iterations, HGBT models can achieve convergence (see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
+#
+# Support for missing values
+# ==========================
+# HGBT models have native support for missing values. During training, the tree
+# grower decides where samples with missing values should go (left or right
+# child) at each split, based on the potential gain. When predicting, these
+# samples are sent to either child accordingly. If a feature had no missing
+# values during training, samples with missing values for that feature are sent
+# to the child with the most samples.
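+#
+# As a minimal sketch of this capability (the toy arrays below are hypothetical
+# and not part of the electricity dataset), an HGBT model can be fit directly
+# on inputs containing `NaN` values, without any imputation step::
+#
+#     import numpy as np
+#     from sklearn.ensemble import HistGradientBoostingRegressor
+#
+#     X_toy = np.array([[1.0], [2.0], [np.nan], [4.0]])
+#     y_toy = np.array([1.0, 2.0, 3.0, 4.0])
+#     HistGradientBoostingRegressor(max_iter=10).fit(X_toy, y_toy)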
+#
+# Missing Completely At Random (MCAR)
+# -----------------------------------
+#
+# The missingness does not depend on the observed data or the unobserved data,
+# i.e. it is completely random. We can simulate such a scenario by randomly
+# replacing values from randomly selected features with `NaN` values.
+
+import numpy as np
+
+from sklearn.model_selection import TimeSeriesSplit, cross_validate
+
+np.random.seed(42)
+
+ts_cv = TimeSeriesSplit(n_splits=5, gap=48, max_train_size=10000, test_size=1000)
+train_0, test_0 = next(ts_cv.split(df))
+last_days = slice(-192, None)
+total_cells = X.shape[0] * X.shape[1]
+missing_fraction_list = [0, 0.01, 0.03]
+
+fig, ax = plt.subplots(figsize=(12, 6))
+ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+hgbt = HistGradientBoostingRegressor()
+
+for missing_fraction in missing_fraction_list:
+ num_missing_cells = int(total_cells * missing_fraction)
+ row_indices = np.random.choice(X.shape[0], num_missing_cells, replace=True)
+ col_indices = np.random.choice(X.shape[1], num_missing_cells, replace=True)
+ X = df.drop(columns=["transfer", "class"])
+ X.iloc[row_indices, col_indices] = np.nan
+
+ hgbt.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt.predict(X.iloc[test_0])
+ cv_results = cross_validate(
+ hgbt,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+ )
+ rmse = -cv_results["test_score"]
+ ax.plot(
+ hgbt_predictions[last_days],
+ label=(
+ f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
+ f" {rmse.std():.2f}"
+ ),
+ alpha=0.5,
+ )
+ax.set(
+ title="Daily energy transfer predictions on data with MCAR values",
+ xticks=[(i + 0.25) * 48 for i in range(4)],
+ xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# Missing At Random (MAR)
+# -----------------------
+#
+# The missingness depends on the observed data but never on unobserved data.
+# Here, the missingness in "vicdemand" and "vicprice" is set to depend on the
+# value of the observed feature "nswprice".
+
+missing_fraction_list = [0, 0.5, 1.0]
+
+fig, ax = plt.subplots(figsize=(12, 6))
+ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+
+for missing_fraction in missing_fraction_list:
+ X = df.drop(columns=["transfer", "class"])
+ mask = X["nswprice"] < X["nswprice"].quantile(missing_fraction)
+ X["vicprice"] = X["vicprice"].mask(mask, np.nan)
+ X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
+
+ hgbt = HistGradientBoostingRegressor()
+ hgbt.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt.predict(X.iloc[test_0])
+ cv_results = cross_validate(
+ hgbt,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+ )
+ rmse = -cv_results["test_score"]
+ ax.plot(
+ hgbt_predictions[last_days],
+ label=(
+ f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
+ f" {rmse.std():.2f}"
+ ),
+ alpha=0.5,
+ )
+ax.set(
+ title="Daily energy transfer predictions on data with MAR values",
+ xticks=[(i + 0.25) * 48 for i in range(4)],
+ xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# In this case the features are highly correlated and therefore MAR values
+# do not degrade the predictive power of the model even when completely
+# removing the feature "vicprice".
+#
+# Missing Not At Random (MNAR)
+# ----------------------------
+#
+# The missingness depends on the unobserved data, i.e. the probability of a
+# value being missing in a variable depends on the values of that variable
+# itself. Here, we set the missingness to depend on the unobserved feature
+# "class".
+
+import pandas as pd
+
+fig, ax = plt.subplots(figsize=(12, 6))
+ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+
+for missing_fraction in missing_fraction_list:
+ X = df.drop(columns=["transfer", "class"])
+ mask = df["class"] == "DOWN"
+ true_indices = mask[mask].index
+ n_keep = int(len(true_indices) * missing_fraction)
+ keep_indices = np.random.choice(true_indices, size=n_keep, replace=False)
+ mask = pd.Series(False, index=mask.index)
+
+ # Set the randomly selected true indices to True in the new mask
+ mask.loc[keep_indices] = True
+ X["vicprice"] = X["vicprice"].mask(mask, np.nan)
+ X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
+
+ hgbt = HistGradientBoostingRegressor()
+ hgbt.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt.predict(X.iloc[test_0])
+ cv_results = cross_validate(
+ hgbt,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+ )
+ rmse = -cv_results["test_score"]
+ ax.plot(
+ hgbt_predictions[last_days],
+ label=(
+ f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
+ f" {rmse.std():.2f}"
+ ),
+ alpha=0.5,
+ )
+ax.set(
+ title="Daily energy transfer predictions on data with MNAR values",
+ xticks=[(i + 0.25) * 48 for i in range(4)],
+ xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# Support for quantile loss
+# =========================
+#
+# The quantile loss in regression enables a view of the potential variability in
+# predictions. For instance, predicting the 5th and 95th percentiles can provide
+# a 90% prediction interval, i.e. the range within which we expect the true
+# value to fall with 90% probability.
+
+from sklearn.metrics import make_scorer, mean_pinball_loss
+
+quantiles = [0.95, 0.05]
+predictions = []
+X = df.drop(columns=["transfer", "class"])
+
+fig, ax = plt.subplots(figsize=(12, 6))
+ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+
+for quantile in quantiles:
+ hgbt = HistGradientBoostingRegressor(loss="quantile", quantile=quantile)
+ hgbt.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt.predict(X.iloc[test_0])
+
+ predictions.append(hgbt_predictions)
+ cv_results = cross_validate(
+ hgbt,
+ X,
+ y,
+ cv=ts_cv,
+ scoring=make_scorer(mean_pinball_loss, alpha=quantile),
+ )
+ score = cv_results["test_score"]
+ ax.plot(
+ hgbt_predictions[last_days],
+ label=(
+ f"quantile={quantile}, pinball loss={score.mean():.3f} +/-"
+ f" {score.std():.3f}"
+ ),
+ alpha=0.5,
+ )
+
+ax.fill_between(
+ range(len(predictions[0][last_days])),
+ predictions[0][last_days],
+ predictions[1][last_days],
+ color=colors[0],
+ alpha=0.1,
+)
+ax.set(
+ title="Daily energy transfer predictions with quantile loss",
+ xticks=[(i + 0.25) * 48 for i in range(4)],
+ xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xlabel="Time of the week",
+ ylabel="Normalized energy transfer",
+)
+_ = ax.legend()
+
+# %%
+# Keep in mind that one can still improve the calibration of our model by:
+#
+# - collecting more data-points (in case the model is overfitting);
+# - better tuning of the model hyper-parameters (for instance you could try
+# max_iter=300, max_leaf_nodes=64) and make sure the model is not over-fitting
+# too much (e.g. by plotting the validation losses per boosting iteration and
+# using early stopping);
+# - engineering more predictive features from the same data. This is especially
+# useful for linear quantile regression (not covered in this tutorial);
+# - try other kinds of quantile regression models, for instance Quantile
+# Forests.
+#
+# Monotonic Constraints
+# ---------------------
+#
+# Given specific domain knowledge that requires the relationship between a
+# feature and the target to be monotonically increasing or decreasing, one can
+# enforce such behaviour in the predictions of an HGBT model using monotonic
+# constraints. This makes the model more interpretable and prevents overfitting.
+# Monotonic constraints can also be used to enforce specific regulatory
+# requirements, ensure compliance and align with ethical considerations.
+#
+# In the present example, the policy of transferring energy from Victoria to New
+# South Wales is meant to alleviate price fluctuations, meaning that the model
+# predictions have to enforce such a goal, i.e. transfer should increase with
+# price and demand in New South Wales, but also decrease with price and demand
+# in Victoria, in order to benefit both populations.
+#
+# To create the monotonic constraints, we use :func:`numpy.select` to assign
+# `1` to the positions corresponding to columns "nswdemand" and "nswprice", `-1`
+# to the positions corresponding to columns "vicdemand" and "vicprice", and `0`
+# elsewhere. We then visualize the partial dependence on said features:
+
+from sklearn.inspection import PartialDependenceDisplay
+
+conditions = [
+ (X.columns == "nswdemand") | (X.columns == "nswprice"),
+ (X.columns == "vicdemand") | (X.columns == "vicprice"),
+]
+choices = [1, -1]
+
+monotonic_cst = np.select(conditions, choices, default=0)
+
+
+gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
+gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
+
+fig, ax = plt.subplots(nrows=2, figsize=(15, 10))
+disp = PartialDependenceDisplay.from_estimator(
+ gbdt_no_cst,
+ X,
+ features=["nswdemand", "nswprice"],
+ line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
+ ax=ax[0],
+)
+
+PartialDependenceDisplay.from_estimator(
+ gbdt_cst,
+ X,
+ features=["nswdemand", "nswprice"],
+ line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
+ ax=disp.axes_,
+)
+disp = PartialDependenceDisplay.from_estimator(
+ gbdt_no_cst,
+ X,
+ features=["vicdemand", "vicprice"],
+ line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
+ ax=ax[1],
+)
+
+PartialDependenceDisplay.from_estimator(
+ gbdt_cst,
+ X,
+ features=["vicdemand", "vicprice"],
+ line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
+ ax=disp.axes_,
+)
+
+plt.legend()
+plt.show()
+
+# %%
+# Indeed, we can verify that the predictive quality of the model is not degraded
+# by introducing the monotonic constraints:
+
+cv_results = cross_validate(
+ gbdt_no_cst,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+)
+rmse = -cv_results["test_score"]
+print(f"RMSE without constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
+
+cv_results = cross_validate(
+ gbdt_cst,
+ X,
+ y,
+ cv=ts_cv,
+ scoring="neg_root_mean_squared_error",
+)
+rmse = -cv_results["test_score"]
+print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
diff --git a/examples/release_highlights/plot_release_highlights_1_1_0.py b/examples/release_highlights/plot_release_highlights_1_1_0.py
index f6432cf15037c..088919565315a 100644
--- a/examples/release_highlights/plot_release_highlights_1_1_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_1_0.py
@@ -22,6 +22,8 @@
"""
# %%
+# .. _quantile_support_hgbdt:
+#
# Quantile loss in :class:`ensemble.HistGradientBoostingRegressor`
# ----------------------------------------------------------------
# :class:`~ensemble.HistGradientBoostingRegressor` can model quantiles with
From 9728566a790e2fee3eac2f7e027a9f02b14c1377 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 11:48:51 +0200
Subject: [PATCH 02/52] Replace the landing-page figure
---
doc/templates/index.html | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/doc/templates/index.html b/doc/templates/index.html
index fc0362f4e379f..1a83f29f69e9f 100644
--- a/doc/templates/index.html
+++ b/doc/templates/index.html
@@ -70,8 +70,8 @@ Machine Learning in
and more...
Examples
From 7842e6d8b7c6b5e9a01452a262ed590a46756941 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 16:20:04 +0200
Subject: [PATCH 03/52] Several tweaks
---
examples/ensemble/plot_hgbt_regression.py | 32 +++++++++--------------
1 file changed, 12 insertions(+), 20 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index ba651563a9d33..5f5a2fb6814e6 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -11,10 +11,10 @@
HGBT models have additional advantages such as:
- :ref:`categorical_support_gbdt` (see
- :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`)
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-- :ref:`Quantile loss support <quantile_support_hgbdt>`
-- :ref:`monotonic_cst_gbdt`
+- :ref:`Quantile loss support <quantile_support_hgbdt>`.
+- :ref:`monotonic_cst_gbdt`.
This example aims at showcasing the last three points in a real setting.
"""
@@ -84,7 +84,7 @@
max_iter_list = [10, 50]
-fig, ax = plt.subplots(figsize=(12, 4))
+fig, ax = plt.subplots(figsize=(10, 5))
average_week_demand = df.groupby(["day", "period"])["transfer"].mean()
average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax)
@@ -243,7 +243,7 @@
ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
for missing_fraction in missing_fraction_list:
- X = df.drop(columns=["transfer", "class"])
+ X = df.drop(columns=["transfer", "class"]) # reset X
mask = df["class"] == "DOWN"
true_indices = mask[mask].index
n_keep = int(len(true_indices) * missing_fraction)
@@ -296,7 +296,7 @@
quantiles = [0.95, 0.05]
predictions = []
-X = df.drop(columns=["transfer", "class"])
+X = df.drop(columns=["transfer", "class"]) # reset X
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
@@ -344,17 +344,15 @@
# Keep in mind that one can still improve the calibration of our model by:
#
# - collecting more data-points (in case the model is overfitting);
-# - better tuning of the model hyper-parameters (for instance you could try
-# max_iter=300, max_leaf_nodes=64) and make sure the model is not over-fitting
-# too much (e.g. by plotting the validation losses per boosting iteration and
-# using early stopping);
+# - better tuning of the model hyper-parameters (see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`)
+# and make sure the model is not over-fitting;
# - engineering more predictive features from the same data. This is especially
# useful for linear quantile regression (not covered in this tutorial);
-# - try other kinds of quantile regression models, for instance Quantile
-# Forests.
+# - try other kinds of quantile regression models, such as Quantile Forests.
#
# Monotonic Constraints
-# ---------------------
+# =====================
#
# Given specific domain knowledge that requires the relationship between a
# feature and the target to be monotonically increasing or decreasing, one can
@@ -381,10 +379,8 @@
(X.columns == "vicdemand") | (X.columns == "vicprice"),
]
choices = [1, -1]
-
monotonic_cst = np.select(conditions, choices, default=0)
-
gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
@@ -396,7 +392,6 @@
line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
ax=ax[0],
)
-
PartialDependenceDisplay.from_estimator(
gbdt_cst,
X,
@@ -411,7 +406,6 @@
line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
ax=ax[1],
)
-
PartialDependenceDisplay.from_estimator(
gbdt_cst,
X,
@@ -419,9 +413,7 @@
line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
ax=disp.axes_,
)
-
-plt.legend()
-plt.show()
+_ = plt.legend()
# %%
# Indeed, we can verify that the predictive quality of the model is not degraded
From f5ac584476e7843fca4eb52e419e7305d7f91e93 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 16:26:35 +0200
Subject: [PATCH 04/52] Wording
---
examples/ensemble/plot_hgbt_regression.py | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 5f5a2fb6814e6..c8aaa86f71d0b 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -344,12 +344,9 @@
# Keep in mind that one can still improve the calibration of our model by:
#
# - collecting more data-points (in case the model is overfitting);
-# - better tuning of the model hyper-parameters (see
-# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`)
-# and make sure the model is not over-fitting;
-# - engineering more predictive features from the same data. This is especially
-# useful for linear quantile regression (not covered in this tutorial);
-# - try other kinds of quantile regression models, such as Quantile Forests.
+# - better tuning of the model hyperparameters (see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`);
+# - engineering more predictive features from the same data.
#
# Monotonic Constraints
# =====================
From 353329db25288c97d8cd3c5d2ecb08995f12d830 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 18:02:41 +0200
Subject: [PATCH 05/52] Add cross-links from other examples
---
examples/ensemble/plot_adaboost_regression.py | 4 ++++
.../ensemble/plot_forest_hist_grad_boosting_comparison.py | 4 +++-
examples/ensemble/plot_gradient_boosting_categorical.py | 4 ++++
examples/ensemble/plot_gradient_boosting_quantile.py | 4 +++-
examples/ensemble/plot_gradient_boosting_regression.py | 5 ++++-
examples/ensemble/plot_hgbt_regression.py | 3 ++-
.../release_highlights/plot_release_highlights_0_23_0.py | 3 ++-
examples/release_highlights/plot_release_highlights_1_1_0.py | 3 +++
examples/release_highlights/plot_release_highlights_1_3_0.py | 4 +++-
9 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index c2aa7e558c07d..98d3699ab161c 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -9,6 +9,10 @@
regressor. As the number of boosts is increased the regressor can fit more
detail.
+See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
+example showcasing the benefits of using more robust regression models such as
+:class:`~ensemble.HistGradientBoostingRegressor`.
+
.. [1] `H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
`_
diff --git a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py
index 0dde24116065d..853caec241491 100644
--- a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py
+++ b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py
@@ -22,7 +22,9 @@
the predicted value. RFs, on the other hand, are based on bagging and use a
majority vote to predict the outcome.
-For more information on ensemble models, see the :ref:`User Guide <ensemble>`.
+See the :ref:`User Guide <ensemble>` for more information on ensemble models or
+see :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
+example showcasing some other features of HGBT models.
"""
# Author: Arturo Amor
diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py
index 0dd0a84243b4d..d9566f19a8214 100644
--- a/examples/ensemble/plot_gradient_boosting_categorical.py
+++ b/examples/ensemble/plot_gradient_boosting_categorical.py
@@ -21,6 +21,10 @@
We will work with the Ames Iowa Housing dataset which consists of numerical
and categorical features, where the houses' sales prices are the target.
+See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
+example showcasing some other features of
+:class:`~ensemble.HistGradientBoostingRegressor`.
+
"""
# %%
diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py
index d1464ba92c572..41378db704600 100644
--- a/examples/ensemble/plot_gradient_boosting_quantile.py
+++ b/examples/ensemble/plot_gradient_boosting_quantile.py
@@ -4,7 +4,9 @@
=====================================================
This example shows how quantile regression can be used to create prediction
-intervals.
+intervals. See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`
+for an example showcasing some other features of
+:class:`~ensemble.HistGradientBoostingRegressor`.
"""
diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py
index 94705ccfeca24..76437680708be 100644
--- a/examples/ensemble/plot_gradient_boosting_regression.py
+++ b/examples/ensemble/plot_gradient_boosting_regression.py
@@ -11,7 +11,10 @@
and 500 regression trees of depth 4.
Note: For larger datasets (n_samples >= 10000), please refer to
-:class:`~sklearn.ensemble.HistGradientBoostingRegressor`.
+:class:`~sklearn.ensemble.HistGradientBoostingRegressor`. See
+:ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an example
+showcasing some other advantages of
+:class:`~ensemble.HistGradientBoostingRegressor`.
"""
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index c8aaa86f71d0b..0040f261ebbbd 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -346,7 +346,8 @@
# - collecting more data-points (in case the model is overfitting);
# - better tuning of the model hyperparameters (see
# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`);
-# - engineering more predictive features from the same data.
+# - engineering more predictive features from the same data (see
+# :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`).
#
# Monotonic Constraints
# =====================
diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py
index 7c6836632e3f0..7753f8653799e 100644
--- a/examples/release_highlights/plot_release_highlights_0_23_0.py
+++ b/examples/release_highlights/plot_release_highlights_0_23_0.py
@@ -122,7 +122,8 @@
# specific features. In the following example, we construct a target that is
# generally positively correlated with the first feature, with some noise.
# Applying monotonic constraints allows the prediction to capture the global
-# effect of the first feature, instead of fitting the noise.
+# effect of the first feature, instead of fitting the noise. For a use case
+# example, see :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`.
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
diff --git a/examples/release_highlights/plot_release_highlights_1_1_0.py b/examples/release_highlights/plot_release_highlights_1_1_0.py
index 088919565315a..63c22d2f22fe5 100644
--- a/examples/release_highlights/plot_release_highlights_1_1_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_1_0.py
@@ -53,6 +53,9 @@
ax.plot(X_1d, hist.predict(X), label=quantile)
_ = ax.legend(loc="lower left")
+# %%
+# For a use case example, see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`.
# %%
# `get_feature_names_out` Available in all Transformers
diff --git a/examples/release_highlights/plot_release_highlights_1_3_0.py b/examples/release_highlights/plot_release_highlights_1_3_0.py
index 8fa1ea057ac91..993a91d18100a 100644
--- a/examples/release_highlights/plot_release_highlights_1_3_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_3_0.py
@@ -88,7 +88,9 @@
# :class:`tree.DecisionTreeRegressor` now support missing values. For each potential
# threshold on the non-missing data, the splitter will evaluate the split with all the
# missing values going to the left node or the right node.
-# More details in the :ref:`User Guide <tree_missing_value_support>`.
+# See more details in the :ref:`User Guide <tree_missing_value_support>` or see
+# :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for a use case
+# example of this feature in :class:`~ensemble.HistGradientBoostingRegressor`.
import numpy as np
from sklearn.tree import DecisionTreeClassifier
From 1d56abdda9cc1c5fcdf88170f1dfb3ccc00eabb1 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 2 Aug 2023 18:03:17 +0200
Subject: [PATCH 06/52] Use dictionary to define monotonic_cst
---
examples/ensemble/plot_hgbt_regression.py | 25 +++++++++++++----------
1 file changed, 14 insertions(+), 11 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 0040f261ebbbd..4e7deb0e6eb11 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -365,20 +365,23 @@
# price and demand in New South Wales, but also decrease with price and demand
# in Victoria, in order to benefit both populations.
#
-# To create the monotonic constraints, we use :func:`numpy.select` to assign
-# `1` to the positions corresponding to columns "nswdemand" and "nswprice", `-1`
-# to the positions corresponding to columns "vicdemand" and "vicprice", and `0`
-# elsewhere. We then visualize the partial dependence on said features:
+# If the training data has feature names, it’s possible to specify the monotonic
+# constraints by passing a dictionary with the convention:
+# - 1: monotonic increase
+# - 0: no constraint
+# - -1: monotonic decrease
from sklearn.inspection import PartialDependenceDisplay
-conditions = [
- (X.columns == "nswdemand") | (X.columns == "nswprice"),
- (X.columns == "vicdemand") | (X.columns == "vicprice"),
-]
-choices = [1, -1]
-monotonic_cst = np.select(conditions, choices, default=0)
-
+monotonic_cst = {
+ "date": 0,
+ "day": 0,
+ "period": 0,
+ "nswdemand": 1,
+ "nswprice": 1,
+ "vicdemand": -1,
+ "vicprice": -1,
+}
gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
From ff89b7c968f6e1d5e8e53fd048ea956bba303aa3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 3 Aug 2023 11:17:32 +0200
Subject: [PATCH 07/52] Add cross-links in the documentation
---
doc/modules/ensemble.rst | 8 +++++++-
examples/ensemble/plot_hgbt_regression.py | 1 +
.../ensemble/_hist_gradient_boosting/gradient_boosting.py | 2 ++
3 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 0585f8289ed55..711cbb6c1f891 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -80,7 +80,8 @@ are not yet supported, for instance some loss functions.
.. topic:: Examples:
- * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py`
+ * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py`
+ * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`
Usage
^^^^^
@@ -169,6 +170,10 @@ If no missing values were encountered for a given feature during training,
then samples with missing values are mapped to whichever child has the most
samples.
+.. topic:: Examples:
+
+ * :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`
+
.. _sw_hgbdt:
Sample weight support
@@ -317,6 +322,7 @@ Also, monotonic constraints are not supported for multiclass classification.
.. topic:: Examples:
* :ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py`
+ * :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`
.. _interaction_cst_hgbt:
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 4e7deb0e6eb11..42e516fb96cbf 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -367,6 +367,7 @@
#
# If the training data has feature names, it’s possible to specify the monotonic
# constraints by passing a dictionary with the convention:
+#
# - 1: monotonic increase
# - 0: no constraint
# - -1: monotonic decrease
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index 5d030d3add5bb..9d6b22b6519f1 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -1200,6 +1200,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
assigned to the left or right child consequently. If no missing values
were encountered for a given feature during training, then samples with
missing values are mapped to whichever child has the most samples.
+ See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for a
+ use case example of this feature.
This implementation is inspired by
`LightGBM `_.
From 543d2803d11c5851ccd9717f00b430561b78ad43 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 3 Aug 2023 11:27:04 +0200
Subject: [PATCH 08/52] Change title
---
examples/ensemble/plot_hgbt_regression.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 42e516fb96cbf..1fedd06f9af21 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -1,7 +1,7 @@
"""
-===========================================================
-Decision Tree Regression with HistGradientBoostingRegressor
-===========================================================
+==============================================================
+Use case of advanced features in Histogram Boosting Regression
+==============================================================
:ref:`histogram_based_gradient_boosting` (HGBT) models can be a competitive
alternative to random forests, especially when the number of samples is larger
From b77ab5c152c669282a42e43b8c155a3c02841038 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Thu, 7 Sep 2023 11:43:37 +0200
Subject: [PATCH 09/52] Apply suggestions from code review
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_hgbt_regression.py | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 1fedd06f9af21..b07ae7dcca05f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -3,20 +3,25 @@
Use case of advanced features in Histogram Boosting Regression
==============================================================
-:ref:`histogram_based_gradient_boosting` (HGBT) models can be a competitive
-alternative to random forests, especially when the number of samples is larger
+:ref:`histogram_based_gradient_boosting` (HGBT) may be the most useful supervised learning model in scikit-learn. It is a modern gradient boosting implementation
+comparable to LightGBM and XGBoost. As such, it is more feature rich than and often
+outperforms alternative models like random forests, especially when the number of samples is larger
than tens of thousands of samples (see
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
-HGBT models have additional advantages such as:
+The top usability features of HGBT models are:
- :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
- :ref:`Quantile loss support <quantile_support_hgbdt>`.
- :ref:`monotonic_cst_gbdt`.
+- :ref:`_interaction_cst_hgbt`.
+- early stopping
-This example aims at showcasing the last three points in a real setting.
+Note that random forests have none of those capabilities.
+
+This example aims at showcasing points 2-4 in a real life setting.
"""
# %%
@@ -287,8 +292,8 @@
# Support for quantile loss
# =========================
#
-# The quantile loss in regression enables a view of the potential variability in
-# predictions. For instance, predicting the 5th and 95th percentiles can provide
+# The quantile loss in regression enables a view of the variability or uncertainty
+# of the target variable. For instance, predicting the 5th and 95th percentiles can provide
# a 90% prediction interval, i.e. the range within which we expect the true
# value to fall with 90% probability.
@@ -349,7 +354,7 @@
# - engineering more predictive features from the same data (see
# :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`).
#
-# Monotonic Constraints
+# Monotonic constraints
# =====================
#
# Given specific domain knowledge that requires the relationship between a
From 4689b0f890a6d455e9a74757b88821f0f69ecf90 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 7 Sep 2023 11:52:24 +0200
Subject: [PATCH 10/52] Iter on suggestions from code-review
---
examples/ensemble/plot_hgbt_regression.py | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index b07ae7dcca05f..afa1a5dac953b 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -3,10 +3,12 @@
Use case of advanced features in Histogram Boosting Regression
==============================================================
-:ref:`histogram_based_gradient_boosting` (HGBT) may be the most useful supervised learning model in scikit-learn. It is a modern gradient boosting implementation
-comparable to LightGBM and XGBoost. As such, it is more feature rich than and often
-outperforms alternative models like random forests, especially when the number of samples is larger
-than tens of thousands of samples (see
+:ref:`histogram_based_gradient_boosting` (HGBT) models may be the most useful
+supervised learning models in scikit-learn. They are based on a modern gradient
+boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models
+are more feature rich than, and often outperform, alternative models like
+random forests, especially when the number of samples is larger than tens of
+thousands of samples (see
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
The top usability features of HGBT models are:
@@ -292,10 +294,10 @@
# Support for quantile loss
# =========================
#
-# The quantile loss in regression enables a view of the variability or uncertainty
-# of the target variable. For instance, predicting the 5th and 95th percentiles can provide
-# a 90% prediction interval, i.e. the range within which we expect the true
-# value to fall with 90% probability.
+# The quantile loss in regression enables a view of the variability or
+# uncertainty of the target variable. For instance, predicting the 5th and 95th
+# percentiles can provide a 90% prediction interval, i.e. the range within which
+# we expect the true value to fall with 90% probability.
from sklearn.metrics import make_scorer, mean_pinball_loss
From 86f8f6785161a6ceea99889d3fd40eb85648f5e5 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 3 Oct 2023 12:18:51 +0200
Subject: [PATCH 11/52] Remove comment that will no longer be true in v1.4
---
examples/ensemble/plot_hgbt_regression.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index afa1a5dac953b..551ddf1e5ff63 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -21,8 +21,6 @@
- :ref:`_interaction_cst_hgbt`.
- early stopping
-Note that random forests have none of those capabilities.
-
This example aims at showcasing points 2-4 in a real life setting.
"""
From 35c065ad56b8bccf1496c2842ff8cf84d55a8ef6 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 3 Oct 2023 12:26:55 +0200
Subject: [PATCH 12/52] Address comment from Christian on calibration
---
examples/ensemble/plot_hgbt_regression.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 551ddf1e5ff63..8e9935b9efa0e 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -346,7 +346,8 @@
_ = ax.legend()
# %%
-# Keep in mind that one can still improve the calibration of our model by:
+# Keep in mind that the predicted percentiles are just estimations that depend
+# on the model. One can still improve the quality of such estimations by:
#
# - collecting more data-points (in case the model is overfitting);
# - better tuning of the model hyperparameters (see
From c3e01fc768da1aa2a775c45e3655399fcae74878 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 3 Oct 2023 12:39:03 +0200
Subject: [PATCH 13/52] Address comment from Christian on bias
---
examples/ensemble/plot_hgbt_regression.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 8e9935b9efa0e..85eacafd09f37 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -361,8 +361,9 @@
# Given specific domain knowledge that requires the relationship between a
# feature and the target to be monotonically increasing or decreasing, one can
# enforce such behaviour in the predictions of an HGBT model using monotonic
-# constraints. This makes the model more interpretable and prevents overfitting.
-# Monotonic constraints can also be used to enforce specific regulatory
+# constraints. This makes the model more interpretable and can reduce its
+# variance (and potentially mitigate overfitting) at the risk of increasing
+# bias. Monotonic constraints can also be used to enforce specific regulatory
# requirements, ensure compliance and align with ethical considerations.
#
# In the present example, the policy of transferring energy from Victoria to New
From 093b8dd903132e4201f7b8ed3c4f82fba654c888 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 4 Oct 2023 14:13:14 +0200
Subject: [PATCH 14/52] Apply suggestions from code review
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_hgbt_regression.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 85eacafd09f37..73aacb2a0356f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -16,7 +16,7 @@
- :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-- :ref:`Quantile loss support <quantile_support_hgbdt>`.
+- Support for several losses such as the :ref:`Quantile loss <quantile_support_hgbdt>`.
- :ref:`monotonic_cst_gbdt`.
- :ref:`_interaction_cst_hgbt`.
- early stopping
@@ -346,10 +346,10 @@
_ = ax.legend()
# %%
-# Keep in mind that the predicted percentiles are just estimations that depend
-# on the model. One can still improve the quality of such estimations by:
+# Keep in mind that those predicted percentiles are just estimations from a
+# model. One can still improve the quality of such estimations by:
#
-# - collecting more data-points (in case the model is overfitting);
+# - collecting more data-points;
# - better tuning of the model hyperparameters (see
# :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`);
# - engineering more predictive features from the same data (see
From ff2888f02ac7dfa4034ffa77204c9f230da80bb2 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 4 Oct 2023 14:15:48 +0200
Subject: [PATCH 15/52] Iter on suggestions
---
examples/ensemble/plot_hgbt_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 73aacb2a0356f..df4cf837f8ef4 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -18,7 +18,7 @@
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
- Support for several losses such as the :ref:`Quantile loss <quantile_support_hgbdt>`.
- :ref:`monotonic_cst_gbdt`.
-- :ref:`_interaction_cst_hgbt`.
+- :ref:`interaction_cst_hgbt`.
- early stopping
This example aims at showcasing points 2-4 in a real life setting.
From 74719599c77be61883e56331336de624edd975c3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 14:36:19 +0200
Subject: [PATCH 16/52] Silence warning from DataFrame.groupby
---
examples/ensemble/plot_hgbt_regression.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index df4cf837f8ef4..c9c031ea11125 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -90,7 +90,7 @@
max_iter_list = [10, 50]
fig, ax = plt.subplots(figsize=(10, 5))
-average_week_demand = df.groupby(["day", "period"])["transfer"].mean()
+average_week_demand = df.groupby(["day", "period"], observed=False)["transfer"].mean()
average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
@@ -99,7 +99,9 @@
y_pred = hgbt.predict(X)
prediction_df = df.copy()
prediction_df["y_pred"] = y_pred
- average_pred = prediction_df.groupby(["day", "period"])["y_pred"].mean()
+ average_pred = prediction_df.groupby(["day", "period"], observed=False)[
+ "y_pred"
+ ].mean()
average_pred.plot(
color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax
)
From 9a486b896a5661ddcc77db31601318303b131310 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 16:04:50 +0200
Subject: [PATCH 17/52] Add discussion on early stopping
---
examples/ensemble/plot_hgbt_regression.py | 67 +++++++++++++++++++----
1 file changed, 56 insertions(+), 11 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index c9c031ea11125..7553eb631ede2 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -15,13 +15,13 @@
- :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+- Early stopping.
- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
- Support for several losses such as the :ref:`Quantile loss `.
- :ref:`monotonic_cst_gbdt`.
- :ref:`interaction_cst_hgbt`.
-- early stopping
-This example aims at showcasing points 2-4 in a real life setting.
+This example aims at showcasing points 2-5 in a real life setting.
"""
# %%
@@ -78,8 +78,8 @@
# %%
# Notice that the energy transfer increases systematically during weekends.
#
-# Effect of number of trees in HistGradientBoostingRegressor
-# ==========================================================
+# Effect of number of trees and early stopping
+# ============================================
# For the sake of illustrating the effect of the (maximum) number of trees, we
# train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the
# daily electricity transfer using the whole dataset. Then we visualize its
@@ -118,6 +118,52 @@
# With just a few iterations, HGBT models can achieve convergence (see
# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
#
+# Instead of relying solely on `max_iter` to determine when to stop, the HGBT
+# implementations in scikit-learn support early stopping. With it, the model
+# uses a fraction of the training data as a validation set
+# (`validation_fraction`) and stops training if the validation score does not
+# improve (or degrades) after `n_iter_no_change` iterations up to a certain
+# `tol`.
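+#
+# A minimal sketch of these knobs (the values below are illustrative
+# assumptions, not tuned for this dataset)::
+#
+#     HistGradientBoostingRegressor(
+#         early_stopping=True,
+#         validation_fraction=0.2,  # hold out 20% of the training data
+#         n_iter_no_change=10,      # stop after 10 non-improving iterations
+#         tol=1e-7,                 # minimum improvement to keep training
+#     )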
+#
+# Notice that there is a trade-off between `learning_rate` and `max_iter`:
+# Generally, smaller learning rates require more iterations to converge to the
+# minimum loss, while larger learning rates might converge faster but are at
+# risk of overfitting.
+#
+# Indeed, a good practice is to tune the learning rate along with any other
+# hyperparameters, fit the HGBT on the training set with a large enough value
+# for `max_iter` and determine the best `max_iter` via early stopping and some
+# explicit `validation_fraction`.
+
+common_params = {
+ "max_iter": 1_000,
+ "learning_rate": 0.3,
+ "validation_fraction": 0.2,
+ "random_state": 42,
+ "scoring": "neg_root_mean_squared_error",
+}
+
+hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
+hgbt.fit(X, y)
+plt.plot(-hgbt.validation_score_)
+plt.xlabel("number of iterations")
+plt.ylabel("root mean squared error")
+_ = plt.title(f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})")
+
+# %%
+# We can then overwrite the value for `max_iter` to a reasonable value and avoid
+# the extra computational cost of the inner validation. In this case, rounding
+# up the number of iterations to 600 may account for variability of the training
+# set:
+
+common_params["max_iter"] = 600
+common_params["early_stopping"] = False
+hgbt = HistGradientBoostingRegressor(**common_params)
+
+# %%
+# .. note:: The inner validation done during early stopping is not optimal for
+# time series with the implementation as of scikit-learn v1.3.
+#
# Support for missing values
# ==========================
# HGBT models have native support for missing values. During training, the tree
@@ -148,7 +194,6 @@
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
-hgbt = HistGradientBoostingRegressor()
for missing_fraction in missing_fraction_list:
num_missing_cells = int(total_cells * missing_fraction)
@@ -203,7 +248,6 @@
X["vicprice"] = X["vicprice"].mask(mask, np.nan)
X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
- hgbt = HistGradientBoostingRegressor()
hgbt.fit(X.iloc[train_0], y.iloc[train_0])
hgbt_predictions = hgbt.predict(X.iloc[test_0])
cv_results = cross_validate(
@@ -262,7 +306,6 @@
X["vicprice"] = X["vicprice"].mask(mask, np.nan)
X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
- hgbt = HistGradientBoostingRegressor()
hgbt.fit(X.iloc[train_0], y.iloc[train_0])
hgbt_predictions = hgbt.predict(X.iloc[test_0])
cv_results = cross_validate(
@@ -309,13 +352,15 @@
ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
for quantile in quantiles:
- hgbt = HistGradientBoostingRegressor(loss="quantile", quantile=quantile)
- hgbt.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt.predict(X.iloc[test_0])
+ hgbt_quantile = HistGradientBoostingRegressor(
+ loss="quantile", quantile=quantile, **common_params
+ )
+ hgbt_quantile.fit(X.iloc[train_0], y.iloc[train_0])
+ hgbt_predictions = hgbt_quantile.predict(X.iloc[test_0])
predictions.append(hgbt_predictions)
cv_results = cross_validate(
- hgbt,
+ hgbt_quantile,
X,
y,
cv=ts_cv,
From 822f3db0a0b4ae538823215242890b8910708bf9 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 16:05:20 +0200
Subject: [PATCH 18/52] Wording
---
examples/ensemble/plot_hgbt_regression.py | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 7553eb631ede2..706532edd5d86 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -425,6 +425,8 @@
# - 1: monotonic increase
# - 0: no constraint
# - -1: monotonic decrease
+#
+# Otherwise, one can pass an array-like encoding the above convention by position.
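+#
+# As a sketch, assuming the feature order `date`, `day`, `period`, `nswprice`,
+# `nswdemand`, `vicprice`, `vicdemand`, the positional equivalent of the
+# dictionary used below would be::
+#
+#     monotonic_cst = [0, 0, 0, 1, 1, -1, -1]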
from sklearn.inspection import PartialDependenceDisplay
@@ -472,8 +474,8 @@
_ = plt.legend()
# %%
-# Indeed, we can verify that the predictive quality of the model is not degraded
-# by introducing the monotonic constraints:
+# Indeed, we can verify that the predictive quality of the model is not
+# significantly degraded by introducing the monotonic constraints:
cv_results = cross_validate(
gbdt_no_cst,
@@ -494,3 +496,8 @@
)
rmse = -cv_results["test_score"]
print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
+
+# %%
+# That being said, notice the comparison is between two different models that
+# may be optimized by a different combination of hyperparameters. That is the
+# reason why we do not use the `common_params` in this section as done before.
From 97cf6426a566a241dc72b28477c19a63f4ed7360 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 16:10:13 +0200
Subject: [PATCH 19/52] Rename instances of hgbt
---
examples/ensemble/plot_hgbt_regression.py | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 706532edd5d86..a8d50a5a4f8a7 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -439,33 +439,33 @@
"vicdemand": -1,
"vicprice": -1,
}
-gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
-gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
+hgbt_no_cst = HistGradientBoostingRegressor().fit(X, y)
+hgbt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
fig, ax = plt.subplots(nrows=2, figsize=(15, 10))
disp = PartialDependenceDisplay.from_estimator(
- gbdt_no_cst,
+ hgbt_no_cst,
X,
features=["nswdemand", "nswprice"],
line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
ax=ax[0],
)
PartialDependenceDisplay.from_estimator(
- gbdt_cst,
+ hgbt_cst,
X,
features=["nswdemand", "nswprice"],
line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
ax=disp.axes_,
)
disp = PartialDependenceDisplay.from_estimator(
- gbdt_no_cst,
+ hgbt_no_cst,
X,
features=["vicdemand", "vicprice"],
line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
ax=ax[1],
)
PartialDependenceDisplay.from_estimator(
- gbdt_cst,
+ hgbt_cst,
X,
features=["vicdemand", "vicprice"],
line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
@@ -478,7 +478,7 @@
# significantly degraded by introducing the monotonic constraints:
cv_results = cross_validate(
- gbdt_no_cst,
+ hgbt_no_cst,
X,
y,
cv=ts_cv,
@@ -488,7 +488,7 @@
print(f"RMSE without constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
cv_results = cross_validate(
- gbdt_cst,
+ hgbt_cst,
X,
y,
cv=ts_cv,
From 60d8f6118b87cfbd3e336e773a28f59de05714b4 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 6 Oct 2023 16:17:08 +0200
Subject: [PATCH 20/52] Remove distinction on type of missingness
---
examples/ensemble/plot_hgbt_regression.py | 114 +---------------------
1 file changed, 4 insertions(+), 110 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index a8d50a5a4f8a7..cde47747de12d 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -173,12 +173,10 @@
# values during training, samples with missing values for that feature are sent
# to the child with the most samples.
#
-# Missing Completely At Random (MCAR)
-# -----------------------------------
-#
-# The missingness does not depend on the observed data or the unobserved data.
-# It's completely random. We can simulate such scenario by randomly replacing
-# values from randomly selected features with `Nan` values.
+# The present example shows how HGBT regressions deal with values missing
+# completely at random (MCAR), i.e. the missingness does not depend on the
+# observed data or the unobserved data. We can simulate such a scenario by
+# randomly replacing values from randomly selected features with `Nan` values.
import numpy as np
@@ -229,110 +227,6 @@
)
_ = ax.legend()
-# %%
-# Missing At Random (MAR)
-# -----------------------
-#
-# The missingness depends on the observed data but never on unobserved data.
-# Here, the missingness in "vicdemand" is set to depend on the value of the
-# observed feature "nswprice".
-
-missing_fraction_list = [0, 0.5, 1.0]
-
-fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
-
-for missing_fraction in missing_fraction_list:
- X = df.drop(columns=["transfer", "class"])
- mask = X["nswprice"] < X["nswprice"].quantile(missing_fraction)
- X["vicprice"] = X["vicprice"].mask(mask, np.nan)
- X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
-
- hgbt.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt.predict(X.iloc[test_0])
- cv_results = cross_validate(
- hgbt,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
- )
- rmse = -cv_results["test_score"]
- ax.plot(
- hgbt_predictions[last_days],
- label=(
- f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
- f" {rmse.std():.2f}"
- ),
- alpha=0.5,
- )
-ax.set(
- title="Daily energy transfer predictions on data with MAR values",
- xticks=[(i + 0.25) * 48 for i in range(4)],
- xticklabels=["Tue", "Wed", "Thu", "Fri"],
- xlabel="Time of the week",
- ylabel="Normalized energy transfer",
-)
-_ = ax.legend()
-
-# %%
-# In this case the features are highly correlated and therefore MAR values
-# do not degrade the predictivity of the model even when completely removing
-# the feature "vicprice".
-#
-# Missing Not At Random (MNAR)
-# ----------------------------
-#
-# The missingness depends on the unobserved data. In particular, if the
-# probability of a value being missing in a variable is dependent on the values
-# of that variable itself. Here, we set the missingness to depend on the
-# unobserved feature "class".
-
-import pandas as pd
-
-fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
-
-for missing_fraction in missing_fraction_list:
- X = df.drop(columns=["transfer", "class"]) # reset X
- mask = df["class"] == "DOWN"
- true_indices = mask[mask].index
- n_keep = int(len(true_indices) * missing_fraction)
- keep_indices = np.random.choice(true_indices, size=n_keep, replace=False)
- mask = pd.Series(False, index=mask.index)
-
- # Set the randomly selected true indices to True in the new mask
- mask.loc[keep_indices] = True
- X["vicprice"] = X["vicprice"].mask(mask, np.nan)
- X["vicdemand"] = X["vicdemand"].mask(mask, np.nan)
-
- hgbt.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt.predict(X.iloc[test_0])
- cv_results = cross_validate(
- hgbt,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
- )
- rmse = -cv_results["test_score"]
- ax.plot(
- hgbt_predictions[last_days],
- label=(
- f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
- f" {rmse.std():.2f}"
- ),
- alpha=0.5,
- )
-ax.set(
- title="Daily energy transfer predictions on data with MNAR values",
- xticks=[(i + 0.25) * 48 for i in range(4)],
- xticklabels=["Tue", "Wed", "Thu", "Fri"],
- xlabel="Time of the week",
- ylabel="Normalized energy transfer",
-)
-_ = ax.legend()
-
# %%
# Support for quantile loss
# =========================
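
The MCAR mechanism kept by this patch can be sketched as a small helper. `inject_mcar` and the toy frame below are illustrative assumptions (a purely numeric DataFrame), not code from the example itself.

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)


def inject_mcar(X, missing_fraction):
    """Replace a fraction of individual cells with NaN, chosen uniformly at
    random and hence independently of observed or unobserved values."""
    values = X.to_numpy(dtype=float, copy=True)
    n_missing = int(values.size * missing_fraction)
    rows = rng.choice(values.shape[0], n_missing)
    cols = rng.choice(values.shape[1], n_missing)
    values[rows, cols] = np.nan  # paired (row, col) cells
    return pd.DataFrame(values, index=X.index, columns=X.columns)


X_toy = pd.DataFrame(rng.uniform(size=(6, 3)), columns=["a", "b", "c"])
print(inject_mcar(X_toy, missing_fraction=0.2))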
From 8799932ad3bd8e19827bf3215adb73aafb7ce994 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Thu, 2 Nov 2023 11:17:11 +0100
Subject: [PATCH 21/52] Apply suggestions from code review
Co-authored-by: Guillaume Lemaitre
---
examples/ensemble/plot_hgbt_regression.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index cde47747de12d..6e548a54329ad 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -24,11 +24,11 @@
This example aims at showcasing points 2-5 in a real life setting.
"""
-# %%
+
# Author: Arturo Amor
-#
# License: BSD 3 clause
-#
+
+# %%
# Preparing the data
# ==================
# The `electricity dataset `_ consists of data
@@ -40,7 +40,7 @@
# The dataset (originally named ELEC2) contains 45,312 instances dated from 7
# May 1996 to 5 December 1998. Each example of the dataset refers to a period of
# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
-# example on the dataset has 5 fields, the day of week, the time stamp, the New
+# example on the dataset has 5 fields: the day of week, the time stamp, the New
# South Wales electricity demand, the Victoria electricity demand. It is
# originally a classification task, but here we use it as a regression where the
# target is the scheduled electricity transfer between states.
@@ -151,7 +151,7 @@
_ = plt.title(f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})")
# %%
-# We can then overwrite the value for `max_iter` to a razonable value and avoid
+# We can then overwrite the value for `max_iter` to a reasonable value and avoid
# the extra computational cost of the inner validation. In this case, rounding
# up the number of iterations to 600 may account for variability of the training
# set:
From c3c883cbb32d3309229f4c54a9239633d8cabac0 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 2 Nov 2023 11:20:06 +0100
Subject: [PATCH 22/52] Use numbered list
---
examples/ensemble/plot_hgbt_regression.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 6e548a54329ad..70f063c73b9be 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -13,13 +13,14 @@
The top usability features of HGBT models are:
-- :ref:`categorical_support_gbdt` (see
+1. :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
-- Early stopping.
-- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-- Support for several losses such as the :ref:`Quantile loss `.
-- :ref:`monotonic_cst_gbdt`.
-- :ref:`interaction_cst_hgbt`.
+1. Early stopping.
+1. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+1. Support for several losses such as the :ref:`Quantile loss
+ `.
+1. :ref:`monotonic_cst_gbdt`.
+1. :ref:`interaction_cst_hgbt`.
This example aims at showcasing points 2-5 in a real life setting.
"""
From 26ddf3baff54384447621abab683dc34c711d71e Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 9 Nov 2023 16:29:27 +0100
Subject: [PATCH 23/52] Prefer lineplot instead of pairplot
---
examples/ensemble/plot_hgbt_regression.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 70f063c73b9be..a9d4c35b8dc39 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -65,13 +65,11 @@
colors = sns.color_palette("colorblind")
fig, ax = plt.subplots(figsize=(15, 10))
-pointplot = sns.pointplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
+pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
handles, labels = ax.get_legend_handles_labels()
ax.set(
title="Hourly energy transfer for different days of the week",
- xticks=[i * 2 for i in range(24)],
- xticklabels=list(range(24)),
- xlabel="Time of the day",
+ xlabel="Normalized time of the day",
ylabel="Normalized energy transfer",
)
_ = ax.legend(handles, ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])
From 4d700387724bc7d7d37645c994d52734227831a6 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Thu, 9 Nov 2023 16:31:34 +0100
Subject: [PATCH 24/52] Prefer sample over example
---
examples/ensemble/plot_hgbt_regression.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index a9d4c35b8dc39..9d215892c744d 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -39,9 +39,9 @@
# Victoria were done to alleviate fluctuations.
#
# The dataset (originally named ELEC2) contains 45,312 instances dated from 7
-# May 1996 to 5 December 1998. Each example of the dataset refers to a period of
+# May 1996 to 5 December 1998. Each sample of the dataset refers to a period of
# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
-# example on the dataset has 5 fields: the day of week, the time stamp, the New
+# sample on the dataset has 5 fields: the day of week, the time stamp, the New
# South Wales electricity demand, the Victoria electricity demand. It is
# originally a classification task, but here we use it as a regression where the
# target is the scheduled electricity transfer between states.
From 5b0dcfd175b80d13a6203172a1cbd713f0c3c3ca Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 10 Nov 2023 11:51:20 +0100
Subject: [PATCH 25/52] Remove stepwise constant piece of dataset
---
examples/ensemble/plot_hgbt_regression.py | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 9d215892c744d..e79f19ceb1335 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -25,7 +25,6 @@
This example aims at showcasing points 2-5 in a real life setting.
"""
-
# Author: Arturo Amor
# License: BSD 3 clause
@@ -57,13 +56,23 @@
X
# %%
-# Let us explore the hourly electricity transfer over different days of the week:
+# This particular dataset has a stepwise constant target for the first 17,760
+# samples:
+
+y[:17760].unique()
+
+# %%
+# Let us drop those entries and explore the hourly electricity transfer over
+# different days of the week:
import matplotlib.pyplot as plt
import seaborn as sns
colors = sns.color_palette("colorblind")
+X = X.iloc[17760:]
+y = y.iloc[17760:]
+
fig, ax = plt.subplots(figsize=(15, 10))
pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
handles, labels = ax.get_legend_handles_labels()
From 29146ae8e63dbed075b103847158653a1e24c62f Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Fri, 10 Nov 2023 11:52:25 +0100
Subject: [PATCH 26/52] Plot predictions on unseen data
---
examples/ensemble/plot_hgbt_regression.py | 25 +++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index e79f19ceb1335..0babb87f56b4e 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -91,21 +91,29 @@
# For the sake of illustrating the effect of the (maximum) number of trees, we
# train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the
# daily electricity transfer using the whole dataset. Then we visualize its
-# predictions depending on the `max_iter` parameter.
+# predictions depending on the `max_iter` parameter. Here we don't try to
+# evaluate the performance of the model and its capacity to generalize but
+# rather its capacity to learn from the training data.
from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.model_selection import train_test_split
-max_iter_list = [10, 50]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
+
+max_iter_list = [5, 50]
fig, ax = plt.subplots(figsize=(10, 5))
-average_week_demand = df.groupby(["day", "period"], observed=False)["transfer"].mean()
-average_week_demand.plot(color=colors[0], label="training data", linewidth=2, ax=ax)
+average_week_demand = (
+ df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
+)
+average_week_demand.plot(color=colors[0], label="ground truth", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
- hgbt.fit(X, y)
- y_pred = hgbt.predict(X)
- prediction_df = df.copy()
+ hgbt.fit(X_train, y_train)
+
+ y_pred = hgbt.predict(X_test)
+ prediction_df = df.loc[X_test.index].copy()
prediction_df["y_pred"] = y_pred
average_pred = prediction_df.groupby(["day", "period"], observed=False)[
"y_pred"
@@ -113,8 +121,9 @@
average_pred.plot(
color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax
)
+
ax.set(
- title="Average daily energy transfer during the week",
+ title="Predicted average energy transfer during the week",
xticks=[(i + 0.2) * 48 for i in range(7)],
xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
xlabel="Time of the week",
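
As a quick aside, the order-preserving behavior of `shuffle=False` used in this patch can be checked on a toy index array; this sketch is illustrative only.

import numpy as np
from sklearn.model_selection import train_test_split

# With shuffle=False the split is a chronological cut: the test set is
# simply the last 40% of the rows, as a temporal hold-out requires.
idx = np.arange(10)
idx_train, idx_test = train_test_split(idx, test_size=0.4, shuffle=False)
print(idx_train)  # [0 1 2 3 4 5]
print(idx_test)   # [6 7 8 9]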
From 25978aeff96746c6a01fd1efdc8e68f18167aac3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 12:01:40 +0100
Subject: [PATCH 27/52] Refactor code
---
examples/ensemble/plot_hgbt_regression.py | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 0babb87f56b4e..a01806845bc0f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -68,10 +68,9 @@
import matplotlib.pyplot as plt
import seaborn as sns
-colors = sns.color_palette("colorblind")
-
-X = X.iloc[17760:]
-y = y.iloc[17760:]
+df = electricity.frame.iloc[17760:]
+X = df.drop(columns=["transfer", "class"])
+y = df["transfer"]
fig, ax = plt.subplots(figsize=(15, 10))
pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
@@ -98,14 +97,14 @@
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
-
max_iter_list = [5, 50]
-
-fig, ax = plt.subplots(figsize=(10, 5))
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
average_week_demand = (
df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
)
+
+colors = sns.color_palette("colorblind")
+fig, ax = plt.subplots(figsize=(10, 5))
average_week_demand.plot(color=colors[0], label="ground truth", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
From 16a19b124f8475c0ceaab24e919e8add3b9e8d21 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 12:02:17 +0100
Subject: [PATCH 28/52] Use train set for determining max_iter
---
examples/ensemble/plot_hgbt_regression.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index a01806845bc0f..c4ddadd9fffdf 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -160,7 +160,7 @@
}
hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
-hgbt.fit(X, y)
+hgbt.fit(X_train, y_train)
plt.plot(-hgbt.validation_score_)
plt.xlabel("number of iterations")
plt.ylabel("root mean squared error")
@@ -169,10 +169,10 @@
# %%
# We can then overwrite the value for `max_iter` to a reasonable value and avoid
# the extra computational cost of the inner validation. In this case, rounding
-# up the number of iterations to 600 may account for variability of the training
+# up the number of iterations to 400 may account for variability of the training
# set:
-common_params["max_iter"] = 600
+common_params["max_iter"] = 400
common_params["early_stopping"] = False
hgbt = HistGradientBoostingRegressor(**common_params)
From 70c021f2279b8d202be5b3cd6a68d88eee9dbf38 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 16:38:03 +0100
Subject: [PATCH 29/52] Use test set for plots and add generate_missing_values
function
---
examples/ensemble/plot_hgbt_regression.py | 101 +++++++++-------------
1 file changed, 43 insertions(+), 58 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index c4ddadd9fffdf..607041397e38f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -196,52 +196,44 @@
import numpy as np
-from sklearn.model_selection import TimeSeriesSplit, cross_validate
+from sklearn.metrics import root_mean_squared_error
+
+rng = np.random.RandomState(42)
+first_week = slice(0, 336) # first week in the test set as 7 * 48 = 336
+missing_fraction_list = [0, 0.02, 0.05]
-np.random.seed(42)
-ts_cv = TimeSeriesSplit(n_splits=5, gap=48, max_train_size=10000, test_size=1000)
-train_0, test_0 = next(ts_cv.split(df))
-last_days = slice(-192, None)
-total_cells = X.shape[0] * X.shape[1]
-missing_fraction_list = [0, 0.01, 0.03]
+def generate_missing_values(X, missing_fraction):
+ total_cells = X.shape[0] * X.shape[1]
+ num_missing_cells = int(total_cells * missing_fraction)
+ row_indices = rng.choice(X.shape[0], num_missing_cells, replace=True)
+ col_indices = rng.choice(X.shape[1], num_missing_cells, replace=True)
+ X_missing = X.copy()
+ X_missing.iloc[row_indices, col_indices] = np.nan
+ return X_missing
+
fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+ax.plot(y_test.values[first_week], label="Actual transfer")
for missing_fraction in missing_fraction_list:
- num_missing_cells = int(total_cells * missing_fraction)
- row_indices = np.random.choice(X.shape[0], num_missing_cells, replace=True)
- col_indices = np.random.choice(X.shape[1], num_missing_cells, replace=True)
- X = df.drop(columns=["transfer", "class"])
- X.iloc[row_indices, col_indices] = np.nan
-
- hgbt.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt.predict(X.iloc[test_0])
- cv_results = cross_validate(
- hgbt,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
- )
- rmse = -cv_results["test_score"]
+ X_missing = generate_missing_values(X_train, missing_fraction)
+ hgbt.fit(X_missing, y_train)
+ y_pred = hgbt.predict(X_test[first_week])
+ rmse = root_mean_squared_error(y_test[first_week], y_pred)
ax.plot(
- hgbt_predictions[last_days],
- label=(
- f"missing_fraction={missing_fraction}, RMSE={rmse.mean():.2f} +/-"
- f" {rmse.std():.2f}"
- ),
+ y_pred[first_week],
+ label=f"missing_fraction={missing_fraction}, RMSE={rmse:.2f}",
alpha=0.5,
)
ax.set(
title="Daily energy transfer predictions on data with MCAR values",
- xticks=[(i + 0.25) * 48 for i in range(4)],
- xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xticks=[(i + 0.2) * 48 for i in range(7)],
+ xticklabels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
xlabel="Time of the week",
ylabel="Normalized energy transfer",
)
-_ = ax.legend()
+_ = ax.legend(loc="lower right")
# %%
# Support for quantile loss
@@ -252,55 +244,44 @@
# percentiles can provide a 90% prediction interval, i.e. the range within which
# we expect the true value to fall with 90% probability.
-from sklearn.metrics import make_scorer, mean_pinball_loss
+from sklearn.metrics import mean_pinball_loss
quantiles = [0.95, 0.05]
predictions = []
-X = df.drop(columns=["transfer", "class"]) # reset X
fig, ax = plt.subplots(figsize=(12, 6))
-ax.plot(y.iloc[test_0].values[last_days], label="Actual transfer")
+ax.plot(y_test.values[first_week], label="Actual transfer")
for quantile in quantiles:
hgbt_quantile = HistGradientBoostingRegressor(
loss="quantile", quantile=quantile, **common_params
)
- hgbt_quantile.fit(X.iloc[train_0], y.iloc[train_0])
- hgbt_predictions = hgbt_quantile.predict(X.iloc[test_0])
-
- predictions.append(hgbt_predictions)
- cv_results = cross_validate(
- hgbt_quantile,
- X,
- y,
- cv=ts_cv,
- scoring=make_scorer(mean_pinball_loss, alpha=quantile),
- )
- score = cv_results["test_score"]
+ hgbt_quantile.fit(X_train, y_train)
+ y_pred = hgbt_quantile.predict(X_test[first_week])
+
+ predictions.append(y_pred)
+ score = mean_pinball_loss(y_test[first_week], y_pred)
ax.plot(
- hgbt_predictions[last_days],
- label=(
- f"quantile={quantile}, pinball loss={score.mean():.3f} +/-"
- f" {score.std():.3f}"
- ),
+ y_pred[first_week],
+ label=f"quantile={quantile}, pinball loss={score:.2f}",
alpha=0.5,
)
ax.fill_between(
- range(len(predictions[0][last_days])),
- predictions[0][last_days],
- predictions[1][last_days],
+ range(len(predictions[0][first_week])),
+ predictions[0][first_week],
+ predictions[1][first_week],
color=colors[0],
alpha=0.1,
)
ax.set(
title="Daily energy transfer predictions with quantile loss",
- xticks=[(i + 0.25) * 48 for i in range(4)],
- xticklabels=["Tue", "Wed", "Thu", "Fri"],
+ xticks=[(i + 0.2) * 48 for i in range(7)],
+ xticklabels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
xlabel="Time of the week",
ylabel="Normalized energy transfer",
)
-_ = ax.legend()
+_ = ax.legend(loc="lower right")
# %%
# Keep in mind that those predicted percentiles are just estimations from a
@@ -387,6 +368,10 @@
# Indeed, we can verify that the predictive quality of the model is not
# significantly degraded by introducing the monotonic constraints:
+from sklearn.model_selection import TimeSeriesSplit, cross_validate
+
+ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)
+
cv_results = cross_validate(
hgbt_no_cst,
X,
From 5cf52c27b3a328a0e65638036e9bd5db470472a7 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 16:44:53 +0100
Subject: [PATCH 30/52] Reference the problem of coverage
---
examples/ensemble/plot_gradient_boosting_quantile.py | 1 +
examples/ensemble/plot_hgbt_regression.py | 3 +++
2 files changed, 4 insertions(+)
diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py
index 41378db704600..a01f0d2d1e8b6 100644
--- a/examples/ensemble/plot_gradient_boosting_quantile.py
+++ b/examples/ensemble/plot_gradient_boosting_quantile.py
@@ -192,6 +192,7 @@ def highlight_min(x):
# (underestimation for this asymmetric noise) but is also naturally robust to
# outliers and overfits less.
#
+# .. _calibration-section:
# Calibration of the confidence interval
# --------------------------------------
#
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 607041397e38f..66ebc598c40b5 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -284,6 +284,9 @@ def generate_missing_values(X, missing_fraction):
_ = ax.legend(loc="lower right")
# %%
+# We observe a tendency to over-estimate the energy transfer. This could be
+# quantitatively confirmed by computing empirical coverage numbers as done in
+# the :ref:`calibration of confidence intervals section `.
# Keep in mind that those predicted percentiles are just estimations from a
# model. One can still improve the quality of such estimations by:
#
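
The empirical coverage check referenced in this patch could be sketched as below. It assumes the variables from the quantile section of the example (`predictions` holding the 0.95 and 0.05 quantile predictions in that order, plus `y_test` and `first_week`), and illustrates the idea rather than reproducing code from the patch.

import numpy as np

# Fraction of observed transfers that fall inside the 90% interval
# formed by the 5th and 95th percentile predictions; values close to
# 0.9 would indicate well-calibrated intervals.
y_hi, y_lo = predictions  # appended with quantiles = [0.95, 0.05]
y_true = y_test.values[first_week]
coverage = np.mean((y_true >= y_lo) & (y_true <= y_hi))
print(f"Empirical coverage of the 90% interval: {coverage:.1%}")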
From 214a0838c2ae182b351dd1f9a25791a41ec9babe Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 13 Nov 2023 16:46:33 +0100
Subject: [PATCH 31/52] Fix typo
---
examples/ensemble/plot_hgbt_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 66ebc598c40b5..19f14cc3551f9 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -396,6 +396,6 @@ def generate_missing_values(X, missing_fraction):
print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
# %%
-# That being said, notice the comparison is between to different models that may
+# That being said, notice the comparison is between two different models that may
# be optimized by a different combination of hyperparameters. That is the reason
# why we do no use the `common_params` in this section as done before.
From 64ff62960b53b36aea06a831ff76258b62258d94 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:45:34 +0100
Subject: [PATCH 32/52] Apply suggestions from code review
Co-authored-by: Guillaume Lemaitre
---
examples/ensemble/plot_hgbt_regression.py | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 19f14cc3551f9..fc8b07cce518a 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -14,11 +14,10 @@
The top usability features of HGBT models are:
1. :ref:`categorical_support_gbdt` (see
- :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
1. Early stopping.
1. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-1. Support for several losses such as the :ref:`Quantile loss
- `.
+1. Support for several losses such as the :ref:`Quantile loss `.
1. :ref:`monotonic_cst_gbdt`.
1. :ref:`interaction_cst_hgbt`.
@@ -59,7 +58,7 @@
# This particular dataset has a stepwise constant target for the first 17,760
# samples:
-y[:17760].unique()
+y[:17_760].unique()
# %%
# Let us drop those entries and explore the hourly electricity transfer over
@@ -192,7 +191,7 @@
# The present example shows how HGBT regressions deal with values missing
# completely at random (MCAR), i.e. the missingness does not depend on the
# observed data or the unobserved data. We can simulate such a scenario by
-# randomly replacing values from randomly selected features with `Nan` values.
+# randomly replacing values from randomly selected features with `nan` values.
import numpy as np
From 604283e0e7b1e6dd10a0db3376f0229f8a23def7 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 14 Nov 2023 17:13:20 +0100
Subject: [PATCH 33/52] Prefer ax instead of plt to plot
---
examples/ensemble/plot_hgbt_regression.py | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index fc8b07cce518a..1354751f6ba83 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -67,7 +67,7 @@
import matplotlib.pyplot as plt
import seaborn as sns
-df = electricity.frame.iloc[17760:]
+df = electricity.frame.iloc[17_760:]
X = df.drop(columns=["transfer", "class"])
y = df["transfer"]
@@ -160,10 +160,15 @@
hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
hgbt.fit(X_train, y_train)
+
+_, ax = plt.subplots()
plt.plot(-hgbt.validation_score_)
-plt.xlabel("number of iterations")
-plt.ylabel("root mean squared error")
-_ = plt.title(f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})")
+ax.set(
+ xlabel="number of iterations",
+ ylabel="root mean squared error",
+ title=f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})",
+)
+_ = ax.legend()
# %%
# We can then overwrite the value for `max_iter` to a reasonable value and avoid
From 11d165c614a5fd34b8854c91e4cc94f5de44fe3e Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 14 Nov 2023 17:31:12 +0100
Subject: [PATCH 34/52] Add brief interpretation of plot
---
examples/ensemble/plot_hgbt_regression.py | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 1354751f6ba83..54d5bf35c9f07 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -163,12 +163,11 @@
_, ax = plt.subplots()
plt.plot(-hgbt.validation_score_)
-ax.set(
+_ = ax.set(
xlabel="number of iterations",
ylabel="root mean squared error",
title=f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})",
)
-_ = ax.legend()
# %%
# We can then overwrite the value for `max_iter` to a reasonable value and avoid
@@ -204,7 +203,7 @@
rng = np.random.RandomState(42)
first_week = slice(0, 336) # first week in the test set as 7 * 48 = 336
-missing_fraction_list = [0, 0.02, 0.05]
+missing_fraction_list = [0, 0.01, 0.03]
def generate_missing_values(X, missing_fraction):
@@ -221,9 +220,10 @@ def generate_missing_values(X, missing_fraction):
ax.plot(y_test.values[first_week], label="Actual transfer")
for missing_fraction in missing_fraction_list:
- X_missing = generate_missing_values(X_train, missing_fraction)
- hgbt.fit(X_missing, y_train)
- y_pred = hgbt.predict(X_test[first_week])
+ X_train_missing = generate_missing_values(X_train, missing_fraction)
+ X_test_missing = generate_missing_values(X_test, missing_fraction)
+ hgbt.fit(X_train_missing, y_train)
+ y_pred = hgbt.predict(X_test_missing[first_week])
rmse = root_mean_squared_error(y_test[first_week], y_pred)
ax.plot(
y_pred[first_week],
@@ -240,6 +240,8 @@ def generate_missing_values(X, missing_fraction):
_ = ax.legend(loc="lower right")
# %%
+# As expected, the model degrades as the proportion of missing values increases.
+#
# Support for quantile loss
# =========================
#
From 3abb0c4658159a0ec2746b2a74dc54ac38d1f73b Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 21 Nov 2023 17:19:46 +0100
Subject: [PATCH 35/52] Revert use of numbered list
---
examples/ensemble/plot_hgbt_regression.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 54d5bf35c9f07..dd69757b5ed35 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -13,13 +13,13 @@
The top usability features of HGBT models are:
-1. :ref:`categorical_support_gbdt` (see
- :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
-1. Early stopping.
-1. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-1. Support for several losses such as the :ref:`Quantile loss `.
-1. :ref:`monotonic_cst_gbdt`.
-1. :ref:`interaction_cst_hgbt`.
+- :ref:`categorical_support_gbdt` (see
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+- Early stopping.
+- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+- Support for several losses such as the :ref:`Quantile loss `.
+- :ref:`monotonic_cst_gbdt`.
+- :ref:`interaction_cst_hgbt`.
This example aims at showcasing points 2-5 in a real life setting.
"""
From 7c8406820dfaeeb11f39dae1b2226860d183e78a Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Sat, 6 Jan 2024 20:22:17 +0100
Subject: [PATCH 36/52] Apply suggestions from code review
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_hgbt_regression.py | 58 +++++++++++------------
1 file changed, 29 insertions(+), 29 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index dd69757b5ed35..af43d8ee2ae24 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -1,27 +1,27 @@
"""
-=============================================================
-Usecase of advanced features in Histogram Boosting Regression
-=============================================================
+===================================================================
+Use cases of advanced features in Histogram Gradient Boosting Trees
+===================================================================
-:ref:`histogram_based_gradient_boosting` (HGBT) models may be the most useful
+:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most useful
supervised learning models in scikit-learn. They are based on a modern gradient
boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models
-are more feature rich than -and often outperforms- alternative models like
-random forests, especially when the number of samples is larger than tens of
-thousands of samples (see
+are more feature rich than and often outperform alternative models like
+random forests, especially when the number of samples is larger than some ten
+thousands (see
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
The top usability features of HGBT models are:
-- :ref:`categorical_support_gbdt` (see
+1. Several available loss function for mean and quantile regression tasks, see :ref:`Quantile loss `.
+2. :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
-- Early stopping.
-- :ref:`nan_support_hgbt`, which avoids the need for an imputer.
-- Support for several losses such as the :ref:`Quantile loss `.
-- :ref:`monotonic_cst_gbdt`.
-- :ref:`interaction_cst_hgbt`.
+3. Early stopping.
+4. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
+5. :ref:`monotonic_cst_gbdt`.
+6. :ref:`interaction_cst_hgbt`.
-This example aims at showcasing points 2-5 in a real life setting.
+This example aims at showcasing all points except 2 and 6 in a real life setting.
"""
# Author: Arturo Amor
@@ -41,8 +41,8 @@
# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
# sample on the dataset has 5 fields: the day of week, the time stamp, the New
# South Wales electricity demand, the Victoria electricity demand. It is
-# originally a classification task, but here we use it as a regression where the
-# target is the scheduled electricity transfer between states.
+# originally a classification task, but here we use it for the regression task
+# to predict the scheduled electricity transfer between states.
from sklearn.datasets import fetch_openml
@@ -104,7 +104,7 @@
colors = sns.color_palette("colorblind")
fig, ax = plt.subplots(figsize=(10, 5))
-average_week_demand.plot(color=colors[0], label="ground truth", linewidth=2, ax=ax)
+average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
@@ -131,19 +131,19 @@
# %%
# With just a few iterations, HGBT models can achieve convergence (see
-# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
+# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`),
+# meaning that adding more trees does not improve the model anymore.
#
-# Instead of relying solely on `max_iter` to determine when to stop, the HGBT
-# implementations in scikit-learn support early stopping. With it, the model
-# uses a fraction of the training data as a validation set
+# Instead of relying on `max_iter` alone to determine when to stop, the HGBT
+# implementation in scikit-learn supports early stopping. With it, the model
+# uses a fraction of the training data as an internal validation set
# (`validation_fraction`) and stops training if the validation score does not
# improve (or degrades) after `n_iter_no_change` iterations up to a certain
# `tol`.
#
# Notice that there is a trade-off between `learning_rate` and `max_iter`:
-# Generally, smaller learning rates require more iterations to converge to the
-# minimum loss, while larger learning rates might converge faster but are at
-# risk of overfitting.
+# Generally, smaller learning rates are preferable but require more iterations to converge to the
+# minimum loss, while larger learning rates converge faster (less iterations/trees needed) but at the cost of a larger minimum loss.
#
# Indeed, a good practice is to tune the learning rate along with any other
# hyperparameters, fit the HGBT on the training set with a large enough value
@@ -181,7 +181,7 @@
# %%
# .. note:: The inner validation done during early stopping is not optimal for
-# time series with the implementation as of scikit-learn v1.3.
+# time series.
#
# Support for missing values
# ==========================
@@ -227,7 +227,7 @@ def generate_missing_values(X, missing_fraction):
rmse = root_mean_squared_error(y_test[first_week], y_pred)
ax.plot(
y_pred[first_week],
- label=f"missing_fraction={missing_fraction}, RMSE={rmse:.2f}",
+ label=f"missing_fraction={missing_fraction}, RMSE={rmse:.3f}",
alpha=0.5,
)
ax.set(
@@ -248,7 +248,7 @@ def generate_missing_values(X, missing_fraction):
# The quantile loss in regression enables a view of the variability or
# uncertainty of the target variable. For instance, predicting the 5th and 95th
# percentiles can provide a 90% prediction interval, i.e. the range within which
-# we expect the true value to fall with 90% probability.
+# we expect a new observed value to fall with 90% probability.
from sklearn.metrics import mean_pinball_loss
@@ -389,7 +389,7 @@ def generate_missing_values(X, missing_fraction):
scoring="neg_root_mean_squared_error",
)
rmse = -cv_results["test_score"]
-print(f"RMSE without constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
+print(f"RMSE without constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
cv_results = cross_validate(
hgbt_cst,
@@ -399,7 +399,7 @@ def generate_missing_values(X, missing_fraction):
scoring="neg_root_mean_squared_error",
)
rmse = -cv_results["test_score"]
-print(f"RMSE with constraints = {rmse.mean():.2f} +/- {rmse.std():.2f}")
+print(f"RMSE with constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
# %%
# That being said, notice the comparison is between two different models that may
From dcdf851d959884454739573dd2becc24dd9793ce Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Sat, 6 Jan 2024 20:48:04 +0100
Subject: [PATCH 37/52] Lint
---
examples/ensemble/plot_hgbt_regression.py | 29 +++++++++++++----------
1 file changed, 16 insertions(+), 13 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index af43d8ee2ae24..5be4014bcb95f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -3,17 +3,18 @@
Use cases of advanced features in Histogram Gradient Boosting Trees
===================================================================
-:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most useful
-supervised learning models in scikit-learn. They are based on a modern gradient
-boosting implementation comparable to LightGBM and XGBoost. As such, HGBT models
-are more feature rich than and often outperform alternative models like
-random forests, especially when the number of samples is larger than some ten
-thousands (see
+:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most
+useful supervised learning models in scikit-learn. They are based on a modern
+gradient boosting implementation comparable to LightGBM and XGBoost. As such,
+HGBT models are more feature rich than and often outperform alternative models
+like random forests, especially when the number of samples is larger than some
+tens of thousands (see
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
The top usability features of HGBT models are:
-1. Several available loss function for mean and quantile regression tasks, see :ref:`Quantile loss `.
+1. Several available loss functions for mean and quantile regression tasks, see
+ :ref:`Quantile loss `.
2. :ref:`categorical_support_gbdt` (see
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
3. Early stopping.
@@ -21,7 +22,8 @@
5. :ref:`monotonic_cst_gbdt`.
6. :ref:`interaction_cst_hgbt`.
-This example aims at showcasing all points except 2 and 6 in a real life setting.
+This example aims at showcasing all points except 2 and 6 in a real life
+setting.
"""
# Author: Arturo Amor
@@ -142,8 +144,9 @@
# `tol`.
#
# Notice that there is a trade-off between `learning_rate` and `max_iter`:
-# Generally, smaller learning rates are preferable but require more iterations to converge to the
-# minimum loss, while larger learning rates converge faster (less iterations/trees needed) but at the cost of a larger minimum loss.
+# Generally, smaller learning rates are preferable but require more iterations
+# to converge to the minimum loss, while larger learning rates converge faster
+# (fewer iterations/trees needed) but at the cost of a larger minimum loss.
#
# Indeed, a good practice is to tune the learning rate along with any other
# hyperparameters, fit the HGBT on the training set with a large enough value
@@ -402,6 +405,6 @@ def generate_missing_values(X, missing_fraction):
print(f"RMSE with constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
# %%
-# That being said, notice the comparison is between two different models that may
-# be optimized by a different combination of hyperparameters. That is the reason
-# why we do no use the `common_params` in this section as done before.
+# That being said, notice the comparison is between two different models that
+# may be optimized by a different combination of hyperparameters. That is the
+# reason why we do not use the `common_params` in this section as done before.
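
The `learning_rate`/`max_iter` trade-off restated in this patch can be illustrated with a self-contained sketch on synthetic data; the exact tree counts printed depend on the random data and are merely indicative.

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X_syn = rng.uniform(size=(5_000, 4))
y_syn = np.sin(X_syn @ rng.normal(size=4)) + rng.normal(scale=0.1, size=5_000)

# With early stopping enabled, a smaller learning rate typically grows
# more trees before the validation loss stops improving.
for lr in (0.05, 0.3):
    model = HistGradientBoostingRegressor(
        learning_rate=lr, max_iter=1_000, early_stopping=True, random_state=0
    ).fit(X_syn, y_syn)
    print(f"learning_rate={lr}: stopped after {model.n_iter_} trees")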
From ab0e21a44d4fbe87ab331d30061d0ebd9455abf5 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 16 Jan 2024 18:12:01 +0100
Subject: [PATCH 38/52] Fix FutureWarning
---
examples/ensemble/plot_hgbt_regression.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 5be4014bcb95f..65448b23d9681 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -109,7 +109,7 @@
average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
- hgbt = HistGradientBoostingRegressor(max_iter=max_iter)
+ hgbt = HistGradientBoostingRegressor(max_iter=max_iter, categorical_features=None)
hgbt.fit(X_train, y_train)
y_pred = hgbt.predict(X_test)
@@ -158,6 +158,7 @@
"learning_rate": 0.3,
"validation_fraction": 0.2,
"random_state": 42,
+ "categorical_features": None,
"scoring": "neg_root_mean_squared_error",
}
@@ -342,8 +343,10 @@ def generate_missing_values(X, missing_fraction):
"vicdemand": -1,
"vicprice": -1,
}
-hgbt_no_cst = HistGradientBoostingRegressor().fit(X, y)
-hgbt_cst = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst).fit(X, y)
+hgbt_no_cst = HistGradientBoostingRegressor(categorical_features=None).fit(X, y)
+hgbt_cst = HistGradientBoostingRegressor(
+ monotonic_cst=monotonic_cst, categorical_features=None
+).fit(X, y)
fig, ax = plt.subplots(nrows=2, figsize=(15, 10))
disp = PartialDependenceDisplay.from_estimator(
From c4d1b3b7938ba576eda7e860ee362a34ef9cae8b Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 16 Jan 2024 18:26:37 +0100
Subject: [PATCH 39/52] List of features as suggested by Christian
---
examples/ensemble/plot_hgbt_regression.py | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 65448b23d9681..ca2a68f018f01 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -41,10 +41,15 @@
# The dataset (originally named ELEC2) contains 45,312 instances dated from 7
# May 1996 to 5 December 1998. Each sample of the dataset refers to a period of
# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
-# sample on the dataset has 5 fields: the day of week, the time stamp, the New
-# South Wales electricity demand, the Victoria electricity demand. It is
-# originally a classification task, but here we use it for the regression task
-# to predict the scheduled electricity transfer between states.
+# sample in the dataset has 7 columns:
+# - date: from 7 May 1996 to 5 December 1998. Normalized between 0 and 1;
+# - day: day of week (1-7);
+# - period: half hour intervals over 24 hours. Normalized between 0 and 1;
+# - nswprice/nswdemand: electricity price/demand of New South Wales;
+# - vicprice/vicdemand: electricity price/demand of Victoria.
+#
+# It is originally a classification task, but here we use it for the regression
+# task to predict the scheduled electricity transfer between states.
from sklearn.datasets import fetch_openml
From 49587ab0b826b12acb1e8717f78b2d9d51c50ffa Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 17 Jan 2024 11:22:02 +0100
Subject: [PATCH 40/52] Simplify code
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_hgbt_regression.py | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index ca2a68f018f01..230c1a20b62ab 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -57,15 +57,12 @@
name="electricity", version=1, as_frame=True, parser="pandas"
)
df = electricity.frame
-X = df.drop(columns=["transfer", "class"])
-y = df["transfer"]
-X
# %%
# This particular dataset has a stepwise constant target for the first 17,760
# samples:
-y[:17_760].unique()
+df["transfer"][:17_760,].unique()
# %%
# Let us drop those entries and explore the hourly electricity transfer over
From 42c17427acccf6ffc67fc6d95d49566c3f5f48f3 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 11:50:02 +0100
Subject: [PATCH 41/52] Print simple stats
---
examples/ensemble/plot_hgbt_regression.py | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 230c1a20b62ab..67b15d8be360f 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -62,7 +62,7 @@
# This particular dataset has a stepwise constant target for the first 17,760
# samples:
-df["transfer"][:17_760,].unique()
+df["transfer"][:17_760].unique()
# %%
# Let us drop those entries and explore the hourly electricity transfer over
@@ -100,12 +100,17 @@
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
-max_iter_list = [5, 50]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
+
+print(f"Training sample size: {X_train.shape[0]}")
+print(f"Test sample size: {X_test.shape[0]}")
+print(f"Number of features: {X_train.shape[1]}")
+
+# %%
+max_iter_list = [5, 50]
average_week_demand = (
df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
)
-
colors = sns.color_palette("colorblind")
fig, ax = plt.subplots(figsize=(10, 5))
average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
From 37bb831e558c8f16bd1b0557f641340e7cd77c6c Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:10:03 +0100
Subject: [PATCH 42/52] Fix indentation
---
examples/ensemble/plot_hgbt_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 67b15d8be360f..adce5c6b36e1a 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -16,7 +16,7 @@
1. Several available loss functions for mean and quantile regression tasks, see
:ref:`Quantile loss `.
2. :ref:`categorical_support_gbdt` (see
- :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
+ :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`).
3. Early stopping.
4. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
5. :ref:`monotonic_cst_gbdt`.
From d1b809a5e8ba56c4dd5a7126039b935818446c55 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:10:57 +0100
Subject: [PATCH 43/52] Use programmatic way to round up n_iter
---
examples/ensemble/plot_hgbt_regression.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index adce5c6b36e1a..92efcb53e365a 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -182,11 +182,12 @@
# %%
# We can then overwrite the value for `max_iter` to a reasonable value and avoid
-# the extra computational cost of the inner validation. In this case, rounding
-# up the number of iterations to 400 may account for variability of the training
-# set:
+# the extra computational cost of the inner validation. Rounding up the number
+# of iterations may account for variability of the training set:
-common_params["max_iter"] = 400
+import math
+
+common_params["max_iter"] = math.ceil(hgbt.n_iter_ / 100) * 100
common_params["early_stopping"] = False
hgbt = HistGradientBoostingRegressor(**common_params)
From 5b1875528d51194a450029e5782896b4058c47d5 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:11:25 +0100
Subject: [PATCH 44/52] Set random state for deterministic results
---
examples/ensemble/plot_hgbt_regression.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 92efcb53e365a..b6825660a8b56 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -351,9 +351,11 @@ def generate_missing_values(X, missing_fraction):
"vicdemand": -1,
"vicprice": -1,
}
-hgbt_no_cst = HistGradientBoostingRegressor(categorical_features=None).fit(X, y)
+hgbt_no_cst = HistGradientBoostingRegressor(
+ categorical_features=None, random_state=42
+).fit(X, y)
hgbt_cst = HistGradientBoostingRegressor(
- monotonic_cst=monotonic_cst, categorical_features=None
+ monotonic_cst=monotonic_cst, categorical_features=None, random_state=42
).fit(X, y)
fig, ax = plt.subplots(nrows=2, figsize=(15, 10))
From 9499e611a7b2159c0685a0195cdaad37902f55ac Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:24:09 +0100
Subject: [PATCH 45/52] Add explanation on time-aware cross validation
---
examples/ensemble/plot_hgbt_regression.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index b6825660a8b56..71a1109db7b84 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -391,11 +391,15 @@ def generate_missing_values(X, missing_fraction):
# %%
# Indeed, we can verify that the predictive quality of the model is not
-# significantly degraded by introducing the monotonic constraints:
+# significantly degraded by introducing the monotonic constraints. For such
+# purpose we use :class:`~sklearn.model_selection.TimeSeriesSplit`
+# cross-validation to estimate the variance of the test score. By doing so we
+# guarantee that the training data does not come after the testing data, which is
+# crucial when dealing with data that have a temporal relationship.
from sklearn.model_selection import TimeSeriesSplit, cross_validate
-ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)
+ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336) # a week has 336 samples
cv_results = cross_validate(
hgbt_no_cst,
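
To make the temporal guarantee of this splitter concrete, the sketch below checks on a dummy array (sized to mimic the example's one-day gap of 48 samples and one-week test folds of 336 samples) that every training fold strictly precedes its test fold.

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X_dummy = np.zeros((2_000, 1))
ts_demo = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)

for fold, (train_idx, test_idx) in enumerate(ts_demo.split(X_dummy)):
    # The 48-sample gap (one day) separates training from testing.
    assert train_idx.max() + 48 < test_idx.min()
    print(
        f"fold {fold}: train ends at {train_idx.max()}, "
        f"test spans [{test_idx.min()}, {test_idx.max()}]"
    )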
From 3b1789e4547db60f0f05a6483e3ad268e4037285 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 17 Jan 2024 15:28:07 +0100
Subject: [PATCH 46/52] Add comment on overconstraining feature
---
examples/ensemble/plot_hgbt_regression.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 71a1109db7b84..8a6e7d4128cd0 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -390,7 +390,10 @@ def generate_missing_values(X, missing_fraction):
_ = plt.legend()
# %%
-# Indeed, we can verify that the predictive quality of the model is not
+# Observe that `nswdemand` seems already monotonic without constraint. This is a
+# good example showing that the constraint on such a feature is "over-constraining".
+#
+# Additionally, we can verify that the predictive quality of the model is not
# significantly degraded by introducing the monotonic constraints. For such
# purpose we use :class:`~sklearn.model_selection.TimeSeriesSplit`
# cross-validation to estimate the variance of the test score. By doing so we
From d972fae3ec6653ef14aa121f6e995503f6fec738 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 23 Jan 2024 10:50:10 +0100
Subject: [PATCH 47/52] Apply suggestion from Guillaume
Co-authored-by: Guillaume Lemaitre
---
examples/ensemble/plot_hgbt_regression.py | 22 ++++++----------------
1 file changed, 6 insertions(+), 16 deletions(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 8a6e7d4128cd0..26531bc8fd9a6 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -400,28 +400,18 @@ def generate_missing_values(X, missing_fraction):
# guarantee that the training data does not come after the testing data, which is
# crucial when dealing with data that have a temporal relationship.
+from sklearn.metrics import make_scorer, root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, cross_validate
ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336) # a week has 336 samples
+scorer = make_scorer(root_mean_squared_error)
-cv_results = cross_validate(
- hgbt_no_cst,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
-)
-rmse = -cv_results["test_score"]
+cv_results = cross_validate(hgbt_no_cst, X, y, cv=ts_cv, scoring=scorer)
+rmse = cv_results["test_score"]
print(f"RMSE without constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
-cv_results = cross_validate(
- hgbt_cst,
- X,
- y,
- cv=ts_cv,
- scoring="neg_root_mean_squared_error",
-)
-rmse = -cv_results["test_score"]
+cv_results = cross_validate(hgbt_cst, X, y, cv=ts_cv, scoring=scorer)
+rmse = cv_results["test_score"]
print(f"RMSE with constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")
# %%
From 9f49ad5affa408b503067018782522019b9c4f9e Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 23 Jan 2024 10:55:27 +0100
Subject: [PATCH 48/52] Update examples/ensemble/plot_adaboost_regression.py
Co-authored-by: Christian Lorentzen
---
examples/ensemble/plot_adaboost_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index 98d3699ab161c..916d17addff18 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -10,7 +10,7 @@
detail.
See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
-example showcasing the benefits of using more robust regressions such as
+example showcasing the benefits of using more efficient regression models such as
:class:`~ensemble.HistGradientBoostingRegressor`.
.. [1] `H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
From d333c6d8dd276e935fe7ce96d68e17e809dc4d14 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Tue, 23 Jan 2024 10:56:10 +0100
Subject: [PATCH 49/52] Format
---
examples/ensemble/plot_adaboost_regression.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index 916d17addff18..8ba01df63b561 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -10,8 +10,8 @@
detail.
See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for an
-example showcasing the benefits of using more efficient regression models such as
-:class:`~ensemble.HistGradientBoostingRegressor`.
+example showcasing the benefits of using more efficient regression models such
+as :class:`~ensemble.HistGradientBoostingRegressor`.
.. [1] `H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
`_
From c4a79e64f88e47e708e81f117f55af70deb2010c Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Mon, 19 Feb 2024 14:46:20 +0100
Subject: [PATCH 50/52] Update examples/ensemble/plot_hgbt_regression.py
Co-authored-by: Guillaume Lemaitre
---
examples/ensemble/plot_hgbt_regression.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 26531bc8fd9a6..796d2d17a76b2 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -325,7 +325,7 @@ def generate_missing_values(X, missing_fraction):
# bias. Monotonic constraints can also be used to enforce specific regulatory
# requirements, ensure compliance and align with ethical considerations.
#
-# In the present example, the policy of transfering energy from Victoria to New
+# In the present example, the policy of transferring energy from Victoria to New
# South Wales is meant to alleviate price fluctuations, meaning that the model
# predictions have to enforce such goal, i.e. transfer should increase with
# price and demand in New South Wales, but also decrease with price and demand
From 1010eccf26314fc7f51e4f217a93c0efa2f60e12 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 19 Feb 2024 14:54:30 +0100
Subject: [PATCH 51/52] Fix random_state
---
examples/ensemble/plot_hgbt_regression.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 26531bc8fd9a6..38cad137e35b0 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -116,7 +116,9 @@
average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)
for idx, max_iter in enumerate(max_iter_list):
- hgbt = HistGradientBoostingRegressor(max_iter=max_iter, categorical_features=None)
+ hgbt = HistGradientBoostingRegressor(
+ max_iter=max_iter, categorical_features=None, random_state=42
+ )
hgbt.fit(X_train, y_train)
y_pred = hgbt.predict(X_test)
From 31db489ec87041b3974528eea3264cdf569562c6 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Mon, 19 Feb 2024 14:54:42 +0100
Subject: [PATCH 52/52] Wording as suggested by Guillaume
---
examples/ensemble/plot_hgbt_regression.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/examples/ensemble/plot_hgbt_regression.py b/examples/ensemble/plot_hgbt_regression.py
index 38cad137e35b0..3d18064e7e489 100644
--- a/examples/ensemble/plot_hgbt_regression.py
+++ b/examples/ensemble/plot_hgbt_regression.py
@@ -143,7 +143,9 @@
# %%
# With just a few iterations, HGBT models can achieve convergence (see
# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`),
-# meaning that adding more trees does not improve the model anymore.
+# meaning that adding more trees does not improve the model anymore. In the
+# figure above, 5 iterations are not enough to produce accurate predictions.
+# With 50 iterations, the model already does a good job.
#
# Instead of relying on `max_iter` alone to determine when to stop, the HGBT
# implementation in scikit-learn supports early stopping. With it, the model