From 971478b1f7209aa8e265a64b2a894ed6b7041a40 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 11:44:55 -0700 Subject: [PATCH 01/13] DOC rework the example presenting the regularization path of Lasso, Lasso-LARS, and Elastic Net --- .../plot_lasso_lasso_lars_elasticnet_path.py | 164 +++++++++++++----- 1 file changed, 116 insertions(+), 48 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 44ae64c4c2811..3a8292b439dd7 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -4,31 +4,104 @@ ======================================== This example shows how to compute the "paths" of coefficients along the Lasso, -Lasso-LARS, and Elastic Net regularization paths. In other words, it shows the -relationship between the regularization parameter (alpha) and the coefficients. +Lasso-LARS, and Elastic Net regularization paths. It illustrates the +relationship between the regularization parameter :math:`\\alpha` +and the coefficients :math:`w`. + +When performing linear regression on a given dataset +:math:`(X, y)`, regularization terms can be added to +control the model's complexity. +Scikit-learn provides the following regularization techniques: + +- :func:`~sklearn.linear_model.Lasso` +- :func:`~sklearn.linear_model.LassoLars` +- :func:`~sklearn.linear_model.ElasticNet` with default `l1_ratio=0.5` + +Mathematically, these are formulated by minimising the constrained +least-squares penalty: + +.. math:: + + \\min_{w} \\frac{1}{2n_{\\operatorname{sample}}} + \\vert \\vert Xw - y \\vert \\vert^2_2 + + \\left\\{ + \\begin{array}{cl} + \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ + \\frac{\\alpha}{2} \\vert \\vert w \\vert \\vert_1 + + \\frac{\\alpha}{4} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ + \\end{array} + \\right. + +Thus, the Lasso model includes the :math:`\\ell^1`-norm of the regression +coefficients in the penalty, while the Elastic Net model +incorporates both :math:`\\ell^1`- and :math:`\\ell^2`-norms. + +Any solution to this optimisation problem depends on :math:`\\alpha`. +For example, in Lasso, a large :math:`\\alpha` forces the least-squares +penalty to stay small, which in turn keeps the norm +:math:`\\vert \\vert w \\vert \\vert_1` +small. Conversely, a smaller :math:`\\alpha` allows the norm +:math:`\\vert \\vert w \\vert \\vert_1` +to grow larger. + +This suggests that the regression coefficients :math:`w` evolve as +:math:`\\alpha` increases, and we are interested in knowing +:math:`w` across a range of :math:`\\alpha` values. This is known +as the **regularization path**: a list of :math:`w` values corresponding to +different :math:`\\alpha` values, ranging from small to large. +In this example, we plot the regularization paths to show how the sizes of the +coefficients change as the regularization parameter increases. + +We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot +the regression coefficients for Lasso and Elastic Net. 
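+
+To make the idea of a regularization path concrete, one could simply fit a
+:class:`~sklearn.linear_model.Lasso` estimator at a handful of :math:`\\alpha`
+values and stack the resulting coefficient vectors (an illustrative sketch only;
+the dedicated path functions introduced below compute this far more efficiently)::
+
+    import numpy as np
+    from sklearn.datasets import load_diabetes
+    from sklearn.linear_model import Lasso
+
+    X, y = load_diabetes(return_X_y=True)
+    X /= X.std(axis=0)  # standardize, as done later in this example
+
+    alphas = np.logspace(-3, 1, num=5)
+    manual_path = np.array(
+        [Lasso(alpha=a, max_iter=10_000).fit(X, y).coef_ for a in alphas]
+    )
+    # one row of coefficients per alpha value; each column traces one w_i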
Scikit-learn provides the
+following functions to compute multiple :math:`w` values for various
+:math:`\\alpha` values efficiently:
+
+- :func:`~sklearn.linear_model.lasso_path`
+- :func:`~sklearn.linear_model.lars_path`
+- :func:`~sklearn.linear_model.enet_path`
+
+The :func:`~sklearn.linear_model.lasso_path` and
+:func:`~sklearn.linear_model.enet_path` functions compute
+:math:`w` with coordinate descent: for each entry of :math:`w`,
+the function solves for its optimal value while keeping the others
+fixed. Since the algorithm iterates until convergence,
+Lasso doesn't operate in a fixed number of steps based solely
+on the dataset's size, which can make it take longer to run.
+In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps.
+
+The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm
+(see [1]_) to compute the Lasso solution in
+:math:`\\min \\left\\{
+n_{\\operatorname{sample}}-1,n_{\\operatorname{feature}}
+\\right\\}`
+steps. This provides an efficient algorithm for computing the entire Lasso path, and
+is implemented as :func:`~sklearn.linear_model.LassoLars`
+and :func:`~sklearn.linear_model.lars_path`.
+
+We now present the visualisation of the regularization path for the diabetes dataset.
+Each model is represented by 10 curves, corresponding to the number of features in the
+dataset. Each curve shows how a particular coefficient :math:`w_i` changes as
+:math:`\\alpha` increases.
+
+- In the "Lasso vs LARS Paths" visual, the Lasso and LARS paths appear identical because
+  both models solve the same constrained problem. However, LARS reaches the solution
+  faster than Lasso.
+
+- The "Lasso vs Elastic-Net Paths" visual is more notable. Elastic Net's coefficients
+  tend to have smaller absolute values than those of Lasso. Additionally, Elastic Net
+  maintains more non-zero coefficients than Lasso towards the end. This demonstrates
+  how the :math:`\\ell^1`-norm constraint encourages sparsity in the solution, while
+  combining it with the :math:`\\ell^2`-norm provides a balanced compromise.
+
+
+
+.. rubric:: References
+
+.. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R.,
+   Friedman J., Algorithm 3.2, p. 74, 2008.
-Lasso and Lasso-LARS impose a sparsity constraint on the coefficients,
-encouraging some of them to be zero. Elastic Net is a generalization of
-Lasso that adds an L2 penalty term to the L1 penalty term. This allows for
-some coefficients to be non-zero while still encouraging sparsity.
-Lasso and Elastic Net use a coordinate descent method to compute the paths, while
-Lasso-LARS uses the LARS algorithm to compute the paths.
-
-The paths are computed using :func:`~sklearn.linear_model.lasso_path`,
-:func:`~sklearn.linear_model.lars_path`, and :func:`~sklearn.linear_model.enet_path`.
-
-The results show different comparison plots:
-
-- Compare Lasso and Lasso-LARS
-- Compare Lasso and Elastic Net
-- Compare Lasso with positive Lasso
-- Compare LARS and Positive LARS
-- Compare Elastic Net and positive Elastic Net
-
-Each plot shows how the model coefficients vary as the regularization strength changes,
-offering insight into the behavior of these models
-under different constraints.
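+
+A quick way to convince oneself that Lasso and Lasso-LARS indeed solve the same
+problem is to fit both estimators at a single, common :math:`\\alpha` and compare
+the coefficients; a minimal sketch, independent of the code below::
+
+    import numpy as np
+    from sklearn.datasets import load_diabetes
+    from sklearn.linear_model import Lasso, LassoLars
+
+    X, y = load_diabetes(return_X_y=True)
+    X /= X.std(axis=0)
+
+    alpha = 0.1
+    coef_cd = Lasso(alpha=alpha, max_iter=10_000).fit(X, y).coef_
+    coef_lars = LassoLars(alpha=alpha).fit(X, y).coef_
+    print(np.abs(coef_cd - coef_lars).max())  # expected to be tiny (solver tolerance)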
""" # Authors: The scikit-learn developers @@ -75,62 +148,57 @@ # Display results plt.figure(1) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_lasso, coef_lars, c in zip(coefs_lasso, coefs_lars, colors): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c=c) - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c=c) +for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): + l1 = plt.semilogx(alphas_lasso, coef_lasso, c='#0072B2') + l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("Lasso and LARS Paths") +plt.title("Lasso vs LARS Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") plt.axis("tight") plt.figure(2) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors): - l1 = plt.semilogx(alphas_lasso, coef_l, c=c) - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c=c) +for coef_l, coef_e in zip(coefs_lasso, coefs_enet): + l1 = plt.semilogx(alphas_lasso, coef_l, c='#0072B2') + l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("Lasso and Elastic-Net Paths") +plt.title("Lasso vs Elastic-Net Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") plt.axis("tight") - plt.figure(3) -for coef_l, coef_pl, c in zip(coefs_lasso, coefs_positive_lasso, colors): - l1 = plt.semilogy(alphas_lasso, coef_l, c=c) - l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c=c) +for coef_l, coef_pl in zip(coefs_lasso, coefs_positive_lasso): + l1 = plt.semilogy(alphas_lasso, coef_l, c='#0072B2') + l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("Lasso and positive Lasso") +plt.title("Lasso vs Positive Lasso Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "positive Lasso"), loc="lower right") plt.axis("tight") - plt.figure(4) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_lars, coef_positive_lars, c in zip(coefs_lars, coefs_positive_lars, colors): - l1 = plt.semilogx(alphas_lars, coef_lars, c=c) - l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c=c) +for coef_lars, coef_positive_lars in zip(coefs_lars, coefs_positive_lars): + l1 = plt.semilogx(alphas_lars, coef_lars, c='#0072B2') + l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("LARS and Positive LARS") +plt.title("LARS vs Positive LARS Paths") plt.legend((l1[-1], l2[-1]), ("LARS", "Positive LARS"), loc="lower right") plt.axis("tight") plt.figure(5) -for coef_e, coef_pe, c in zip(coefs_enet, coefs_positive_enet, colors): - l1 = plt.semilogx(alphas_enet, coef_e, c=c) - l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c=c) +for coef_e, coef_pe in zip(coefs_enet, coefs_positive_enet): + l1 = plt.semilogx(alphas_enet, coef_e, c='#0072B2') + l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("Elastic-Net and positive Elastic-Net") +plt.title("Elastic-Net vs Positive Elastic-Net Paths") plt.legend((l1[-1], l2[-1]), ("Elastic-Net", "positive Elastic-Net"), loc="lower right") plt.axis("tight") plt.show() From a24f1859a21a452c2f527a016171a203f4d0a3ca Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 12:03:32 -0700 Subject: [PATCH 02/13] Fix Linting --- 
.../linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 3a8292b439dd7..7296da8ef9a05 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -107,8 +107,6 @@ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause -from itertools import cycle - import matplotlib.pyplot as plt from sklearn.datasets import load_diabetes @@ -183,7 +181,9 @@ plt.figure(4) for coef_lars, coef_positive_lars in zip(coefs_lars, coefs_positive_lars): l1 = plt.semilogx(alphas_lars, coef_lars, c='#0072B2') - l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c='#D55E00') + l2 = plt.semilogx( + alphas_positive_lars, coef_positive_lars, linestyle="--", c='#D55E00' + ) plt.xlabel("alpha") plt.ylabel("coefficients") From edb9dba2bb5b63b8b2a21488b1bb81acbec35ee6 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 12:21:40 -0700 Subject: [PATCH 03/13] Fix Linting --- .../plot_lasso_lasso_lars_elasticnet_path.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 7296da8ef9a05..ffea03ec22020 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -147,8 +147,8 @@ plt.figure(1) for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c='#0072B2') - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c='#D55E00') + l1 = plt.semilogx(alphas_lasso, coef_lasso, c="#0072B2") + l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c="#D55E00") plt.xlabel("alpha") plt.ylabel("coefficients") @@ -158,8 +158,8 @@ plt.figure(2) for coef_l, coef_e in zip(coefs_lasso, coefs_enet): - l1 = plt.semilogx(alphas_lasso, coef_l, c='#0072B2') - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c='#D55E00') + l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") + l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c="#D55E00") plt.xlabel("alpha") plt.ylabel("coefficients") @@ -169,8 +169,8 @@ plt.figure(3) for coef_l, coef_pl in zip(coefs_lasso, coefs_positive_lasso): - l1 = plt.semilogy(alphas_lasso, coef_l, c='#0072B2') - l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c='#D55E00') + l1 = plt.semilogy(alphas_lasso, coef_l, c="#0072B2") + l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c="#D55E00") plt.xlabel("alpha") plt.ylabel("coefficients") @@ -180,9 +180,9 @@ plt.figure(4) for coef_lars, coef_positive_lars in zip(coefs_lars, coefs_positive_lars): - l1 = plt.semilogx(alphas_lars, coef_lars, c='#0072B2') + l1 = plt.semilogx(alphas_lars, coef_lars, c="#0072B2") l2 = plt.semilogx( - alphas_positive_lars, coef_positive_lars, linestyle="--", c='#D55E00' + alphas_positive_lars, coef_positive_lars, linestyle="--", c="#D55E00" ) plt.xlabel("alpha") @@ -193,8 +193,8 @@ plt.figure(5) for coef_e, coef_pe in zip(coefs_enet, coefs_positive_enet): - l1 = plt.semilogx(alphas_enet, coef_e, c='#0072B2') - l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c='#D55E00') + l1 = plt.semilogx(alphas_enet, coef_e, c="#0072B2") + l2 = 
plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c="#D55E00") plt.xlabel("alpha") plt.ylabel("coefficients") From baba6173488a4ee0b55a3986c307ddd94616d7a4 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 12:40:13 -0700 Subject: [PATCH 04/13] Fix Linting --- .../linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index ffea03ec22020..32a086ee51811 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -1,3 +1,6 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + """ ======================================== Lasso, Lasso-LARS, and Elastic Net paths @@ -101,12 +104,8 @@ .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., Algorithm 3.2, p. 74, 2008. - """ -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - import matplotlib.pyplot as plt from sklearn.datasets import load_diabetes From e697a16bad1c37343f8c052dab345b9b55cfa489 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 14:38:51 -0700 Subject: [PATCH 05/13] Implemented notebook style. --- .../plot_lasso_lasso_lars_elasticnet_path.py | 175 +++++++----------- 1 file changed, 68 insertions(+), 107 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 32a086ee51811..d554eb4310675 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -54,95 +54,70 @@ different :math:`\\alpha` values, ranging from small to large. In this example, we plot the regularization paths to show how the sizes of the coefficients change as the regularization parameter increases. - -We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot -the regression coefficients for Lasso and Elastic Net. Scikit-learn provides the -following functions to compute multiple :math:`w` values for various -:math:`\\alpha` values efficiently: - -- :func:`~sklearn.linear_model.lasso_path` -- :func:`~sklearn.linear_model.lars_path` -- :func:`~sklearn.linear_model.enet_path` - -The :func:`~sklearn.linear_model.lasso_path` and -:func:`~sklearn.linear_model.enet_path` functions compute -:math:`w` with coordinate decent: for each entry of :math:`w`, -the function solves for it optimal value while keeping the others -fixed. Since the algorithm iterates until convergence, -Lasso doesn't operate in a fixed number of steps based solely -on the dataset's size, which can make it take longer to run. -In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps. - -The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm -(see [1]_) to compute the Lasso solution in -:math:`\\min \\left\\{ -n_{\\operatorname{sample}}-1,n_{\\operatorname{feature}} -\\right\\}` -steps. This provides an efficient algorithm for computing the entire Lasso path, and -is implemented as :func:`~sklearn.linear_model.LassoLars` -and :func:`~sklearn.linear_model.lars_path`. - -We now present the visualisation of the regularization path for the diabetes dataset. -Each model is represented by 10 curves, corresponding to the number of features in the -dataset. 
Each curve shows how a particular coefficient :math:`w_i` changes as -:math:`\\alpha` increases. - -- In the "Lasso vs LARS Paths" visual, the Lasso and LARS paths appear identical because - both models solve the same constrained problem. However, LARS reaches the solution - faster than Lasso. - -- The "Lasso vs Elastic-Net Paths" visual is more notable. Elastic Net's coefficients - tend to have smaller absolute values than those of Lasso. Additionally, Elastic Net - maintains more non-zero coefficients than Lasso towards the end. This demonstrates - how the :math:`\\ell^1`-norm constraint encourages sparsity in the solution, while - combining it with the :math:`\\ell^2`-norm provides a balanced compromise. - - - -.. rubric:: References - -.. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., - Friedman J., Algorithm 3.2, p. 74, 2008. - """ - +# %% import matplotlib.pyplot as plt from sklearn.datasets import load_diabetes from sklearn.linear_model import enet_path, lars_path, lasso_path +# %% +# The Diabetes Dataset +# ------------------ +# +# We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot +# the regression coefficients for Lasso and Elastic Net. + X, y = load_diabetes(return_X_y=True) X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) -# Compute paths +# %% +# Scikit-learn provides the following functions to compute multiple +# :math:`w` values for various :math:`\\alpha` values efficiently: +# +# - :func:`~sklearn.linear_model.lasso_path` +# - :func:`~sklearn.linear_model.lars_path` +# - :func:`~sklearn.linear_model.enet_path` +# eps = 5e-3 # the smaller it is the longer is the path -print("Computing regularization path using the lasso...") alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) -print("Computing regularization path using the positive lasso...") -alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path( - X, y, eps=eps, positive=True -) - -print("Computing regularization path using the LARS...") -alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") - -print("Computing regularization path using the positive LARS...") -alphas_positive_lars, _, coefs_positive_lars = lars_path( - X, y, method="lasso", positive=True -) - -print("Computing regularization path using the elastic net...") alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.8) -print("Computing regularization path using the positive elastic net...") -alphas_positive_enet, coefs_positive_enet, _ = enet_path( - X, y, eps=eps, l1_ratio=0.8, positive=True -) +# %% +# The :func:`~sklearn.linear_model.lasso_path` and +# :func:`~sklearn.linear_model.enet_path` functions compute +# :math:`w` with coordinate decent: for each entry of :math:`w`, +# the function solves for it optimal value while keeping the others +# fixed. Since the algorithm iterates until convergence, +# Lasso doesn't operate in a fixed number of steps based solely +# on the dataset's size, which can make it take longer to run. +# In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps. -# Display results +alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") + +# %% +# The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm +# (see [1]_) to compute the Lasso solution in +# :math:`\\min \\left\\{ +# n_{\\operatorname{sample}}-1,n_{\\operatorname{feature}} +# \\right\\}` +# steps. 
This provides an efficient algorithm for computing the entire Lasso path, and +# is implemented as :func:`~sklearn.linear_model.LassoLars` +# and :func:`~sklearn.linear_model.lars_path`. +# +# We now present the visualisation of the regularization path for the diabetes dataset. +# Each model is represented by 10 curves, corresponding to the number of features in the +# dataset. Each curve shows how a particular coefficient :math:`w_i` changes as +# :math:`\\alpha` increases. +# +# Lasso vs Lasso-LARS +# ------------------- +# In the "Lasso vs LARS Paths" visual, +# +# .. _marginal_dependencies: plt.figure(1) for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): @@ -155,6 +130,16 @@ plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") plt.axis("tight") +# %% +# the Lasso and LARS paths appear identical because both models solve +# the same constrained problem. However, LARS reaches the solution faster than Lasso. +# +# Lasso vs Elastic-Net +# -------------------- +# The "Lasso vs Elastic-Net Paths" visual is more notable. +# +# .. _marginal_dependencies: + plt.figure(2) for coef_l, coef_e in zip(coefs_lasso, coefs_enet): l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") @@ -166,38 +151,14 @@ plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") plt.axis("tight") -plt.figure(3) -for coef_l, coef_pl in zip(coefs_lasso, coefs_positive_lasso): - l1 = plt.semilogy(alphas_lasso, coef_l, c="#0072B2") - l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs Positive Lasso Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "positive Lasso"), loc="lower right") -plt.axis("tight") - -plt.figure(4) -for coef_lars, coef_positive_lars in zip(coefs_lars, coefs_positive_lars): - l1 = plt.semilogx(alphas_lars, coef_lars, c="#0072B2") - l2 = plt.semilogx( - alphas_positive_lars, coef_positive_lars, linestyle="--", c="#D55E00" - ) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("LARS vs Positive LARS Paths") -plt.legend((l1[-1], l2[-1]), ("LARS", "Positive LARS"), loc="lower right") -plt.axis("tight") - -plt.figure(5) -for coef_e, coef_pe in zip(coefs_enet, coefs_positive_enet): - l1 = plt.semilogx(alphas_enet, coef_e, c="#0072B2") - l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Elastic-Net vs Positive Elastic-Net Paths") -plt.legend((l1[-1], l2[-1]), ("Elastic-Net", "positive Elastic-Net"), loc="lower right") -plt.axis("tight") -plt.show() +# %% +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. +# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the +# end. This demonstrates how the :math:`\\ell^1`-norm constraint encourages sparsity in +# the solution, while combining it with the :math:`\\ell^2`-norm provides a balanced +# compromise. +# +# .. rubric:: References +# +# .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., +# Friedman J., Algorithm 3.2, p. 74, 2008. From b24673db4883f663fea807629c3d303884b261c8 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 16:00:40 -0700 Subject: [PATCH 06/13] Fix styling. 
--- .../plot_lasso_lasso_lars_elasticnet_path.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index d554eb4310675..a74ce42a46e1d 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -63,7 +63,7 @@ # %% # The Diabetes Dataset -# ------------------ +# -------------------- # # We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot # the regression coefficients for Lasso and Elastic Net. @@ -73,7 +73,7 @@ # %% # Scikit-learn provides the following functions to compute multiple -# :math:`w` values for various :math:`\\alpha` values efficiently: +# :math:`w` values for various :math:`\alpha` values efficiently: # # - :func:`~sklearn.linear_model.lasso_path` # - :func:`~sklearn.linear_model.lars_path` @@ -101,9 +101,9 @@ # %% # The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm # (see [1]_) to compute the Lasso solution in -# :math:`\\min \\left\\{ -# n_{\\operatorname{sample}}-1,n_{\\operatorname{feature}} -# \\right\\}` +# :math:`\min \left\{ +# n_{\operatorname{sample}}-1,n_{\operatorname{feature}} +# \right\}` # steps. This provides an efficient algorithm for computing the entire Lasso path, and # is implemented as :func:`~sklearn.linear_model.LassoLars` # and :func:`~sklearn.linear_model.lars_path`. @@ -111,13 +111,11 @@ # We now present the visualisation of the regularization path for the diabetes dataset. # Each model is represented by 10 curves, corresponding to the number of features in the # dataset. Each curve shows how a particular coefficient :math:`w_i` changes as -# :math:`\\alpha` increases. +# :math:`\alpha` increases. # # Lasso vs Lasso-LARS # ------------------- # In the "Lasso vs LARS Paths" visual, -# -# .. _marginal_dependencies: plt.figure(1) for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): @@ -129,6 +127,7 @@ plt.title("Lasso vs LARS Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") plt.axis("tight") +_ = plt.show() # %% # the Lasso and LARS paths appear identical because both models solve @@ -137,8 +136,6 @@ # Lasso vs Elastic-Net # -------------------- # The "Lasso vs Elastic-Net Paths" visual is more notable. -# -# .. _marginal_dependencies: plt.figure(2) for coef_l, coef_e in zip(coefs_lasso, coefs_enet): @@ -150,12 +147,13 @@ plt.title("Lasso vs Elastic-Net Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") plt.axis("tight") +_ = plt.show() # %% # Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. # Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the -# end. This demonstrates how the :math:`\\ell^1`-norm constraint encourages sparsity in -# the solution, while combining it with the :math:`\\ell^2`-norm provides a balanced +# end. This demonstrates how the :math:`\ell^1`-norm constraint encourages sparsity in +# the solution, while combining it with the :math:`\ell^2`-norm provides a balanced # compromise. # # .. rubric:: References From 7fe7e30ba059594bd96076fb1b5f153b7d61f026 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 17:05:56 -0700 Subject: [PATCH 07/13] Fix styling. 
--- examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index a74ce42a46e1d..5ab24db441e74 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -18,7 +18,7 @@ - :func:`~sklearn.linear_model.Lasso` - :func:`~sklearn.linear_model.LassoLars` -- :func:`~sklearn.linear_model.ElasticNet` with default `l1_ratio=0.5` +- :func:`~sklearn.linear_model.ElasticNet` Mathematically, these are formulated by minimising the constrained least-squares penalty: From f2d2460591652bc53426179e2b01da7c8fcac1d7 Mon Sep 17 00:00:00 2001 From: virchan Date: Tue, 8 Oct 2024 17:33:50 -0700 Subject: [PATCH 08/13] Fix `l1_ratio`. --- .../linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 5ab24db441e74..6bccdd00dd9e6 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -18,7 +18,7 @@ - :func:`~sklearn.linear_model.Lasso` - :func:`~sklearn.linear_model.LassoLars` -- :func:`~sklearn.linear_model.ElasticNet` +- :func:`~sklearn.linear_model.ElasticNet` with `l1_ratio=0.8` Mathematically, these are formulated by minimising the constrained least-squares penalty: @@ -30,8 +30,8 @@ \\left\\{ \\begin{array}{cl} \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ - \\frac{\\alpha}{2} \\vert \\vert w \\vert \\vert_1 + - \\frac{\\alpha}{4} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ + \\frac{4\\alpha}{5} \\vert \\vert w \\vert \\vert_1 + + \\frac{\\alpha}{10} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ \\end{array} \\right. @@ -77,7 +77,7 @@ # # - :func:`~sklearn.linear_model.lasso_path` # - :func:`~sklearn.linear_model.lars_path` -# - :func:`~sklearn.linear_model.enet_path` +# - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.8` # eps = 5e-3 # the smaller it is the longer is the path From a2e31f0443a81038de034b9182e5325b363c0842 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 18 Nov 2024 23:45:53 +0800 Subject: [PATCH 09/13] Fix typos. --- .../linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 6bccdd00dd9e6..d55d2e921dff6 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -6,7 +6,7 @@ Lasso, Lasso-LARS, and Elastic Net paths ======================================== -This example shows how to compute the "paths" of coefficients along the Lasso, +This example shows how to compute the Lasso, Lasso-LARS, and Elastic Net regularization paths. It illustrates the relationship between the regularization parameter :math:`\\alpha` and the coefficients :math:`w`. 
@@ -65,7 +65,7 @@ # The Diabetes Dataset # -------------------- # -# We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot +# We use the :func:`diabetes dataset ` to plot # the regression coefficients for Lasso and Elastic Net. X, y = load_diabetes(return_X_y=True) @@ -108,7 +108,7 @@ # is implemented as :func:`~sklearn.linear_model.LassoLars` # and :func:`~sklearn.linear_model.lars_path`. # -# We now present the visualisation of the regularization path for the diabetes dataset. +# We now present the visualisation of the regularization paths for the diabetes dataset. # Each model is represented by 10 curves, corresponding to the number of features in the # dataset. Each curve shows how a particular coefficient :math:`w_i` changes as # :math:`\alpha` increases. From 5bcfaf8f356378150551f62b1b861d672d3474e0 Mon Sep 17 00:00:00 2001 From: virchan Date: Wed, 12 Feb 2025 21:19:35 -0800 Subject: [PATCH 10/13] update 'plot_lasso_lasso_lars_elasticnet_path.py' --- .../plot_lasso_lasso_lars_elasticnet_path.py | 503 ++++++++++++++---- 1 file changed, 401 insertions(+), 102 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index d55d2e921dff6..0fe8534e0fd78 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -2,76 +2,153 @@ # SPDX-License-Identifier: BSD-3-Clause """ -======================================== -Lasso, Lasso-LARS, and Elastic Net paths -======================================== - -This example shows how to compute the Lasso, -Lasso-LARS, and Elastic Net regularization paths. It illustrates the -relationship between the regularization parameter :math:`\\alpha` -and the coefficients :math:`w`. - -When performing linear regression on a given dataset -:math:`(X, y)`, regularization terms can be added to -control the model's complexity. -Scikit-learn provides the following regularization techniques: - -- :func:`~sklearn.linear_model.Lasso` -- :func:`~sklearn.linear_model.LassoLars` -- :func:`~sklearn.linear_model.ElasticNet` with `l1_ratio=0.8` - -Mathematically, these are formulated by minimising the constrained -least-squares penalty: - -.. math:: - - \\min_{w} \\frac{1}{2n_{\\operatorname{sample}}} - \\vert \\vert Xw - y \\vert \\vert^2_2 + - \\left\\{ - \\begin{array}{cl} - \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ - \\frac{4\\alpha}{5} \\vert \\vert w \\vert \\vert_1 + - \\frac{\\alpha}{10} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ - \\end{array} - \\right. - -Thus, the Lasso model includes the :math:`\\ell^1`-norm of the regression -coefficients in the penalty, while the Elastic Net model -incorporates both :math:`\\ell^1`- and :math:`\\ell^2`-norms. - -Any solution to this optimisation problem depends on :math:`\\alpha`. -For example, in Lasso, a large :math:`\\alpha` forces the least-squares -penalty to stay small, which in turn keeps the norm -:math:`\\vert \\vert w \\vert \\vert_1` -small. Conversely, a smaller :math:`\\alpha` allows the norm -:math:`\\vert \\vert w \\vert \\vert_1` -to grow larger. - -This suggests that the regression coefficients :math:`w` evolve as -:math:`\\alpha` increases, and we are interested in knowing -:math:`w` across a range of :math:`\\alpha` values. 
This is known -as the **regularization path**: a list of :math:`w` values corresponding to -different :math:`\\alpha` values, ranging from small to large. -In this example, we plot the regularization paths to show how the sizes of the -coefficients change as the regularization parameter increases. +================================================= +Regularization in Linear and Logistic Regressions +================================================= + +This example explores regularization techniques for linear and logistic regression +in both regression and classification tasks. It demonstrates how the +regularization parameter :math:`\\alpha` can be adjusted to control the complexity +of the trained coefficients :math:`w` and reduce overfitting. """ -# %% + import matplotlib.pyplot as plt +import numpy as np -from sklearn.datasets import load_diabetes -from sklearn.linear_model import enet_path, lars_path, lasso_path +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model import ( + LogisticRegression, + Ridge, + enet_path, + lars_path, + lasso_path, +) +from sklearn.metrics import mean_squared_error +from sklearn.svm import l1_min_c # %% -# The Diabetes Dataset -# -------------------- # -# We use the :func:`diabetes dataset ` to plot -# the regression coefficients for Lasso and Elastic Net. +# Regularization in Linear Regression +# ----------------------------------- +# +# When performing linear regression on a given dataset +# :math:`(X, y)`, regularization terms can be added to +# control the model's complexity and mitigate overfitting. +# Scikit-learn provides the following regularization techniques: +# +# - :func:`~sklearn.linear_model.Lasso` +# - :func:`~sklearn.linear_model.LassoLars` +# - :func:`~sklearn.linear_model.Ridge` +# - :func:`~sklearn.linear_model.ElasticNet` (with `l1_ratio=0.8` +# in this example) +# +# Mathematically, these are formulated by minimizing the constrained +# least-squares penalty: +# +# .. math:: +# +# \min_{w} \frac{1}{2n_{\operatorname{sample}}} +# \Vert Xw - y \Vert^2_2 + +# \left\{ +# \begin{array}{cl} +# \alpha \Vert w \Vert_1 & \mbox{Lasso(-LARS)} \\ +# \alpha \Vert w \Vert_2^2 & \mbox{Ridge} \\ +# \frac{4\alpha}{5} \Vert w \Vert_1 + +# \frac{\alpha}{10} \Vert w \Vert^2_2& \mbox{Elastic Net} \\ +# \end{array} +# \right. +# +# Thus, the Lasso model (resp. Ridge model) includes the :math:`\ell^1`-norm +# (resp. :math:`\ell^2`-norm) of the regression +# coefficients in the penalty, while the Elastic Net model +# incorporates both :math:`\ell^1`- and :math:`\ell^2`-norms. +# +# We can interpret the :math:`\ell^p`-norms as minimising the least squares penalty +# under different geometries. This is illustrated by plotting the unit circles +# +# .. math:: +# +# \left\{ +# \begin{array}{cl} +# \Vert w \Vert_1 &=1 \\ +# \Vert w \Vert_2^2 &=1 \\ +# 0.8 \Vert w \Vert_1 + 0.1 \Vert w \Vert_2^2 &= 1 \\ +# \end{array} +# \right. 
+# +# in :math:`\mathbb{R}^2`: +line = np.linspace(-1.2, 1.2, 1001) +xx, yy = np.meshgrid(line, line) -X, y = load_diabetes(return_X_y=True) -X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) +l1 = np.abs(xx) + np.abs(yy) +l2 = xx**2 + yy**2 +elastic_net = 0.8 * l1 + 0.1 * l2 + +plt.figure() +ax = plt.gca() + +l1_contour = plt.contour(xx, yy, l1, levels=[1], colors="#0072B2") +l2_contour = plt.contour(xx, yy, l2, levels=[1], colors="#D55E00") +elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1], colors="#009E73") + +ax.set_aspect("equal") +ax.spines["left"].set_position("center") +ax.spines["right"].set_color("none") +ax.spines["bottom"].set_position("center") +ax.spines["top"].set_color("none") + +plt.clabel(l1_contour, inline=1, fmt={1.0: r"$\ell^1$"}, manual=[(-1, -1)]) +plt.clabel(l2_contour, inline=1, fmt={1.0: r"$\ell^2$"}, manual=[(-1, -1)]) +plt.clabel( + elastic_net_contour, + inline=1, + fmt={1.0: "Elastic Net"}, + manual=[(1, -1)], +) + +plt.title(r"Unit Circles in $\mathbb{R}^2$") + +plt.tight_layout() +_ = plt.show() # %% +# Algebraically, any solution to this optimization problem depends +# on :math:`\alpha`. For example, in Lasso, a large :math:`\alpha` forces +# the least-squares penalty to stay small, which in turn keeps the norm +# :math:`\Vert w \Vert_1` +# small. Conversely, a smaller :math:`\alpha` allows the norm +# :math:`\Vert w \Vert_1` +# to grow larger. +# +# This suggests that the regression coefficients :math:`w` evolve as +# :math:`\alpha` increases, and we are interested in knowing +# :math:`w` across a range of :math:`\alpha` values. This is known +# as the **regularization path**: a list of :math:`w` values corresponding to +# different :math:`\alpha` values, ranging from small to large. +# +# In this example, we plot the regularization paths to show how the magnitudes of +# the coefficients change as the regularization parameter :math:`\alpha` increases. +# This demonstrates how model complexity varies with :math:`\alpha`. We then compare +# the trained coefficients with the true coefficients used to generate the training set, +# illustrating how regularization helps mitigate overfitting. +# +# Creating a Noise-free Regression Dataset +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# We generate a toy dataset with 400 samples and 10 features, suitable for +# regression analysis. Since the data is noise-free in this example, +# we can expect our regression model to recover the true coefficients `w` exactly. 
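+#
+# As a quick, illustrative check of this claim (on a small, separate toy dataset,
+# not the one created below), an unregularized least-squares fit on noise-free
+# data recovers the generating coefficients up to numerical precision. The names
+# `X_check`, `y_check` and `w_check` are used only for this aside; `np` and
+# `make_regression` come from the imports at the top of the example.
+
+from sklearn.linear_model import LinearRegression
+
+X_check, y_check, w_check = make_regression(
+    n_samples=50, n_features=5, n_informative=5, coef=True, random_state=0
+)
+print(np.allclose(LinearRegression().fit(X_check, y_check).coef_, w_check))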
+ +X, y, w = make_regression(n_samples=400, n_features=10, coef=True, random_state=42) + +# %% +# +# Impact of Regularization Parameter on Model Complexity +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Lasso(-LARS) and Elastic Net Models +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# # Scikit-learn provides the following functions to compute multiple # :math:`w` values for various :math:`\alpha` values efficiently: # @@ -80,7 +157,7 @@ # - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.8` # -eps = 5e-3 # the smaller it is the longer is the path +eps = 3e-4 # the smaller it is the longer is the path alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) @@ -89,7 +166,7 @@ # %% # The :func:`~sklearn.linear_model.lasso_path` and # :func:`~sklearn.linear_model.enet_path` functions compute -# :math:`w` with coordinate decent: for each entry of :math:`w`, +# :math:`w` with **coordinate decent**: for each entry of :math:`w`, # the function solves for it optimal value while keeping the others # fixed. Since the algorithm iterates until convergence, # Lasso doesn't operate in a fixed number of steps based solely @@ -99,8 +176,8 @@ alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") # %% -# The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm -# (see [1]_) to compute the Lasso solution in +# The Lasso-LARS model uses the **Least Angle Regression (LARS)** algorithm +# (see [1]_ Algorithm 3.2 on page 74) to compute the Lasso solution in # :math:`\min \left\{ # n_{\operatorname{sample}}-1,n_{\operatorname{feature}} # \right\}` @@ -108,55 +185,277 @@ # is implemented as :func:`~sklearn.linear_model.LassoLars` # and :func:`~sklearn.linear_model.lars_path`. # -# We now present the visualisation of the regularization paths for the diabetes dataset. +# Ridge Model +# ~~~~~~~~~~~ +# +# Next, we compute the coefficients for the Ridge model using the :math:`\alpha` +# from Elastic Net: + +coefs_ridge = [] +for a in alphas_enet: + ridge = Ridge(alpha=a) + ridge.fit(X, y) + coefs_ridge.append(ridge.coef_) + +coefs_ridge = np.asarray(coefs_ridge) + +# %% +# Plotting the Regularization Paths +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We now present the visualization of the regularization paths for the dataset. # Each model is represented by 10 curves, corresponding to the number of features in the # dataset. Each curve shows how a particular coefficient :math:`w_i` changes as # :math:`\alpha` increases. 
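+
+# A rough check of the step-count claim above: `alphas_lars` holds the alpha value
+# at each breakpoint of the computed path, so its length should be close to
+# min(n_samples - 1, n_features) + 1 (the exact count can differ slightly when
+# features enter and leave the active set).
+print(len(alphas_lars), "breakpoints on the LARS path for", X.shape[1], "features")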
-# -# Lasso vs Lasso-LARS -# ------------------- -# In the "Lasso vs LARS Paths" visual, -plt.figure(1) -for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c="#0072B2") - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c="#D55E00") +model_names = ["Lasso", "Lasso-LARS", "Elastic Net", "Ridge"] -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs LARS Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") -plt.axis("tight") +model_coefficients = [coefs_lasso.T, coefs_lars.T, coefs_enet.T, coefs_ridge] + +model_alphas = [alphas_lasso, alphas_lars, alphas_enet, alphas_enet] + +fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(25, 10)) + +for i in range(len(model_names)): + for j in range(len(model_names)): + if i == j: + axes[i, i].semilogx(model_alphas[i], model_coefficients[i], c="#0072B2") + + axes[i, i].set_title(f"{model_names[i]} Paths", fontsize=14) + + elif j < i: + l1 = axes[i, j].semilogx( + model_alphas[i], model_coefficients[i], c="#0072B2" + ) + + l2 = axes[i, j].semilogx( + model_alphas[j], model_coefficients[j], linestyle="--", c="#D55E00" + ) + + axes[i, j].set_title( + f"{model_names[j]} vs {model_names[i]} Paths", fontsize=14 + ) + + axes[i, j].legend( + (l1[-1], l2[-1]), + (f"{model_names[i]}", f"{model_names[j]}"), + loc="upper right", + ) + + else: + fig.delaxes(axes[i, j]) + +fig.supxlabel(r"$\alpha$", fontsize=18) +fig.supylabel("Coefficients", fontsize=18) + +fig.suptitle( + "Comparing Regularization Paths: Lasso(-LARS), Ridge, and Elastic Net", fontsize=20 +) + +fig.tight_layout(pad=3.0) _ = plt.show() # %% -# the Lasso and LARS paths appear identical because both models solve -# the same constrained problem. However, LARS reaches the solution faster than Lasso. -# -# Lasso vs Elastic-Net -# -------------------- -# The "Lasso vs Elastic-Net Paths" visual is more notable. - -plt.figure(2) -for coef_l, coef_e in zip(coefs_lasso, coefs_enet): - l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs Elastic-Net Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") +# +# * In the "Lasso vs Lasso-LARS Paths" visual, +# the Lasso and Lasso-LARS paths appear identical towards the end +# because both models solve the same constrained problem. +# However, Lasso-LARS reaches the solution faster than Lasso. +# +# * The "Lasso vs Elastic-Net Paths" visual is more notable. +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. +# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards +# the end. +# +# * In the "Lasso(-LARS) vs Ridge Paths" and "Elastic Net vs Ridge Paths" visuals, the +# Ridge model focuses on shrinking all coefficients uniformly, rather than setting +# some to exactly zero. As a result, the Ridge model retains all features after +# training, unlike the Lasso(-LARS) or Elastic Net models. +# +# This demonstrates how different regularization techniques govern +# the model's complexity: +# +# 1. the :math:`\ell^1`-norm constraint encourages sparsity in the solution. +# +# 2. the :math:`\ell^2`-norm constraint focuses on shrinkage of the magnitude +# of the solution. +# +# 3. the Elastic Net constraint provides a balanced compromise. 
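+#
+# A compact, illustrative way to see these three behaviours is to count non-zero
+# coefficients after fitting each model at a common, moderately strong alpha on a
+# small toy dataset in which only 3 of 10 features carry signal (this dataset and
+# the names below are used only for this aside). Ridge typically keeps every
+# coefficient non-zero, while the :math:`\ell^1`-penalized models tend to zero out
+# the uninformative features.
+
+from sklearn.linear_model import ElasticNet, Lasso
+
+X_toy, y_toy = make_regression(
+    n_samples=400, n_features=10, n_informative=3, noise=10.0, random_state=0
+)
+for model in (
+    Lasso(alpha=2.0),
+    ElasticNet(alpha=2.0, l1_ratio=0.8),
+    Ridge(alpha=2.0),
+):
+    n_nonzero = np.count_nonzero(model.fit(X_toy, y_toy).coef_)
+    print(f"{model.__class__.__name__}: {n_nonzero} non-zero coefficients")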
+# +# Mitigating Overfitting with Regularization +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Recall that the true coefficient `w` refers to the coefficients of the linear model +# used to generate the training dataset. In this section, we compare the trained +# coefficients of Lasso(-LARS), Ridge, and Elastic Net with `w` to demonstrate how +# regularization can mitigate overfitting. This is achieved by computing the +# :func:`~sklearn.metrics.mean_squared_error` (MSE) between the true and trained +# coefficients. + +lasso_mse, lars_mse, enet_mse, ridge_mse = [], [], [], [] + +for coef_lasso, coef_enet, coef_ridge in zip(coefs_lasso.T, coefs_enet.T, coefs_ridge): + lasso_mse.append(mean_squared_error(coef_lasso, w)) + enet_mse.append(mean_squared_error(coef_enet, w)) + ridge_mse.append(mean_squared_error(coef_ridge, w)) + +for coef_lars in coefs_lars.T: + lars_mse.append(mean_squared_error(coef_lars, w)) + +lasso_mse = np.asarray(lasso_mse) +lars_mse = np.asarray(lars_mse) +enet_mse = np.asarray(enet_mse) +ridge_mse = np.asarray(ridge_mse) + +# %% +# +# The idea is that a smaller MSE between the true and trained coefficients implies +# greater similarity between the coefficients. Thus, if the MSE is small, the +# trained model captures the underlying pattern of the training data well. +# However, this can also indicate that the trained model may not perform well on +# generalised data, as the pattern may not hold for unseen data. +# This is essentially the overfitting problem. +# +# The following visualization demonstrates how the MSE changes for different trained +# models as the regularization parameter :math:`\alpha` increases. + +plt.figure() +l1 = plt.semilogx(alphas_lasso, lasso_mse.T, c="#0072B2") +l2 = plt.semilogx(alphas_lars, lars_mse.T, c="#D55E00") +l3 = plt.semilogx(alphas_enet, enet_mse.T, c="#009E73") +l4 = plt.semilogx(alphas_enet, ridge_mse, c="#F0E442") + +plt.xlabel(r"$\alpha$") +plt.ylabel("Mean Squared Error") +plt.title("Coefficient Error Across Regularization Strengths") +plt.legend( + (l1[-1], l2[-1], l3[-1], l4[-1]), + ("Lasso", "LARS", "Elastic Net", "Ridge"), + loc="upper left", +) + plt.axis("tight") _ = plt.show() # %% -# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. -# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the -# end. This demonstrates how the :math:`\ell^1`-norm constraint encourages sparsity in -# the solution, while combining it with the :math:`\ell^2`-norm provides a balanced -# compromise. +# +# In the visualization, for small values of :math:`\alpha`, since our synthetic data is +# noise-free, the trained coefficients of Lasso(-LARS), Ridge, and Elastic Net are +# similar to the true coefficient `w` (with MSE close to 0). This indicates that the +# models capture the intricate details of the training data well. +# +# As :math:`\alpha` increases, the MSE also increases. This improves the models' ability +# to generalise to unseen data (e.g., if the data were noisy), but it also risks +# degrading model performance if the regularization becomes too strong. +# +# Regularization in Logistic Regression +# ------------------------------------- +# +# Regularization can also be applied to Logistic Regression when working on +# classification tasks. 
scikit-learn's :func:`~sklearn.linear_model.LogisticRegression` +# enables users to apply regularization using the `penalty` parameter: +# +# * `l1`: :math:`\ell^1`-regularization, similar to the Lasso model +# * `l2`: :math:`\ell^2`-regularization, similar to the Ridge model +# * `elasticnet`: Combined with the `l1_ratio` parameter for a mix of :math:`\ell^1` +# and :math:`\ell^2` +# +# Additionally, the `C` parameter controls the inverse of the regularization strength. +# Smaller values of `C` apply stronger regularization. +# +# We demonstrate the effect of regularization by creating a synthetic classification +# dataset. +# + +X, y = make_classification( + n_samples=400, + n_features=64, + n_informative=64, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=42, +) + +# %% +# +# In this synthetic binary classification dataset, there are 400 samples, +# each with 64 features. This toy dataset is noise-free to maintain consistency with +# our earlier regression example. +# +# As noted in the regression example, :math:`\ell^1`-regularization may set some +# coefficients exactly to zero. For extreme values of `C`, the trained coefficients +# may even become the zero vector. To address this, scikit-learn provides the +# :func:`~sklearn.svm.l1_min_c` function, which computes the minimum value of the +# regularization strength `C` at which the model begins to learn meaningful patterns +# (i.e., some coefficients become non-zero). +# + +cs = l1_min_c(X, y, loss="log") * np.logspace(0, 10, 16) + +# %% +# +# We now plot blah-blah-blah +# + +l1_ratio = 0.8 # L1 weight in the Elastic-Net regularization + +fig, axes = plt.subplots(3, 3) + +# Set regularization parameter +for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): + # Increase tolerance for short training time + clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") + clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") + clf_en_LR = LogisticRegression( + C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 + ) + clf_l1_LR.fit(X, y) + clf_l2_LR.fit(X, y) + clf_en_LR.fit(X, y) + + coef_l1_LR = clf_l1_LR.coef_.ravel() + coef_l2_LR = clf_l2_LR.coef_.ravel() + coef_en_LR = clf_en_LR.coef_.ravel() + + # coef_l1_LR contains zeros due to the + # L1 sparsity inducing norm + + sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 + sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 + sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 + + print(f"C={C:.2f}") + print(f"{'Sparsity with L1 penalty:':<40} {sparsity_l1_LR:.2f}%") + print(f"{'Sparsity with Elastic-Net penalty:':<40} {sparsity_en_LR:.2f}%") + print(f"{'Sparsity with L2 penalty:':<40} {sparsity_l2_LR:.2f}%") + print(f"{'Score with L1 penalty:':<40} {clf_l1_LR.score(X, y):.2f}") + print(f"{'Score with Elastic-Net penalty:':<40} {clf_en_LR.score(X, y):.2f}") + print(f"{'Score with L2 penalty:':<40} {clf_l2_LR.score(X, y):.2f}") + + if i == 0: + axes_row[0].set_title(r"$\ell^1$ penalty") + axes_row[1].set_title(f"Elastic-Net\n {l1_ratio = }") + axes_row[2].set_title(r"$\ell^2$ penalty") + + for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): + ax.imshow( + np.abs(coefs.reshape(8, 8)), + interpolation="nearest", + cmap="binary", + vmax=1, + vmin=0, + ) + ax.set_xticks(()) + ax.set_yticks(()) + + axes_row[0].set_ylabel(f"C = {C:.2f}") + +_ = plt.show() +# %% # # .. rubric:: References # -# .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., -# Friedman J., Algorithm 3.2, p. 
74, 2008. +# .. [1] Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical +# Learning: Data Mining, Inference, and Prediction. New York, +# NY: Springer New York. From add4294969f4f8f7ad14d1ca3e251e11e287586b Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 3 Mar 2025 18:42:10 -0800 Subject: [PATCH 11/13] Revert "update 'plot_lasso_lasso_lars_elasticnet_path.py'" This reverts commit 5bcfaf8f356378150551f62b1b861d672d3474e0. --- .../plot_lasso_lasso_lars_elasticnet_path.py | 503 ++++-------------- 1 file changed, 102 insertions(+), 401 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 0fe8534e0fd78..d55d2e921dff6 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -2,153 +2,76 @@ # SPDX-License-Identifier: BSD-3-Clause """ -================================================= -Regularization in Linear and Logistic Regressions -================================================= - -This example explores regularization techniques for linear and logistic regression -in both regression and classification tasks. It demonstrates how the -regularization parameter :math:`\\alpha` can be adjusted to control the complexity -of the trained coefficients :math:`w` and reduce overfitting. +======================================== +Lasso, Lasso-LARS, and Elastic Net paths +======================================== + +This example shows how to compute the Lasso, +Lasso-LARS, and Elastic Net regularization paths. It illustrates the +relationship between the regularization parameter :math:`\\alpha` +and the coefficients :math:`w`. + +When performing linear regression on a given dataset +:math:`(X, y)`, regularization terms can be added to +control the model's complexity. +Scikit-learn provides the following regularization techniques: + +- :func:`~sklearn.linear_model.Lasso` +- :func:`~sklearn.linear_model.LassoLars` +- :func:`~sklearn.linear_model.ElasticNet` with `l1_ratio=0.8` + +Mathematically, these are formulated by minimising the constrained +least-squares penalty: + +.. math:: + + \\min_{w} \\frac{1}{2n_{\\operatorname{sample}}} + \\vert \\vert Xw - y \\vert \\vert^2_2 + + \\left\\{ + \\begin{array}{cl} + \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ + \\frac{4\\alpha}{5} \\vert \\vert w \\vert \\vert_1 + + \\frac{\\alpha}{10} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ + \\end{array} + \\right. + +Thus, the Lasso model includes the :math:`\\ell^1`-norm of the regression +coefficients in the penalty, while the Elastic Net model +incorporates both :math:`\\ell^1`- and :math:`\\ell^2`-norms. + +Any solution to this optimisation problem depends on :math:`\\alpha`. +For example, in Lasso, a large :math:`\\alpha` forces the least-squares +penalty to stay small, which in turn keeps the norm +:math:`\\vert \\vert w \\vert \\vert_1` +small. Conversely, a smaller :math:`\\alpha` allows the norm +:math:`\\vert \\vert w \\vert \\vert_1` +to grow larger. + +This suggests that the regression coefficients :math:`w` evolve as +:math:`\\alpha` increases, and we are interested in knowing +:math:`w` across a range of :math:`\\alpha` values. This is known +as the **regularization path**: a list of :math:`w` values corresponding to +different :math:`\\alpha` values, ranging from small to large. 
+In this example, we plot the regularization paths to show how the sizes of the +coefficients change as the regularization parameter increases. """ - +# %% import matplotlib.pyplot as plt -import numpy as np -from sklearn.datasets import make_classification, make_regression -from sklearn.linear_model import ( - LogisticRegression, - Ridge, - enet_path, - lars_path, - lasso_path, -) -from sklearn.metrics import mean_squared_error -from sklearn.svm import l1_min_c +from sklearn.datasets import load_diabetes +from sklearn.linear_model import enet_path, lars_path, lasso_path # %% +# The Diabetes Dataset +# -------------------- # -# Regularization in Linear Regression -# ----------------------------------- -# -# When performing linear regression on a given dataset -# :math:`(X, y)`, regularization terms can be added to -# control the model's complexity and mitigate overfitting. -# Scikit-learn provides the following regularization techniques: -# -# - :func:`~sklearn.linear_model.Lasso` -# - :func:`~sklearn.linear_model.LassoLars` -# - :func:`~sklearn.linear_model.Ridge` -# - :func:`~sklearn.linear_model.ElasticNet` (with `l1_ratio=0.8` -# in this example) -# -# Mathematically, these are formulated by minimizing the constrained -# least-squares penalty: -# -# .. math:: -# -# \min_{w} \frac{1}{2n_{\operatorname{sample}}} -# \Vert Xw - y \Vert^2_2 + -# \left\{ -# \begin{array}{cl} -# \alpha \Vert w \Vert_1 & \mbox{Lasso(-LARS)} \\ -# \alpha \Vert w \Vert_2^2 & \mbox{Ridge} \\ -# \frac{4\alpha}{5} \Vert w \Vert_1 + -# \frac{\alpha}{10} \Vert w \Vert^2_2& \mbox{Elastic Net} \\ -# \end{array} -# \right. -# -# Thus, the Lasso model (resp. Ridge model) includes the :math:`\ell^1`-norm -# (resp. :math:`\ell^2`-norm) of the regression -# coefficients in the penalty, while the Elastic Net model -# incorporates both :math:`\ell^1`- and :math:`\ell^2`-norms. -# -# We can interpret the :math:`\ell^p`-norms as minimising the least squares penalty -# under different geometries. This is illustrated by plotting the unit circles -# -# .. math:: -# -# \left\{ -# \begin{array}{cl} -# \Vert w \Vert_1 &=1 \\ -# \Vert w \Vert_2^2 &=1 \\ -# 0.8 \Vert w \Vert_1 + 0.1 \Vert w \Vert_2^2 &= 1 \\ -# \end{array} -# \right. -# -# in :math:`\mathbb{R}^2`: -line = np.linspace(-1.2, 1.2, 1001) -xx, yy = np.meshgrid(line, line) +# We use the :func:`diabetes dataset ` to plot +# the regression coefficients for Lasso and Elastic Net. -l1 = np.abs(xx) + np.abs(yy) -l2 = xx**2 + yy**2 -elastic_net = 0.8 * l1 + 0.1 * l2 - -plt.figure() -ax = plt.gca() - -l1_contour = plt.contour(xx, yy, l1, levels=[1], colors="#0072B2") -l2_contour = plt.contour(xx, yy, l2, levels=[1], colors="#D55E00") -elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1], colors="#009E73") - -ax.set_aspect("equal") -ax.spines["left"].set_position("center") -ax.spines["right"].set_color("none") -ax.spines["bottom"].set_position("center") -ax.spines["top"].set_color("none") - -plt.clabel(l1_contour, inline=1, fmt={1.0: r"$\ell^1$"}, manual=[(-1, -1)]) -plt.clabel(l2_contour, inline=1, fmt={1.0: r"$\ell^2$"}, manual=[(-1, -1)]) -plt.clabel( - elastic_net_contour, - inline=1, - fmt={1.0: "Elastic Net"}, - manual=[(1, -1)], -) - -plt.title(r"Unit Circles in $\mathbb{R}^2$") - -plt.tight_layout() -_ = plt.show() +X, y = load_diabetes(return_X_y=True) +X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) # %% -# Algebraically, any solution to this optimization problem depends -# on :math:`\alpha`. 
For example, in Lasso, a large :math:`\alpha` forces -# the least-squares penalty to stay small, which in turn keeps the norm -# :math:`\Vert w \Vert_1` -# small. Conversely, a smaller :math:`\alpha` allows the norm -# :math:`\Vert w \Vert_1` -# to grow larger. -# -# This suggests that the regression coefficients :math:`w` evolve as -# :math:`\alpha` increases, and we are interested in knowing -# :math:`w` across a range of :math:`\alpha` values. This is known -# as the **regularization path**: a list of :math:`w` values corresponding to -# different :math:`\alpha` values, ranging from small to large. -# -# In this example, we plot the regularization paths to show how the magnitudes of -# the coefficients change as the regularization parameter :math:`\alpha` increases. -# This demonstrates how model complexity varies with :math:`\alpha`. We then compare -# the trained coefficients with the true coefficients used to generate the training set, -# illustrating how regularization helps mitigate overfitting. -# -# Creating a Noise-free Regression Dataset -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We generate a toy dataset with 400 samples and 10 features, suitable for -# regression analysis. Since the data is noise-free in this example, -# we can expect our regression model to recover the true coefficients `w` exactly. - -X, y, w = make_regression(n_samples=400, n_features=10, coef=True, random_state=42) - -# %% -# -# Impact of Regularization Parameter on Model Complexity -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Lasso(-LARS) and Elastic Net Models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# # Scikit-learn provides the following functions to compute multiple # :math:`w` values for various :math:`\alpha` values efficiently: # @@ -157,7 +80,7 @@ # - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.8` # -eps = 3e-4 # the smaller it is the longer is the path +eps = 5e-3 # the smaller it is the longer is the path alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) @@ -166,7 +89,7 @@ # %% # The :func:`~sklearn.linear_model.lasso_path` and # :func:`~sklearn.linear_model.enet_path` functions compute -# :math:`w` with **coordinate decent**: for each entry of :math:`w`, +# :math:`w` with coordinate decent: for each entry of :math:`w`, # the function solves for it optimal value while keeping the others # fixed. Since the algorithm iterates until convergence, # Lasso doesn't operate in a fixed number of steps based solely @@ -176,8 +99,8 @@ alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") # %% -# The Lasso-LARS model uses the **Least Angle Regression (LARS)** algorithm -# (see [1]_ Algorithm 3.2 on page 74) to compute the Lasso solution in +# The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm +# (see [1]_) to compute the Lasso solution in # :math:`\min \left\{ # n_{\operatorname{sample}}-1,n_{\operatorname{feature}} # \right\}` @@ -185,277 +108,55 @@ # is implemented as :func:`~sklearn.linear_model.LassoLars` # and :func:`~sklearn.linear_model.lars_path`. # -# Ridge Model -# ~~~~~~~~~~~ -# -# Next, we compute the coefficients for the Ridge model using the :math:`\alpha` -# from Elastic Net: - -coefs_ridge = [] -for a in alphas_enet: - ridge = Ridge(alpha=a) - ridge.fit(X, y) - coefs_ridge.append(ridge.coef_) - -coefs_ridge = np.asarray(coefs_ridge) - -# %% -# Plotting the Regularization Paths -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We now present the visualization of the regularization paths for the dataset. 
+# We now present the visualisation of the regularization paths for the diabetes dataset. # Each model is represented by 10 curves, corresponding to the number of features in the # dataset. Each curve shows how a particular coefficient :math:`w_i` changes as # :math:`\alpha` increases. - -model_names = ["Lasso", "Lasso-LARS", "Elastic Net", "Ridge"] - -model_coefficients = [coefs_lasso.T, coefs_lars.T, coefs_enet.T, coefs_ridge] - -model_alphas = [alphas_lasso, alphas_lars, alphas_enet, alphas_enet] - -fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(25, 10)) - -for i in range(len(model_names)): - for j in range(len(model_names)): - if i == j: - axes[i, i].semilogx(model_alphas[i], model_coefficients[i], c="#0072B2") - - axes[i, i].set_title(f"{model_names[i]} Paths", fontsize=14) - - elif j < i: - l1 = axes[i, j].semilogx( - model_alphas[i], model_coefficients[i], c="#0072B2" - ) - - l2 = axes[i, j].semilogx( - model_alphas[j], model_coefficients[j], linestyle="--", c="#D55E00" - ) - - axes[i, j].set_title( - f"{model_names[j]} vs {model_names[i]} Paths", fontsize=14 - ) - - axes[i, j].legend( - (l1[-1], l2[-1]), - (f"{model_names[i]}", f"{model_names[j]}"), - loc="upper right", - ) - - else: - fig.delaxes(axes[i, j]) - -fig.supxlabel(r"$\alpha$", fontsize=18) -fig.supylabel("Coefficients", fontsize=18) - -fig.suptitle( - "Comparing Regularization Paths: Lasso(-LARS), Ridge, and Elastic Net", fontsize=20 -) - -fig.tight_layout(pad=3.0) -_ = plt.show() - -# %% -# -# * In the "Lasso vs Lasso-LARS Paths" visual, -# the Lasso and Lasso-LARS paths appear identical towards the end -# because both models solve the same constrained problem. -# However, Lasso-LARS reaches the solution faster than Lasso. -# -# * The "Lasso vs Elastic-Net Paths" visual is more notable. -# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. -# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards -# the end. -# -# * In the "Lasso(-LARS) vs Ridge Paths" and "Elastic Net vs Ridge Paths" visuals, the -# Ridge model focuses on shrinking all coefficients uniformly, rather than setting -# some to exactly zero. As a result, the Ridge model retains all features after -# training, unlike the Lasso(-LARS) or Elastic Net models. -# -# This demonstrates how different regularization techniques govern -# the model's complexity: -# -# 1. the :math:`\ell^1`-norm constraint encourages sparsity in the solution. -# -# 2. the :math:`\ell^2`-norm constraint focuses on shrinkage of the magnitude -# of the solution. -# -# 3. the Elastic Net constraint provides a balanced compromise. -# -# Mitigating Overfitting with Regularization -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Recall that the true coefficient `w` refers to the coefficients of the linear model -# used to generate the training dataset. In this section, we compare the trained -# coefficients of Lasso(-LARS), Ridge, and Elastic Net with `w` to demonstrate how -# regularization can mitigate overfitting. This is achieved by computing the -# :func:`~sklearn.metrics.mean_squared_error` (MSE) between the true and trained -# coefficients. 
- -lasso_mse, lars_mse, enet_mse, ridge_mse = [], [], [], [] - -for coef_lasso, coef_enet, coef_ridge in zip(coefs_lasso.T, coefs_enet.T, coefs_ridge): - lasso_mse.append(mean_squared_error(coef_lasso, w)) - enet_mse.append(mean_squared_error(coef_enet, w)) - ridge_mse.append(mean_squared_error(coef_ridge, w)) - -for coef_lars in coefs_lars.T: - lars_mse.append(mean_squared_error(coef_lars, w)) - -lasso_mse = np.asarray(lasso_mse) -lars_mse = np.asarray(lars_mse) -enet_mse = np.asarray(enet_mse) -ridge_mse = np.asarray(ridge_mse) - -# %% # -# The idea is that a smaller MSE between the true and trained coefficients implies -# greater similarity between the coefficients. Thus, if the MSE is small, the -# trained model captures the underlying pattern of the training data well. -# However, this can also indicate that the trained model may not perform well on -# generalised data, as the pattern may not hold for unseen data. -# This is essentially the overfitting problem. -# -# The following visualization demonstrates how the MSE changes for different trained -# models as the regularization parameter :math:`\alpha` increases. - -plt.figure() -l1 = plt.semilogx(alphas_lasso, lasso_mse.T, c="#0072B2") -l2 = plt.semilogx(alphas_lars, lars_mse.T, c="#D55E00") -l3 = plt.semilogx(alphas_enet, enet_mse.T, c="#009E73") -l4 = plt.semilogx(alphas_enet, ridge_mse, c="#F0E442") +# Lasso vs Lasso-LARS +# ------------------- +# In the "Lasso vs LARS Paths" visual, -plt.xlabel(r"$\alpha$") -plt.ylabel("Mean Squared Error") -plt.title("Coefficient Error Across Regularization Strengths") -plt.legend( - (l1[-1], l2[-1], l3[-1], l4[-1]), - ("Lasso", "LARS", "Elastic Net", "Ridge"), - loc="upper left", -) +plt.figure(1) +for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): + l1 = plt.semilogx(alphas_lasso, coef_lasso, c="#0072B2") + l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c="#D55E00") +plt.xlabel("alpha") +plt.ylabel("coefficients") +plt.title("Lasso vs LARS Paths") +plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") plt.axis("tight") _ = plt.show() # %% -# -# In the visualization, for small values of :math:`\alpha`, since our synthetic data is -# noise-free, the trained coefficients of Lasso(-LARS), Ridge, and Elastic Net are -# similar to the true coefficient `w` (with MSE close to 0). This indicates that the -# models capture the intricate details of the training data well. -# -# As :math:`\alpha` increases, the MSE also increases. This improves the models' ability -# to generalise to unseen data (e.g., if the data were noisy), but it also risks -# degrading model performance if the regularization becomes too strong. -# -# Regularization in Logistic Regression -# ------------------------------------- -# -# Regularization can also be applied to Logistic Regression when working on -# classification tasks. scikit-learn's :func:`~sklearn.linear_model.LogisticRegression` -# enables users to apply regularization using the `penalty` parameter: -# -# * `l1`: :math:`\ell^1`-regularization, similar to the Lasso model -# * `l2`: :math:`\ell^2`-regularization, similar to the Ridge model -# * `elasticnet`: Combined with the `l1_ratio` parameter for a mix of :math:`\ell^1` -# and :math:`\ell^2` -# -# Additionally, the `C` parameter controls the inverse of the regularization strength. -# Smaller values of `C` apply stronger regularization. -# -# We demonstrate the effect of regularization by creating a synthetic classification -# dataset. 
-# - -X, y = make_classification( - n_samples=400, - n_features=64, - n_informative=64, - n_redundant=0, - n_classes=2, - n_clusters_per_class=1, - random_state=42, -) - -# %% -# -# In this synthetic binary classification dataset, there are 400 samples, -# each with 64 features. This toy dataset is noise-free to maintain consistency with -# our earlier regression example. -# -# As noted in the regression example, :math:`\ell^1`-regularization may set some -# coefficients exactly to zero. For extreme values of `C`, the trained coefficients -# may even become the zero vector. To address this, scikit-learn provides the -# :func:`~sklearn.svm.l1_min_c` function, which computes the minimum value of the -# regularization strength `C` at which the model begins to learn meaningful patterns -# (i.e., some coefficients become non-zero). -# - -cs = l1_min_c(X, y, loss="log") * np.logspace(0, 10, 16) - -# %% -# -# We now plot blah-blah-blah -# - -l1_ratio = 0.8 # L1 weight in the Elastic-Net regularization - -fig, axes = plt.subplots(3, 3) - -# Set regularization parameter -for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): - # Increase tolerance for short training time - clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") - clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") - clf_en_LR = LogisticRegression( - C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 - ) - clf_l1_LR.fit(X, y) - clf_l2_LR.fit(X, y) - clf_en_LR.fit(X, y) - - coef_l1_LR = clf_l1_LR.coef_.ravel() - coef_l2_LR = clf_l2_LR.coef_.ravel() - coef_en_LR = clf_en_LR.coef_.ravel() - - # coef_l1_LR contains zeros due to the - # L1 sparsity inducing norm - - sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 - sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 - sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 - - print(f"C={C:.2f}") - print(f"{'Sparsity with L1 penalty:':<40} {sparsity_l1_LR:.2f}%") - print(f"{'Sparsity with Elastic-Net penalty:':<40} {sparsity_en_LR:.2f}%") - print(f"{'Sparsity with L2 penalty:':<40} {sparsity_l2_LR:.2f}%") - print(f"{'Score with L1 penalty:':<40} {clf_l1_LR.score(X, y):.2f}") - print(f"{'Score with Elastic-Net penalty:':<40} {clf_en_LR.score(X, y):.2f}") - print(f"{'Score with L2 penalty:':<40} {clf_l2_LR.score(X, y):.2f}") - - if i == 0: - axes_row[0].set_title(r"$\ell^1$ penalty") - axes_row[1].set_title(f"Elastic-Net\n {l1_ratio = }") - axes_row[2].set_title(r"$\ell^2$ penalty") - - for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): - ax.imshow( - np.abs(coefs.reshape(8, 8)), - interpolation="nearest", - cmap="binary", - vmax=1, - vmin=0, - ) - ax.set_xticks(()) - ax.set_yticks(()) - - axes_row[0].set_ylabel(f"C = {C:.2f}") - +# the Lasso and LARS paths appear identical because both models solve +# the same constrained problem. However, LARS reaches the solution faster than Lasso. +# +# Lasso vs Elastic-Net +# -------------------- +# The "Lasso vs Elastic-Net Paths" visual is more notable. + +plt.figure(2) +for coef_l, coef_e in zip(coefs_lasso, coefs_enet): + l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") + l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c="#D55E00") + +plt.xlabel("alpha") +plt.ylabel("coefficients") +plt.title("Lasso vs Elastic-Net Paths") +plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") +plt.axis("tight") _ = plt.show() + # %% +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. 
+# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the +# end. This demonstrates how the :math:`\ell^1`-norm constraint encourages sparsity in +# the solution, while combining it with the :math:`\ell^2`-norm provides a balanced +# compromise. # # .. rubric:: References # -# .. [1] Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical -# Learning: Data Mining, Inference, and Prediction. New York, -# NY: Springer New York. +# .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., +# Friedman J., Algorithm 3.2, p. 74, 2008. From a9c4e84f3fab3219aa64278c72d0d4361696f633 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 3 Mar 2025 19:16:07 -0800 Subject: [PATCH 12/13] added `plot_regularization.py`, updated `doc/conf.py`. --- doc/conf.py | 22 +- examples/linear_model/plot_regularization.py | 461 +++++++++++++++++++ 2 files changed, 481 insertions(+), 2 deletions(-) create mode 100644 examples/linear_model/plot_regularization.py diff --git a/doc/conf.py b/doc/conf.py index f749b188b3274..7f8b011746852 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -477,10 +477,28 @@ def add_js_css_files(app, pagename, templatename, context, doctree): "auto_examples/model_selection/plot_nested_cross_validation_iris" ), "auto_examples/linear_model/plot_lasso_lars": ( - "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + "auto_examples/linear_model/plot_regularization" ), "auto_examples/linear_model/plot_lasso_coordinate_descent_path": ( - "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_ridge_coeffs": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_ridge_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_sgd_penalties": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_logistic_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_logistic_l1_l2_sparsity": ( + "auto_examples/linear_model/plot_regularization" ), "auto_examples/cluster/plot_color_quantization": ( "auto_examples/cluster/plot_face_compress" diff --git a/examples/linear_model/plot_regularization.py b/examples/linear_model/plot_regularization.py new file mode 100644 index 0000000000000..610a59b3736dd --- /dev/null +++ b/examples/linear_model/plot_regularization.py @@ -0,0 +1,461 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +""" +================================================= +Regularization in Linear and Logistic Regressions +================================================= + +This example explores regularization techniques for linear and logistic regression +in both regression and classification tasks. It demonstrates how the +regularization parameter :math:`\\alpha` can be adjusted to control the complexity +of the trained coefficients :math:`w` and reduce overfitting. 
+""" + +import matplotlib.pyplot as plt +import numpy as np + +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model import ( + LogisticRegression, + Ridge, + enet_path, + lars_path, + lasso_path, +) +from sklearn.metrics import mean_squared_error +from sklearn.svm import l1_min_c + +# %% +# +# Regularization in Linear Regression +# ----------------------------------- +# +# When performing linear regression on a given dataset +# :math:`(X, y)`, regularization terms can be added to +# control the model's complexity and mitigate overfitting. +# Scikit-learn provides the following regularization techniques: +# +# - :func:`~sklearn.linear_model.Lasso` +# - :func:`~sklearn.linear_model.LassoLars` +# - :func:`~sklearn.linear_model.Ridge` +# - :func:`~sklearn.linear_model.ElasticNet` (with `l1_ratio=0.5` +# in this example) +# +# Mathematically, these are formulated by minimizing the constrained +# least-squares penalty: +# +# .. math:: +# +# \min_{w} \frac{1}{2n_{\operatorname{sample}}} +# \Vert Xw - y \Vert^2_2 + +# \left\{ +# \begin{array}{cl} +# \alpha \Vert w \Vert_1 & \mbox{Lasso(-LARS)} \\ +# \alpha \Vert w \Vert_2^2 & \mbox{Ridge} \\ +# \frac{\alpha}{2} \Vert w \Vert_1 + +# \frac{\alpha}{4} \Vert w \Vert^2_2& \mbox{Elastic Net} \\ +# \end{array} +# \right. +# +# Thus, the Lasso model (resp. Ridge model) includes the :math:`\ell^1`-norm +# (resp. :math:`\ell^2`-norm) of the regression +# coefficients in the penalty, while the Elastic Net model +# incorporates both :math:`\ell^1`- and :math:`\ell^2`-norms. +# +# We can interpret the :math:`\ell^p`-norms as minimising the least squares penalty +# under different geometries. This is illustrated by plotting the unit circles +# +# .. math:: +# +# \left\{ +# \begin{array}{cl} +# \Vert w \Vert_1 &=1 \\ +# \Vert w \Vert_2^2 &=1 \\ +# 0.5 \Vert w \Vert_1 + 0.25 \Vert w \Vert_2^2 &= 1 \\ +# \end{array} +# \right. +# +# in :math:`\mathbb{R}^2`: +line = np.linspace(-1.2, 1.2, 1001) +xx, yy = np.meshgrid(line, line) + +l1 = np.abs(xx) + np.abs(yy) +l2 = xx**2 + yy**2 +elastic_net = 0.5 * l1 + 0.25 * l2 + +plt.figure() +ax = plt.gca() + +l1_contour = plt.contour(xx, yy, l1, levels=[1], colors="#0072B2") +l2_contour = plt.contour(xx, yy, l2, levels=[1], colors="#D55E00") +elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1], colors="#009E73") + +ax.set_aspect("equal") +ax.spines["left"].set_position("center") +ax.spines["right"].set_color("none") +ax.spines["bottom"].set_position("center") +ax.spines["top"].set_color("none") + +plt.clabel(l1_contour, inline=1, fmt={1.0: r"$\ell^1$"}, manual=[(-1, -1)]) +plt.clabel(l2_contour, inline=1, fmt={1.0: r"$\ell^2$"}, manual=[(-1, -1)]) +plt.clabel( + elastic_net_contour, + inline=1, + fmt={1.0: "Elastic Net"}, + manual=[(1, -2)], +) + +plt.title(r"Unit Circles in $\mathbb{R}^2$") + +plt.tight_layout() +_ = plt.show() + +# %% +# Algebraically, any solution to this optimization problem depends +# on :math:`\alpha`. For example, in Lasso, a large :math:`\alpha` forces +# the least-squares penalty to stay small, which in turn keeps the norm +# :math:`\Vert w \Vert_1` +# small. Conversely, a smaller :math:`\alpha` allows the norm +# :math:`\Vert w \Vert_1` +# to grow larger. +# +# This suggests that the regression coefficients :math:`w` evolve as +# :math:`\alpha` increases, and we are interested in knowing +# :math:`w` across a range of :math:`\alpha` values. 
This is known
+# as the **regularization path**: a list of :math:`w` values corresponding to
+# different :math:`\alpha` values, ranging from small to large.
+#
+# In this example, we plot the regularization paths to show how the magnitudes of
+# the coefficients change as the regularization parameter :math:`\alpha` increases.
+# This demonstrates how model complexity varies with :math:`\alpha`. We then compare
+# the trained coefficients with the true coefficients used to generate the training set,
+# illustrating how regularization helps mitigate overfitting.
+#
+# Creating a Noise-free Regression Dataset
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# We generate a toy dataset with 400 samples and 10 features, suitable for
+# regression analysis. Since the data is noise-free in this example,
+# we can expect our regression model to recover the true coefficients `w` exactly.
+
+X, y, w = make_regression(n_samples=400, n_features=10, coef=True, random_state=42)
+
+# %%
+#
+# Impact of Regularization Parameter on Model Complexity
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Lasso(-LARS) and Elastic Net Models
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Scikit-learn provides the following functions to compute multiple
+# :math:`w` values for various :math:`\alpha` values efficiently:
+#
+# - :func:`~sklearn.linear_model.lasso_path`
+# - :func:`~sklearn.linear_model.lars_path`
+# - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.5`
+#
+
+eps = 3e-4  # the smaller it is, the longer the path
+
+alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps)
+
+alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.5)
+
+# %%
+# The :func:`~sklearn.linear_model.lasso_path` and
+# :func:`~sklearn.linear_model.enet_path` functions compute
+# :math:`w` with **coordinate descent**: for each entry of :math:`w`,
+# the function solves for its optimal value while keeping the others
+# fixed (a minimal sketch of this update is shown just before the
+# comparison plots below). Since the algorithm iterates until convergence,
+# the number of steps is not determined by the dataset's size alone,
+# which can make the computation take longer.
+# In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps.
+
+alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso")
+
+# %%
+# The Lasso-LARS model uses the **Least Angle Regression (LARS)** algorithm
+# (see [1]_ Algorithm 3.2 on page 74) to compute the Lasso solution in
+# :math:`\min \left\{
+# n_{\operatorname{sample}}-1,n_{\operatorname{feature}}
+# \right\}`
+# steps. This provides an efficient algorithm for computing the entire Lasso path, and
+# is implemented as :func:`~sklearn.linear_model.LassoLars`
+# and :func:`~sklearn.linear_model.lars_path`.
+#
+# Ridge Model
+# ~~~~~~~~~~~
+#
+# Next, we compute the coefficients for the Ridge model using the :math:`\alpha`
+# from Elastic Net:
+
+coefs_ridge = []
+for a in alphas_enet:
+    ridge = Ridge(alpha=a)
+    ridge.fit(X, y)
+    coefs_ridge.append(ridge.coef_)
+
+coefs_ridge = np.asarray(coefs_ridge)
+
+# %%
+# Plotting the Regularization Paths
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# We now present the visualization of the regularization paths for the dataset.
+# Each model is represented by 10 curves, corresponding to the number of features in the
+# dataset. Each curve shows how a particular coefficient :math:`w_i` changes as
+# :math:`\alpha` increases.
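+
+# %%
+# To make the coordinate descent update described above concrete, the sketch below
+# implements a naive version of it for the Lasso objective, whose coordinate-wise
+# minimizer is a soft-thresholding step. This is only an illustration and is not
+# needed for the rest of the example; it should approximately reproduce one column
+# of `coefs_lasso` up to solver tolerance.
+
+
+def soft_threshold(z, threshold):
+    return np.sign(z) * np.maximum(np.abs(z) - threshold, 0.0)
+
+
+def naive_lasso_coordinate_descent(X, y, alpha, n_sweeps=200):
+    n_samples, n_features = X.shape
+    w_cd = np.zeros(n_features)
+    for _ in range(n_sweeps):
+        for j in range(n_features):
+            # partial residual that ignores the contribution of feature j
+            residual_j = y - X @ w_cd + X[:, j] * w_cd[j]
+            rho_j = X[:, j] @ residual_j / n_samples
+            w_cd[j] = soft_threshold(rho_j, alpha) / (X[:, j] @ X[:, j] / n_samples)
+    return w_cd
+
+
+# maximum deviation from the corresponding column of `coefs_lasso`
+mid = len(alphas_lasso) // 2
+w_cd = naive_lasso_coordinate_descent(X, y, alphas_lasso[mid])
+print(np.max(np.abs(w_cd - coefs_lasso[:, mid])))
+
+# %%
+# Before drawing the comparison plots, note the orientation of the arrays computed
+# above: the path functions return coefficient arrays of shape
+# `(n_features, n_alphas)`, whereas the Ridge loop produced an array of shape
+# `(n_alphas, n_features)`; the transposes below account for this difference.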
+ +model_names = ["Lasso", "Lasso-LARS", "Elastic Net", "Ridge"] + +model_coefficients = [coefs_lasso.T, coefs_lars.T, coefs_enet.T, coefs_ridge] + +model_alphas = [alphas_lasso, alphas_lars, alphas_enet, alphas_enet] + +fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(25, 10)) + +for i in range(len(model_names)): + for j in range(len(model_names)): + if i == j: + axes[i, i].semilogx(model_alphas[i], model_coefficients[i], c="#0072B2") + + axes[i, i].set_title(f"{model_names[i]} Paths", fontsize=14) + + elif j < i: + l1 = axes[i, j].semilogx( + model_alphas[i], model_coefficients[i], c="#0072B2" + ) + + l2 = axes[i, j].semilogx( + model_alphas[j], model_coefficients[j], linestyle="--", c="#D55E00" + ) + + axes[i, j].set_title( + f"{model_names[j]} vs {model_names[i]} Paths", fontsize=14 + ) + + axes[i, j].legend( + (l1[-1], l2[-1]), + (f"{model_names[i]}", f"{model_names[j]}"), + loc="upper right", + ) + + else: + fig.delaxes(axes[i, j]) + +fig.text(0.5, 0.02, r"$\alpha$", fontsize=18, ha="center") +fig.text(0, 0.5, "Coefficients", fontsize=18, va="center", rotation=90) + +fig.suptitle( + "Comparing Regularization Paths: Lasso(-LARS), Ridge, and Elastic Net", fontsize=20 +) + +fig.tight_layout(pad=3.0) +_ = plt.show() + +# %% +# +# * In the "Lasso vs Lasso-LARS Paths" visual, +# the Lasso and Lasso-LARS paths appear identical towards the end +# because both models solve the same constrained problem. +# However, Lasso-LARS reaches the solution faster than Lasso. +# +# * The "Lasso vs Elastic-Net Paths" visual is more notable. +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. +# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards +# the end. +# +# * In the "Lasso(-LARS) vs Ridge Paths" and "Elastic Net vs Ridge Paths" visuals, the +# Ridge model focuses on shrinking all coefficients uniformly, rather than setting +# some to exactly zero. As a result, the Ridge model retains all features after +# training, unlike the Lasso(-LARS) or Elastic Net models. +# +# This demonstrates how different regularization techniques govern +# the model's complexity: +# +# 1. the :math:`\ell^1`-norm constraint encourages sparsity in the solution. +# +# 2. the :math:`\ell^2`-norm constraint focuses on shrinkage of the magnitude +# of the solution. +# +# 3. the Elastic Net constraint provides a balanced compromise. +# +# Mitigating Overfitting with Regularization +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Recall that the true coefficient `w` refers to the coefficients of the linear model +# used to generate the training dataset. In this section, we compare the trained +# coefficients of Lasso(-LARS), Ridge, and Elastic Net with `w` to demonstrate how +# regularization can mitigate overfitting. This is achieved by computing the +# :func:`~sklearn.metrics.mean_squared_error` (MSE) between the true and trained +# coefficients. 
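+
+# %%
+# Before turning to that comparison, the quick check below makes the sparsity
+# observations above concrete by counting the non-zero coefficients at a comparable,
+# strongly regularized point of each path (the exact counts depend on the random
+# dataset and on the index chosen):
+
+strong = 10  # an index near the large-alpha end of the Elastic Net grid
+closest_lasso = np.argmin(np.abs(alphas_lasso - alphas_enet[strong]))
+
+print("Non-zero Lasso coefficients:", np.sum(coefs_lasso[:, closest_lasso] != 0))
+print("Non-zero Elastic Net coefficients:", np.sum(coefs_enet[:, strong] != 0))
+print("Non-zero Ridge coefficients:", np.sum(coefs_ridge[strong] != 0))
+
+# %%
+# We now collect the MSE between the true coefficients `w` and the trained
+# coefficients along each regularization path: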
+ +lasso_mse, lars_mse, enet_mse, ridge_mse = [], [], [], [] + +for coef_lasso, coef_enet, coef_ridge in zip(coefs_lasso.T, coefs_enet.T, coefs_ridge): + lasso_mse.append(mean_squared_error(coef_lasso, w)) + enet_mse.append(mean_squared_error(coef_enet, w)) + ridge_mse.append(mean_squared_error(coef_ridge, w)) + +for coef_lars in coefs_lars.T: + lars_mse.append(mean_squared_error(coef_lars, w)) + +lasso_mse = np.asarray(lasso_mse) +lars_mse = np.asarray(lars_mse) +enet_mse = np.asarray(enet_mse) +ridge_mse = np.asarray(ridge_mse) + +# %% +# +# The idea is that a smaller MSE between the true and trained coefficients implies +# greater similarity between the coefficients. Thus, if the MSE is small, the +# trained model captures the underlying pattern of the training data well. +# However, this can also indicate that the trained model may not perform well on +# generalised data, as the pattern may not hold for unseen data. +# This is essentially the overfitting problem. +# +# The following visualization demonstrates how the MSE changes for different trained +# models as the regularization parameter :math:`\alpha` increases. + +plt.figure() +l1 = plt.semilogx(alphas_lasso, lasso_mse.T, c="#0072B2") +l2 = plt.semilogx(alphas_lars, lars_mse.T, c="#D55E00") +l3 = plt.semilogx(alphas_enet, enet_mse.T, c="#009E73") +l4 = plt.semilogx(alphas_enet, ridge_mse, c="#F0E442") + +plt.xlabel(r"$\alpha$") +plt.ylabel("Mean Squared Error") +plt.title("Coefficient Error Across Regularization Strengths") +plt.legend( + (l1[-1], l2[-1], l3[-1], l4[-1]), + ("Lasso", "LARS", "Elastic Net", "Ridge"), + loc="upper left", +) + +plt.axis("tight") +_ = plt.show() + +# %% +# +# In the visualization, for small values of :math:`\alpha`, since our synthetic data is +# noise-free, the trained coefficients of Lasso(-LARS), Ridge, and Elastic Net are +# similar to the true coefficient `w` (with MSE close to 0). This indicates that the +# models capture the intricate details of the training data well. +# +# As :math:`\alpha` increases, the MSE also increases. This improves the models' ability +# to generalise to unseen data (e.g., if the data were noisy), but it also risks +# degrading model performance if the regularization becomes too strong. +# +# Regularization in Logistic Regression +# ------------------------------------- +# +# Regularization can also be applied to Logistic Regression when working on +# classification tasks. scikit-learn's :func:`~sklearn.linear_model.LogisticRegression` +# enables users to apply regularization using the `penalty` parameter: +# +# * `l1`: :math:`\ell^1`-regularization, similar to the Lasso model +# * `l2`: :math:`\ell^2`-regularization, similar to the Ridge model +# * `elasticnet`: Combined with the `l1_ratio` parameter for a mix of :math:`\ell^1` +# and :math:`\ell^2` +# +# Additionally, the `C` parameter controls the inverse of the regularization strength. +# Smaller values of `C` apply stronger regularization. +# +# We demonstrate the effect of regularization by creating a synthetic classification +# dataset. +# + +X, y = make_classification( + n_samples=400, + n_features=64, + n_informative=64, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=42, +) + +# %% +# +# In this synthetic binary classification dataset, there are 400 samples, +# each with 64 features. This toy dataset is noise-free to maintain consistency with +# our earlier regression example. 
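+
+# %%
+# Before examining sparsity, here is a small, self-contained illustration of the role
+# of `C`: because `C` is the inverse of the regularization strength, a smaller value
+# should shrink the :math:`\ell^2`-penalized coefficients towards zero. This check is
+# independent of the heatmaps computed below.
+
+for C_value in (1.0, 0.01):
+    logistic_l2 = LogisticRegression(C=C_value, penalty="l2", tol=0.01, solver="saga")
+    logistic_l2.fit(X, y)
+    print(f"C={C_value}: coefficient norm = {np.linalg.norm(logistic_l2.coef_):.2f}")
+
+# %%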
+# +# As noted in the regression example, :math:`\ell^1`-regularization may set some +# coefficients exactly to zero. For extreme values of `C`, the trained coefficients +# may even become the zero vector. To address this, scikit-learn provides the +# :func:`~sklearn.svm.l1_min_c` function, which computes the minimum value of the +# regularization strength `C` at which the model begins to learn meaningful patterns +# (i.e., some coefficients become non-zero). +# + +cs = l1_min_c(X, y, loss="log") * np.logspace(0, 10, 16) + +# %% +# +# We now plot heatmaps to represent the sparsity for each `penalty` and each value +# of `C`. +# + +l1_ratio = 0.5 # l1 weight in the Elastic-Net regularization + +fig, axes = plt.subplots(3, 3) + +# Set regularization parameter +for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): + # Increase tolerance for short training time + clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") + clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") + clf_en_LR = LogisticRegression( + C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 + ) + clf_l1_LR.fit(X, y) + clf_l2_LR.fit(X, y) + clf_en_LR.fit(X, y) + + coef_l1_LR = clf_l1_LR.coef_.ravel() + coef_l2_LR = clf_l2_LR.coef_.ravel() + coef_en_LR = clf_en_LR.coef_.ravel() + + sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 + sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 + sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 + + if i == 0: + axes_row[0].set_title(r"$\ell^1$ penalty") + axes_row[1].set_title(f"Elastic-Net\n {l1_ratio = }") + axes_row[2].set_title(r"$\ell^2$ penalty") + + for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): + ax.imshow( + np.abs(coefs.reshape(8, 8)), + interpolation="nearest", + cmap="binary", + vmax=1, + vmin=0, + ) + ax.set_xticks(()) + ax.set_yticks(()) + + axes_row[0].set_ylabel(f"C = {C:.2f}") + +_ = plt.show() +# %% +# +# Each heatmap organizes the 64 coefficients (the number of features in our synthetic +# classification dataset) into an 8×8 grid. It is constructed by taking the absolute +# values of the coefficients and displaying them in a black-and-white scale, where +# lower values appear white and higher values appear black. +# +# We can see that larger values of `C` (i.e., weaker regularization) give the model +# more freedom, while smaller values of `C` impose stronger constraints, leading to +# increased sparsity. As expected, the Elastic-Net penalty results in a level of +# sparsity between that of :math:`\ell^1` and :math:`\ell^2`. +# +# .. rubric:: References +# +# .. [1] Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical +# Learning: Data Mining, Inference, and Prediction. New York, +# NY: Springer New York. From 8667b58adc6803fa940efdf3fb5fce4b860f0d97 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 3 Mar 2025 19:21:54 -0800 Subject: [PATCH 13/13] added `plot_regularization.py`, updated `doc/conf.py`, and deleted multiple examples. 
--- .../plot_lasso_lasso_lars_elasticnet_path.py | 162 ---------------- .../plot_logistic_l1_l2_sparsity.py | 88 --------- examples/linear_model/plot_logistic_path.py | 103 ---------- examples/linear_model/plot_ridge_coeffs.py | 181 ------------------ examples/linear_model/plot_ridge_path.py | 68 ------- examples/linear_model/plot_sgd_penalties.py | 57 ------ sklearn/linear_model/_coordinate_descent.py | 8 +- 7 files changed, 4 insertions(+), 663 deletions(-) delete mode 100644 examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py delete mode 100644 examples/linear_model/plot_logistic_l1_l2_sparsity.py delete mode 100644 examples/linear_model/plot_logistic_path.py delete mode 100644 examples/linear_model/plot_ridge_coeffs.py delete mode 100644 examples/linear_model/plot_ridge_path.py delete mode 100644 examples/linear_model/plot_sgd_penalties.py diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py deleted file mode 100644 index d55d2e921dff6..0000000000000 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ /dev/null @@ -1,162 +0,0 @@ -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -""" -======================================== -Lasso, Lasso-LARS, and Elastic Net paths -======================================== - -This example shows how to compute the Lasso, -Lasso-LARS, and Elastic Net regularization paths. It illustrates the -relationship between the regularization parameter :math:`\\alpha` -and the coefficients :math:`w`. - -When performing linear regression on a given dataset -:math:`(X, y)`, regularization terms can be added to -control the model's complexity. -Scikit-learn provides the following regularization techniques: - -- :func:`~sklearn.linear_model.Lasso` -- :func:`~sklearn.linear_model.LassoLars` -- :func:`~sklearn.linear_model.ElasticNet` with `l1_ratio=0.8` - -Mathematically, these are formulated by minimising the constrained -least-squares penalty: - -.. math:: - - \\min_{w} \\frac{1}{2n_{\\operatorname{sample}}} - \\vert \\vert Xw - y \\vert \\vert^2_2 + - \\left\\{ - \\begin{array}{cl} - \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ - \\frac{4\\alpha}{5} \\vert \\vert w \\vert \\vert_1 + - \\frac{\\alpha}{10} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ - \\end{array} - \\right. - -Thus, the Lasso model includes the :math:`\\ell^1`-norm of the regression -coefficients in the penalty, while the Elastic Net model -incorporates both :math:`\\ell^1`- and :math:`\\ell^2`-norms. - -Any solution to this optimisation problem depends on :math:`\\alpha`. -For example, in Lasso, a large :math:`\\alpha` forces the least-squares -penalty to stay small, which in turn keeps the norm -:math:`\\vert \\vert w \\vert \\vert_1` -small. Conversely, a smaller :math:`\\alpha` allows the norm -:math:`\\vert \\vert w \\vert \\vert_1` -to grow larger. - -This suggests that the regression coefficients :math:`w` evolve as -:math:`\\alpha` increases, and we are interested in knowing -:math:`w` across a range of :math:`\\alpha` values. This is known -as the **regularization path**: a list of :math:`w` values corresponding to -different :math:`\\alpha` values, ranging from small to large. -In this example, we plot the regularization paths to show how the sizes of the -coefficients change as the regularization parameter increases. 
-""" -# %% -import matplotlib.pyplot as plt - -from sklearn.datasets import load_diabetes -from sklearn.linear_model import enet_path, lars_path, lasso_path - -# %% -# The Diabetes Dataset -# -------------------- -# -# We use the :func:`diabetes dataset ` to plot -# the regression coefficients for Lasso and Elastic Net. - -X, y = load_diabetes(return_X_y=True) -X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) - -# %% -# Scikit-learn provides the following functions to compute multiple -# :math:`w` values for various :math:`\alpha` values efficiently: -# -# - :func:`~sklearn.linear_model.lasso_path` -# - :func:`~sklearn.linear_model.lars_path` -# - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.8` -# - -eps = 5e-3 # the smaller it is the longer is the path - -alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) - -alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.8) - -# %% -# The :func:`~sklearn.linear_model.lasso_path` and -# :func:`~sklearn.linear_model.enet_path` functions compute -# :math:`w` with coordinate decent: for each entry of :math:`w`, -# the function solves for it optimal value while keeping the others -# fixed. Since the algorithm iterates until convergence, -# Lasso doesn't operate in a fixed number of steps based solely -# on the dataset's size, which can make it take longer to run. -# In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps. - -alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") - -# %% -# The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm -# (see [1]_) to compute the Lasso solution in -# :math:`\min \left\{ -# n_{\operatorname{sample}}-1,n_{\operatorname{feature}} -# \right\}` -# steps. This provides an efficient algorithm for computing the entire Lasso path, and -# is implemented as :func:`~sklearn.linear_model.LassoLars` -# and :func:`~sklearn.linear_model.lars_path`. -# -# We now present the visualisation of the regularization paths for the diabetes dataset. -# Each model is represented by 10 curves, corresponding to the number of features in the -# dataset. Each curve shows how a particular coefficient :math:`w_i` changes as -# :math:`\alpha` increases. -# -# Lasso vs Lasso-LARS -# ------------------- -# In the "Lasso vs LARS Paths" visual, - -plt.figure(1) -for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c="#0072B2") - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs LARS Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") -plt.axis("tight") -_ = plt.show() - -# %% -# the Lasso and LARS paths appear identical because both models solve -# the same constrained problem. However, LARS reaches the solution faster than Lasso. -# -# Lasso vs Elastic-Net -# -------------------- -# The "Lasso vs Elastic-Net Paths" visual is more notable. - -plt.figure(2) -for coef_l, coef_e in zip(coefs_lasso, coefs_enet): - l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs Elastic-Net Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") -plt.axis("tight") -_ = plt.show() - -# %% -# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. 
-# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the -# end. This demonstrates how the :math:`\ell^1`-norm constraint encourages sparsity in -# the solution, while combining it with the :math:`\ell^2`-norm provides a balanced -# compromise. -# -# .. rubric:: References -# -# .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., -# Friedman J., Algorithm 3.2, p. 74, 2008. diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py deleted file mode 100644 index f642dfade5db8..0000000000000 --- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -============================================== -L1 Penalty and Sparsity in Logistic Regression -============================================== - -Comparison of the sparsity (percentage of zero coefficients) of solutions when -L1, L2 and Elastic-Net penalty are used for different values of C. We can see -that large values of C give more freedom to the model. Conversely, smaller -values of C constrain the model more. In the L1 penalty case, this leads to -sparser solutions. As expected, the Elastic-Net penalty sparsity is between -that of L1 and L2. - -We classify 8x8 images of digits into two classes: 0-4 against 5-9. -The visualization shows coefficients of the models for varying C. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn import datasets -from sklearn.linear_model import LogisticRegression -from sklearn.preprocessing import StandardScaler - -X, y = datasets.load_digits(return_X_y=True) - -X = StandardScaler().fit_transform(X) - -# classify small against large digits -y = (y > 4).astype(int) - -l1_ratio = 0.5 # L1 weight in the Elastic-Net regularization - -fig, axes = plt.subplots(3, 3) - -# Set regularization parameter -for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): - # Increase tolerance for short training time - clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") - clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") - clf_en_LR = LogisticRegression( - C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 - ) - clf_l1_LR.fit(X, y) - clf_l2_LR.fit(X, y) - clf_en_LR.fit(X, y) - - coef_l1_LR = clf_l1_LR.coef_.ravel() - coef_l2_LR = clf_l2_LR.coef_.ravel() - coef_en_LR = clf_en_LR.coef_.ravel() - - # coef_l1_LR contains zeros due to the - # L1 sparsity inducing norm - - sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 - sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 - sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 - - print(f"C={C:.2f}") - print(f"{'Sparsity with L1 penalty:':<40} {sparsity_l1_LR:.2f}%") - print(f"{'Sparsity with Elastic-Net penalty:':<40} {sparsity_en_LR:.2f}%") - print(f"{'Sparsity with L2 penalty:':<40} {sparsity_l2_LR:.2f}%") - print(f"{'Score with L1 penalty:':<40} {clf_l1_LR.score(X, y):.2f}") - print(f"{'Score with Elastic-Net penalty:':<40} {clf_en_LR.score(X, y):.2f}") - print(f"{'Score with L2 penalty:':<40} {clf_l2_LR.score(X, y):.2f}") - - if i == 0: - axes_row[0].set_title("L1 penalty") - axes_row[1].set_title("Elastic-Net\nl1_ratio = %s" % l1_ratio) - axes_row[2].set_title("L2 penalty") - - for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): - ax.imshow( - np.abs(coefs.reshape(8, 8)), - interpolation="nearest", - cmap="binary", - vmax=1, - vmin=0, - ) - 
ax.set_xticks(()) - ax.set_yticks(()) - - axes_row[0].set_ylabel(f"C = {C}") - -plt.show() diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py deleted file mode 100644 index 46608f683740e..0000000000000 --- a/examples/linear_model/plot_logistic_path.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -============================================== -Regularization path of L1- Logistic Regression -============================================== - - -Train l1-penalized logistic regression models on a binary classification -problem derived from the Iris dataset. - -The models are ordered from strongest regularized to least regularized. The 4 -coefficients of the models are collected and plotted as a "regularization -path": on the left-hand side of the figure (strong regularizers), all the -coefficients are exactly 0. When regularization gets progressively looser, -coefficients can get non-zero values one after the other. - -Here we choose the liblinear solver because it can efficiently optimize for the -Logistic Regression loss with a non-smooth, sparsity inducing l1 penalty. - -Also note that we set a low value for the tolerance to make sure that the model -has converged before collecting the coefficients. - -We also use warm_start=True which means that the coefficients of the models are -reused to initialize the next model fit to speed-up the computation of the -full-path. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Load data -# --------- - -from sklearn import datasets - -iris = datasets.load_iris() -X = iris.data -y = iris.target -feature_names = iris.feature_names - -# %% -# Here we remove the third class to make the problem a binary classification -X = X[y != 2] -y = y[y != 2] - -# %% -# Compute regularization path -# --------------------------- - -import numpy as np - -from sklearn.linear_model import LogisticRegression -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.svm import l1_min_c - -cs = l1_min_c(X, y, loss="log") * np.logspace(0, 1, 16) - -# %% -# Create a pipeline with `StandardScaler` and `LogisticRegression`, to normalize -# the data before fitting a linear model, in order to speed-up convergence and -# make the coefficients comparable. Also, as a side effect, since the data is now -# centered around 0, we don't need to fit an intercept. 
-clf = make_pipeline( - StandardScaler(), - LogisticRegression( - penalty="l1", - solver="liblinear", - tol=1e-6, - max_iter=int(1e6), - warm_start=True, - fit_intercept=False, - ), -) -coefs_ = [] -for c in cs: - clf.set_params(logisticregression__C=c) - clf.fit(X, y) - coefs_.append(clf["logisticregression"].coef_.ravel().copy()) - -coefs_ = np.array(coefs_) - -# %% -# Plot regularization path -# ------------------------ - -import matplotlib.pyplot as plt - -# Colorblind-friendly palette (IBM Color Blind Safe palette) -colors = ["#648FFF", "#785EF0", "#DC267F", "#FE6100"] - -plt.figure(figsize=(10, 6)) -for i in range(coefs_.shape[1]): - plt.semilogx(cs, coefs_[:, i], marker="o", color=colors[i], label=feature_names[i]) - -ymin, ymax = plt.ylim() -plt.xlabel("C") -plt.ylabel("Coefficients") -plt.title("Logistic Regression Path") -plt.legend() -plt.axis("tight") -plt.show() diff --git a/examples/linear_model/plot_ridge_coeffs.py b/examples/linear_model/plot_ridge_coeffs.py deleted file mode 100644 index 1ad7962f8bfa3..0000000000000 --- a/examples/linear_model/plot_ridge_coeffs.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -========================================================= -Ridge coefficients as a function of the L2 Regularization -========================================================= - -A model that overfits learns the training data too well, capturing both the -underlying patterns and the noise in the data. However, when applied to unseen -data, the learned associations may not hold. We normally detect this when we -apply our trained predictions to the test data and see the statistical -performance drop significantly compared to the training data. - -One way to overcome overfitting is through regularization, which can be done by -penalizing large weights (coefficients) in linear models, forcing the model to -shrink all coefficients. Regularization reduces a model's reliance on specific -information obtained from the training samples. - -This example illustrates how L2 regularization in a -:class:`~sklearn.linear_model.Ridge` regression affects a model's performance by -adding a penalty term to the loss that increases with the coefficients -:math:`\\beta`. - -The regularized loss function is given by: :math:`\\mathcal{L}(X, y, \\beta) = -\\| y - X \\beta \\|^{2}_{2} + \\alpha \\| \\beta \\|^{2}_{2}` - -where :math:`X` is the input data, :math:`y` is the target variable, -:math:`\\beta` is the vector of coefficients associated with the features, and -:math:`\\alpha` is the regularization strength. - -The regularized loss function aims to balance the trade-off between accurately -predicting the training set and to prevent overfitting. - -In this regularized loss, the left-hand side (e.g. :math:`\\|y - -X\\beta\\|^{2}_{2}`) measures the squared difference between the actual target -variable, :math:`y`, and the predicted values. Minimizing this term alone could -lead to overfitting, as the model may become too complex and sensitive to noise -in the training data. - -To address overfitting, Ridge regularization adds a constraint, called a penalty -term, (:math:`\\alpha \\| \\beta\\|^{2}_{2}`) to the loss function. This penalty -term is the sum of the squares of the model's coefficients, multiplied by the -regularization strength :math:`\\alpha`. By introducing this constraint, Ridge -regularization discourages any single coefficient :math:`\\beta_{i}` from taking -an excessively large value and encourages smaller and more evenly distributed -coefficients. 
Higher values of :math:`\\alpha` force the coefficients towards -zero. However, an excessively high :math:`\\alpha` can result in an underfit -model that fails to capture important patterns in the data. - -Therefore, the regularized loss function combines the prediction accuracy term -and the penalty term. By adjusting the regularization strength, practitioners -can fine-tune the degree of constraint imposed on the weights, training a model -capable of generalizing well to unseen data while avoiding overfitting. -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Purpose of this example -# ----------------------- -# For the purpose of showing how Ridge regularization works, we will create a -# non-noisy data set. Then we will train a regularized model on a range of -# regularization strengths (:math:`\alpha`) and plot how the trained -# coefficients and the mean squared error between those and the original values -# behave as functions of the regularization strength. -# -# Creating a non-noisy data set -# ***************************** -# We make a toy data set with 100 samples and 10 features, that's suitable to -# detect regression. Out of the 10 features, 8 are informative and contribute to -# the regression, while the remaining 2 features do not have any effect on the -# target variable (their true coefficients are 0). Please note that in this -# example the data is non-noisy, hence we can expect our regression model to -# recover exactly the true coefficients w. -from sklearn.datasets import make_regression - -X, y, w = make_regression( - n_samples=100, n_features=10, n_informative=8, coef=True, random_state=1 -) - -# Obtain the true coefficients -print(f"The true coefficient of this regression problem are:\n{w}") - -# %% -# Training the Ridge Regressor -# **************************** -# We use :class:`~sklearn.linear_model.Ridge`, a linear model with L2 -# regularization. We train several models, each with a different value for the -# model parameter `alpha`, which is a positive constant that multiplies the -# penalty term, controlling the regularization strength. For each trained model -# we then compute the error between the true coefficients `w` and the -# coefficients found by the model `clf`. We store the identified coefficients -# and the calculated errors for the corresponding coefficients in lists, which -# makes it convenient for us to plot them. -import numpy as np - -from sklearn.linear_model import Ridge -from sklearn.metrics import mean_squared_error - -clf = Ridge() - -# Generate values for `alpha` that are evenly distributed on a logarithmic scale -alphas = np.logspace(-3, 4, 200) -coefs = [] -errors_coefs = [] - -# Train the model with different regularisation strengths -for a in alphas: - clf.set_params(alpha=a).fit(X, y) - coefs.append(clf.coef_) - errors_coefs.append(mean_squared_error(clf.coef_, w)) - -# %% -# Plotting trained Coefficients and Mean Squared Errors -# ***************************************************** -# We now plot the 10 different regularized coefficients as a function of the -# regularization parameter `alpha` where each color represents a different -# coefficient. -# -# On the right-hand-side, we plot how the errors of the coefficients from the -# estimator change as a function of regularization. 
-import matplotlib.pyplot as plt -import pandas as pd - -alphas = pd.Index(alphas, name="alpha") -coefs = pd.DataFrame(coefs, index=alphas, columns=[f"Feature {i}" for i in range(10)]) -errors = pd.Series(errors_coefs, index=alphas, name="Mean squared error") - -fig, axs = plt.subplots(1, 2, figsize=(20, 6)) - -coefs.plot( - ax=axs[0], - logx=True, - title="Ridge coefficients as a function of the regularization strength", -) -axs[0].set_ylabel("Ridge coefficient values") -errors.plot( - ax=axs[1], - logx=True, - title="Coefficient error as a function of the regularization strength", -) -_ = axs[1].set_ylabel("Mean squared error") -# %% -# Interpreting the plots -# ********************** -# The plot on the left-hand side shows how the regularization strength (`alpha`) -# affects the Ridge regression coefficients. Smaller values of `alpha` (weak -# regularization), allow the coefficients to closely resemble the true -# coefficients (`w`) used to generate the data set. This is because no -# additional noise was added to our artificial data set. As `alpha` increases, -# the coefficients shrink towards zero, gradually reducing the impact of the -# features that were formerly more significant. -# -# The right-hand side plot shows the mean squared error (MSE) between the -# coefficients found by the model and the true coefficients (`w`). It provides a -# measure that relates to how exact our ridge model is in comparison to the true -# generative model. A low error means that it found coefficients closer to the -# ones of the true generative model. In this case, since our toy data set was -# non-noisy, we can see that the least regularized model retrieves coefficients -# closest to the true coefficients (`w`) (error is close to 0). -# -# When `alpha` is small, the model captures the intricate details of the -# training data, whether those were caused by noise or by actual information. As -# `alpha` increases, the highest coefficients shrink more rapidly, rendering -# their corresponding features less influential in the training process. This -# can enhance a model's ability to generalize to unseen data (if there was a lot -# of noise to capture), but it also poses the risk of losing performance if the -# regularization becomes too strong compared to the amount of noise the data -# contained (as in this example). -# -# In real-world scenarios where data typically includes noise, selecting an -# appropriate `alpha` value becomes crucial in striking a balance between an -# overfitting and an underfitting model. -# -# Here, we saw that :class:`~sklearn.linear_model.Ridge` adds a penalty to the -# coefficients to fight overfitting. Another problem that occurs is linked to -# the presence of outliers in the training dataset. An outlier is a data point -# that differs significantly from other observations. Concretely, these outliers -# impact the left-hand side term of the loss function that we showed earlier. -# Some other linear models are formulated to be robust to outliers such as the -# :class:`~sklearn.linear_model.HuberRegressor`. You can learn more about it in -# the :ref:`sphx_glr_auto_examples_linear_model_plot_huber_vs_ridge.py` example. 
diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py deleted file mode 100644 index d3c19acd9e18c..0000000000000 --- a/examples/linear_model/plot_ridge_path.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -=========================================================== -Plot Ridge coefficients as a function of the regularization -=========================================================== - -Shows the effect of collinearity in the coefficients of an estimator. - -.. currentmodule:: sklearn.linear_model - -:class:`Ridge` Regression is the estimator used in this example. -Each color represents a different feature of the -coefficient vector, and this is displayed as a function of the -regularization parameter. - -This example also shows the usefulness of applying Ridge regression -to highly ill-conditioned matrices. For such matrices, a slight -change in the target variable can cause huge variances in the -calculated weights. In such cases, it is useful to set a certain -regularization (alpha) to reduce this variation (noise). - -When alpha is very large, the regularization effect dominates the -squared loss function and the coefficients tend to zero. -At the end of the path, as alpha tends toward zero -and the solution tends towards the ordinary least squares, coefficients -exhibit big oscillations. In practise it is necessary to tune alpha -in such a way that a balance is maintained between both. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn import linear_model - -# X is the 10x10 Hilbert matrix -X = 1.0 / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis]) -y = np.ones(10) - -# %% -# Compute paths -# ------------- - -n_alphas = 200 -alphas = np.logspace(-10, -2, n_alphas) - -coefs = [] -for a in alphas: - ridge = linear_model.Ridge(alpha=a, fit_intercept=False) - ridge.fit(X, y) - coefs.append(ridge.coef_) - -# %% -# Display results -# --------------- - -ax = plt.gca() - -ax.plot(alphas, coefs) -ax.set_xscale("log") -ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis -plt.xlabel("alpha") -plt.ylabel("weights") -plt.title("Ridge coefficients as a function of the regularization") -plt.axis("tight") -plt.show() diff --git a/examples/linear_model/plot_sgd_penalties.py b/examples/linear_model/plot_sgd_penalties.py deleted file mode 100644 index 6f8830b52fe7a..0000000000000 --- a/examples/linear_model/plot_sgd_penalties.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -============== -SGD: Penalties -============== - -Contours of where the penalty is equal to 1 -for the three penalties L1, L2 and elastic-net. - -All of the above are supported by :class:`~sklearn.linear_model.SGDClassifier` -and :class:`~sklearn.linear_model.SGDRegressor`. 
- -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -l1_color = "navy" -l2_color = "c" -elastic_net_color = "darkorange" - -line = np.linspace(-1.5, 1.5, 1001) -xx, yy = np.meshgrid(line, line) - -l2 = xx**2 + yy**2 -l1 = np.abs(xx) + np.abs(yy) -rho = 0.5 -elastic_net = rho * l1 + (1 - rho) * l2 - -plt.figure(figsize=(10, 10), dpi=100) -ax = plt.gca() - -elastic_net_contour = plt.contour( - xx, yy, elastic_net, levels=[1], colors=elastic_net_color -) -l2_contour = plt.contour(xx, yy, l2, levels=[1], colors=l2_color) -l1_contour = plt.contour(xx, yy, l1, levels=[1], colors=l1_color) -ax.set_aspect("equal") -ax.spines["left"].set_position("center") -ax.spines["right"].set_color("none") -ax.spines["bottom"].set_position("center") -ax.spines["top"].set_color("none") - -plt.clabel( - elastic_net_contour, - inline=1, - fontsize=18, - fmt={1.0: "elastic-net"}, - manual=[(-1, -1)], -) -plt.clabel(l2_contour, inline=1, fontsize=18, fmt={1.0: "L2"}, manual=[(-1, -1)]) -plt.clabel(l1_contour, inline=1, fontsize=18, fmt={1.0: "L1"}, manual=[(-1, -1)]) - -plt.tight_layout() -plt.show() diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index b98cf08925910..5c43c8b04ec20 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -319,8 +319,8 @@ def lasso_path( Notes ----- For an example, see - :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py - `. + :ref:`examples/linear_model/plot_regularization.py + `. To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. @@ -524,8 +524,8 @@ def enet_path( Notes ----- For an example, see - :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py - `. + :ref:`examples/linear_model/plot_regularization.py + `. Examples --------