From 971478b1f7209aa8e265a64b2a894ed6b7041a40 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 11:44:55 -0700 Subject: [PATCH 01/13] DOC rework the example presenting the regularization path of Lasso, Lasso-LARS, and Elastic Net --- .../plot_lasso_lasso_lars_elasticnet_path.py | 164 +++++++++++++----- 1 file changed, 116 insertions(+), 48 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 44ae64c4c2811..3a8292b439dd7 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -4,31 +4,104 @@ ======================================== This example shows how to compute the "paths" of coefficients along the Lasso, -Lasso-LARS, and Elastic Net regularization paths. In other words, it shows the -relationship between the regularization parameter (alpha) and the coefficients. +Lasso-LARS, and Elastic Net regularization paths. It illustrates the +relationship between the regularization parameter :math:`\\alpha` +and the coefficients :math:`w`. + +When performing linear regression on a given dataset +:math:`(X, y)`, regularization terms can be added to +control the model's complexity. +Scikit-learn provides the following regularization techniques: + +- :func:`~sklearn.linear_model.Lasso` +- :func:`~sklearn.linear_model.LassoLars` +- :func:`~sklearn.linear_model.ElasticNet` with default `l1_ratio=0.5` + +Mathematically, these are formulated by minimising the constrained +least-squares penalty: + +.. math:: + + \\min_{w} \\frac{1}{2n_{\\operatorname{sample}}} + \\vert \\vert Xw - y \\vert \\vert^2_2 + + \\left\\{ + \\begin{array}{cl} + \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ + \\frac{\\alpha}{2} \\vert \\vert w \\vert \\vert_1 + + \\frac{\\alpha}{4} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ + \\end{array} + \\right. + +Thus, the Lasso model includes the :math:`\\ell^1`-norm of the regression +coefficients in the penalty, while the Elastic Net model +incorporates both :math:`\\ell^1`- and :math:`\\ell^2`-norms. + +Any solution to this optimisation problem depends on :math:`\\alpha`. +For example, in Lasso, a large :math:`\\alpha` forces the least-squares +penalty to stay small, which in turn keeps the norm +:math:`\\vert \\vert w \\vert \\vert_1` +small. Conversely, a smaller :math:`\\alpha` allows the norm +:math:`\\vert \\vert w \\vert \\vert_1` +to grow larger. + +This suggests that the regression coefficients :math:`w` evolve as +:math:`\\alpha` increases, and we are interested in knowing +:math:`w` across a range of :math:`\\alpha` values. This is known +as the **regularization path**: a list of :math:`w` values corresponding to +different :math:`\\alpha` values, ranging from small to large. +In this example, we plot the regularization paths to show how the sizes of the +coefficients change as the regularization parameter increases. + +We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot +the regression coefficients for Lasso and Elastic Net. 
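+
+To make the idea of a regularization path concrete, one could simply fit a
+:class:`~sklearn.linear_model.Lasso` estimator at a handful of :math:`\\alpha`
+values and stack the resulting coefficient vectors (an illustrative sketch only;
+the dedicated path functions introduced below compute this far more efficiently)::
+
+    import numpy as np
+    from sklearn.datasets import load_diabetes
+    from sklearn.linear_model import Lasso
+
+    X, y = load_diabetes(return_X_y=True)
+    X /= X.std(axis=0)  # standardize, as done later in this example
+
+    alphas = np.logspace(-3, 1, num=5)
+    manual_path = np.array(
+        [Lasso(alpha=a, max_iter=10_000).fit(X, y).coef_ for a in alphas]
+    )
+    # one row of coefficients per alpha value; each column traces one w_i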
Scikit-learn provides the
+following functions to compute multiple :math:`w` values for various
+:math:`\\alpha` values efficiently:
+
+- :func:`~sklearn.linear_model.lasso_path`
+- :func:`~sklearn.linear_model.lars_path`
+- :func:`~sklearn.linear_model.enet_path`
+
+The :func:`~sklearn.linear_model.lasso_path` and
+:func:`~sklearn.linear_model.enet_path` functions compute
+:math:`w` with coordinate descent: for each entry of :math:`w`,
+the function solves for its optimal value while keeping the others
+fixed. Since the algorithm iterates until convergence,
+Lasso doesn't operate in a fixed number of steps based solely
+on the dataset's size, which can make it take longer to run.
+In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps.
+
+The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm
+(see [1]_) to compute the Lasso solution in
+:math:`\\min \\left\\{
+n_{\\operatorname{sample}}-1,n_{\\operatorname{feature}}
+\\right\\}`
+steps. This provides an efficient algorithm for computing the entire Lasso path, and
+is implemented as :func:`~sklearn.linear_model.LassoLars`
+and :func:`~sklearn.linear_model.lars_path`.
+
+We now present the visualisation of the regularization path for the diabetes dataset.
+Each model is represented by 10 curves, corresponding to the number of features in the
+dataset. Each curve shows how a particular coefficient :math:`w_i` changes as
+:math:`\\alpha` increases.
+
+- In the "Lasso vs LARS Paths" visual, the Lasso and LARS paths appear identical because
+  both models solve the same constrained problem. However, LARS reaches the solution
+  faster than Lasso.
+
+- The "Lasso vs Elastic-Net Paths" visual is more notable. Elastic Net's coefficients
+  tend to have smaller absolute values than those of Lasso. Additionally, Elastic Net
+  maintains more non-zero coefficients than Lasso towards the end. This demonstrates
+  how the :math:`\\ell^1`-norm constraint encourages sparsity in the solution, while
+  combining it with the :math:`\\ell^2`-norm provides a balanced compromise.
+
+
+
+.. rubric:: References
+
+.. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R.,
+   Friedman J., Algorithm 3.2, p. 74, 2008.
-Lasso and Lasso-LARS impose a sparsity constraint on the coefficients,
-encouraging some of them to be zero. Elastic Net is a generalization of
-Lasso that adds an L2 penalty term to the L1 penalty term. This allows for
-some coefficients to be non-zero while still encouraging sparsity.
-Lasso and Elastic Net use a coordinate descent method to compute the paths, while
-Lasso-LARS uses the LARS algorithm to compute the paths.
-
-The paths are computed using :func:`~sklearn.linear_model.lasso_path`,
-:func:`~sklearn.linear_model.lars_path`, and :func:`~sklearn.linear_model.enet_path`.
-
-The results show different comparison plots:
-
-- Compare Lasso and Lasso-LARS
-- Compare Lasso and Elastic Net
-- Compare Lasso with positive Lasso
-- Compare LARS and Positive LARS
-- Compare Elastic Net and positive Elastic Net
-
-Each plot shows how the model coefficients vary as the regularization strength changes,
-offering insight into the behavior of these models
-under different constraints.
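+
+A quick way to convince oneself that Lasso and Lasso-LARS indeed solve the same
+problem is to fit both estimators at a single, common :math:`\\alpha` and compare
+the coefficients; a minimal sketch, independent of the code below::
+
+    import numpy as np
+    from sklearn.datasets import load_diabetes
+    from sklearn.linear_model import Lasso, LassoLars
+
+    X, y = load_diabetes(return_X_y=True)
+    X /= X.std(axis=0)
+
+    alpha = 0.1
+    coef_cd = Lasso(alpha=alpha, max_iter=10_000).fit(X, y).coef_
+    coef_lars = LassoLars(alpha=alpha).fit(X, y).coef_
+    print(np.abs(coef_cd - coef_lars).max())  # expected to be tiny (solver tolerance)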
""" # Authors: The scikit-learn developers @@ -75,62 +148,57 @@ # Display results plt.figure(1) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_lasso, coef_lars, c in zip(coefs_lasso, coefs_lars, colors): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c=c) - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c=c) +for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): + l1 = plt.semilogx(alphas_lasso, coef_lasso, c='#0072B2') + l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("Lasso and LARS Paths") +plt.title("Lasso vs LARS Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") plt.axis("tight") plt.figure(2) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors): - l1 = plt.semilogx(alphas_lasso, coef_l, c=c) - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c=c) +for coef_l, coef_e in zip(coefs_lasso, coefs_enet): + l1 = plt.semilogx(alphas_lasso, coef_l, c='#0072B2') + l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("Lasso and Elastic-Net Paths") +plt.title("Lasso vs Elastic-Net Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") plt.axis("tight") - plt.figure(3) -for coef_l, coef_pl, c in zip(coefs_lasso, coefs_positive_lasso, colors): - l1 = plt.semilogy(alphas_lasso, coef_l, c=c) - l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c=c) +for coef_l, coef_pl in zip(coefs_lasso, coefs_positive_lasso): + l1 = plt.semilogy(alphas_lasso, coef_l, c='#0072B2') + l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("Lasso and positive Lasso") +plt.title("Lasso vs Positive Lasso Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "positive Lasso"), loc="lower right") plt.axis("tight") - plt.figure(4) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_lars, coef_positive_lars, c in zip(coefs_lars, coefs_positive_lars, colors): - l1 = plt.semilogx(alphas_lars, coef_lars, c=c) - l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c=c) +for coef_lars, coef_positive_lars in zip(coefs_lars, coefs_positive_lars): + l1 = plt.semilogx(alphas_lars, coef_lars, c='#0072B2') + l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("LARS and Positive LARS") +plt.title("LARS vs Positive LARS Paths") plt.legend((l1[-1], l2[-1]), ("LARS", "Positive LARS"), loc="lower right") plt.axis("tight") plt.figure(5) -for coef_e, coef_pe, c in zip(coefs_enet, coefs_positive_enet, colors): - l1 = plt.semilogx(alphas_enet, coef_e, c=c) - l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c=c) +for coef_e, coef_pe in zip(coefs_enet, coefs_positive_enet): + l1 = plt.semilogx(alphas_enet, coef_e, c='#0072B2') + l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c='#D55E00') plt.xlabel("alpha") plt.ylabel("coefficients") -plt.title("Elastic-Net and positive Elastic-Net") +plt.title("Elastic-Net vs Positive Elastic-Net Paths") plt.legend((l1[-1], l2[-1]), ("Elastic-Net", "positive Elastic-Net"), loc="lower right") plt.axis("tight") plt.show() From a24f1859a21a452c2f527a016171a203f4d0a3ca Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 12:03:32 -0700 Subject: [PATCH 02/13] Fix Linting --- 
.../linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 3a8292b439dd7..7296da8ef9a05 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -107,8 +107,6 @@ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause -from itertools import cycle - import matplotlib.pyplot as plt from sklearn.datasets import load_diabetes @@ -183,7 +181,9 @@ plt.figure(4) for coef_lars, coef_positive_lars in zip(coefs_lars, coefs_positive_lars): l1 = plt.semilogx(alphas_lars, coef_lars, c='#0072B2') - l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c='#D55E00') + l2 = plt.semilogx( + alphas_positive_lars, coef_positive_lars, linestyle="--", c='#D55E00' + ) plt.xlabel("alpha") plt.ylabel("coefficients") From edb9dba2bb5b63b8b2a21488b1bb81acbec35ee6 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 12:21:40 -0700 Subject: [PATCH 03/13] Fix Linting --- .../plot_lasso_lasso_lars_elasticnet_path.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 7296da8ef9a05..ffea03ec22020 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -147,8 +147,8 @@ plt.figure(1) for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c='#0072B2') - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c='#D55E00') + l1 = plt.semilogx(alphas_lasso, coef_lasso, c="#0072B2") + l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c="#D55E00") plt.xlabel("alpha") plt.ylabel("coefficients") @@ -158,8 +158,8 @@ plt.figure(2) for coef_l, coef_e in zip(coefs_lasso, coefs_enet): - l1 = plt.semilogx(alphas_lasso, coef_l, c='#0072B2') - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c='#D55E00') + l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") + l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c="#D55E00") plt.xlabel("alpha") plt.ylabel("coefficients") @@ -169,8 +169,8 @@ plt.figure(3) for coef_l, coef_pl in zip(coefs_lasso, coefs_positive_lasso): - l1 = plt.semilogy(alphas_lasso, coef_l, c='#0072B2') - l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c='#D55E00') + l1 = plt.semilogy(alphas_lasso, coef_l, c="#0072B2") + l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c="#D55E00") plt.xlabel("alpha") plt.ylabel("coefficients") @@ -180,9 +180,9 @@ plt.figure(4) for coef_lars, coef_positive_lars in zip(coefs_lars, coefs_positive_lars): - l1 = plt.semilogx(alphas_lars, coef_lars, c='#0072B2') + l1 = plt.semilogx(alphas_lars, coef_lars, c="#0072B2") l2 = plt.semilogx( - alphas_positive_lars, coef_positive_lars, linestyle="--", c='#D55E00' + alphas_positive_lars, coef_positive_lars, linestyle="--", c="#D55E00" ) plt.xlabel("alpha") @@ -193,8 +193,8 @@ plt.figure(5) for coef_e, coef_pe in zip(coefs_enet, coefs_positive_enet): - l1 = plt.semilogx(alphas_enet, coef_e, c='#0072B2') - l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c='#D55E00') + l1 = plt.semilogx(alphas_enet, coef_e, c="#0072B2") + l2 = 
plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c="#D55E00") plt.xlabel("alpha") plt.ylabel("coefficients") From baba6173488a4ee0b55a3986c307ddd94616d7a4 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 12:40:13 -0700 Subject: [PATCH 04/13] Fix Linting --- .../linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index ffea03ec22020..32a086ee51811 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -1,3 +1,6 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + """ ======================================== Lasso, Lasso-LARS, and Elastic Net paths @@ -101,12 +104,8 @@ .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., Algorithm 3.2, p. 74, 2008. - """ -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - import matplotlib.pyplot as plt from sklearn.datasets import load_diabetes From e697a16bad1c37343f8c052dab345b9b55cfa489 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 14:38:51 -0700 Subject: [PATCH 05/13] Implemented notebook style. --- .../plot_lasso_lasso_lars_elasticnet_path.py | 175 +++++++----------- 1 file changed, 68 insertions(+), 107 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 32a086ee51811..d554eb4310675 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -54,95 +54,70 @@ different :math:`\\alpha` values, ranging from small to large. In this example, we plot the regularization paths to show how the sizes of the coefficients change as the regularization parameter increases. - -We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot -the regression coefficients for Lasso and Elastic Net. Scikit-learn provides the -following functions to compute multiple :math:`w` values for various -:math:`\\alpha` values efficiently: - -- :func:`~sklearn.linear_model.lasso_path` -- :func:`~sklearn.linear_model.lars_path` -- :func:`~sklearn.linear_model.enet_path` - -The :func:`~sklearn.linear_model.lasso_path` and -:func:`~sklearn.linear_model.enet_path` functions compute -:math:`w` with coordinate decent: for each entry of :math:`w`, -the function solves for it optimal value while keeping the others -fixed. Since the algorithm iterates until convergence, -Lasso doesn't operate in a fixed number of steps based solely -on the dataset's size, which can make it take longer to run. -In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps. - -The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm -(see [1]_) to compute the Lasso solution in -:math:`\\min \\left\\{ -n_{\\operatorname{sample}}-1,n_{\\operatorname{feature}} -\\right\\}` -steps. This provides an efficient algorithm for computing the entire Lasso path, and -is implemented as :func:`~sklearn.linear_model.LassoLars` -and :func:`~sklearn.linear_model.lars_path`. - -We now present the visualisation of the regularization path for the diabetes dataset. -Each model is represented by 10 curves, corresponding to the number of features in the -dataset. 
Each curve shows how a particular coefficient :math:`w_i` changes as -:math:`\\alpha` increases. - -- In the "Lasso vs LARS Paths" visual, the Lasso and LARS paths appear identical because - both models solve the same constrained problem. However, LARS reaches the solution - faster than Lasso. - -- The "Lasso vs Elastic-Net Paths" visual is more notable. Elastic Net's coefficients - tend to have smaller absolute values than those of Lasso. Additionally, Elastic Net - maintains more non-zero coefficients than Lasso towards the end. This demonstrates - how the :math:`\\ell^1`-norm constraint encourages sparsity in the solution, while - combining it with the :math:`\\ell^2`-norm provides a balanced compromise. - - - -.. rubric:: References - -.. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., - Friedman J., Algorithm 3.2, p. 74, 2008. - """ - +# %% import matplotlib.pyplot as plt from sklearn.datasets import load_diabetes from sklearn.linear_model import enet_path, lars_path, lasso_path +# %% +# The Diabetes Dataset +# ------------------ +# +# We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot +# the regression coefficients for Lasso and Elastic Net. + X, y = load_diabetes(return_X_y=True) X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) -# Compute paths +# %% +# Scikit-learn provides the following functions to compute multiple +# :math:`w` values for various :math:`\\alpha` values efficiently: +# +# - :func:`~sklearn.linear_model.lasso_path` +# - :func:`~sklearn.linear_model.lars_path` +# - :func:`~sklearn.linear_model.enet_path` +# eps = 5e-3 # the smaller it is the longer is the path -print("Computing regularization path using the lasso...") alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) -print("Computing regularization path using the positive lasso...") -alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path( - X, y, eps=eps, positive=True -) - -print("Computing regularization path using the LARS...") -alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") - -print("Computing regularization path using the positive LARS...") -alphas_positive_lars, _, coefs_positive_lars = lars_path( - X, y, method="lasso", positive=True -) - -print("Computing regularization path using the elastic net...") alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.8) -print("Computing regularization path using the positive elastic net...") -alphas_positive_enet, coefs_positive_enet, _ = enet_path( - X, y, eps=eps, l1_ratio=0.8, positive=True -) +# %% +# The :func:`~sklearn.linear_model.lasso_path` and +# :func:`~sklearn.linear_model.enet_path` functions compute +# :math:`w` with coordinate decent: for each entry of :math:`w`, +# the function solves for it optimal value while keeping the others +# fixed. Since the algorithm iterates until convergence, +# Lasso doesn't operate in a fixed number of steps based solely +# on the dataset's size, which can make it take longer to run. +# In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps. -# Display results +alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") + +# %% +# The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm +# (see [1]_) to compute the Lasso solution in +# :math:`\\min \\left\\{ +# n_{\\operatorname{sample}}-1,n_{\\operatorname{feature}} +# \\right\\}` +# steps. 
This provides an efficient algorithm for computing the entire Lasso path, and +# is implemented as :func:`~sklearn.linear_model.LassoLars` +# and :func:`~sklearn.linear_model.lars_path`. +# +# We now present the visualisation of the regularization path for the diabetes dataset. +# Each model is represented by 10 curves, corresponding to the number of features in the +# dataset. Each curve shows how a particular coefficient :math:`w_i` changes as +# :math:`\\alpha` increases. +# +# Lasso vs Lasso-LARS +# ------------------- +# In the "Lasso vs LARS Paths" visual, +# +# .. _marginal_dependencies: plt.figure(1) for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): @@ -155,6 +130,16 @@ plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") plt.axis("tight") +# %% +# the Lasso and LARS paths appear identical because both models solve +# the same constrained problem. However, LARS reaches the solution faster than Lasso. +# +# Lasso vs Elastic-Net +# -------------------- +# The "Lasso vs Elastic-Net Paths" visual is more notable. +# +# .. _marginal_dependencies: + plt.figure(2) for coef_l, coef_e in zip(coefs_lasso, coefs_enet): l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") @@ -166,38 +151,14 @@ plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") plt.axis("tight") -plt.figure(3) -for coef_l, coef_pl in zip(coefs_lasso, coefs_positive_lasso): - l1 = plt.semilogy(alphas_lasso, coef_l, c="#0072B2") - l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs Positive Lasso Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "positive Lasso"), loc="lower right") -plt.axis("tight") - -plt.figure(4) -for coef_lars, coef_positive_lars in zip(coefs_lars, coefs_positive_lars): - l1 = plt.semilogx(alphas_lars, coef_lars, c="#0072B2") - l2 = plt.semilogx( - alphas_positive_lars, coef_positive_lars, linestyle="--", c="#D55E00" - ) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("LARS vs Positive LARS Paths") -plt.legend((l1[-1], l2[-1]), ("LARS", "Positive LARS"), loc="lower right") -plt.axis("tight") - -plt.figure(5) -for coef_e, coef_pe in zip(coefs_enet, coefs_positive_enet): - l1 = plt.semilogx(alphas_enet, coef_e, c="#0072B2") - l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Elastic-Net vs Positive Elastic-Net Paths") -plt.legend((l1[-1], l2[-1]), ("Elastic-Net", "positive Elastic-Net"), loc="lower right") -plt.axis("tight") -plt.show() +# %% +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. +# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the +# end. This demonstrates how the :math:`\\ell^1`-norm constraint encourages sparsity in +# the solution, while combining it with the :math:`\\ell^2`-norm provides a balanced +# compromise. +# +# .. rubric:: References +# +# .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., +# Friedman J., Algorithm 3.2, p. 74, 2008. From b24673db4883f663fea807629c3d303884b261c8 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 16:00:40 -0700 Subject: [PATCH 06/13] Fix styling. 
--- .../plot_lasso_lasso_lars_elasticnet_path.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index d554eb4310675..a74ce42a46e1d 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -63,7 +63,7 @@ # %% # The Diabetes Dataset -# ------------------ +# -------------------- # # We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot # the regression coefficients for Lasso and Elastic Net. @@ -73,7 +73,7 @@ # %% # Scikit-learn provides the following functions to compute multiple -# :math:`w` values for various :math:`\\alpha` values efficiently: +# :math:`w` values for various :math:`\alpha` values efficiently: # # - :func:`~sklearn.linear_model.lasso_path` # - :func:`~sklearn.linear_model.lars_path` @@ -101,9 +101,9 @@ # %% # The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm # (see [1]_) to compute the Lasso solution in -# :math:`\\min \\left\\{ -# n_{\\operatorname{sample}}-1,n_{\\operatorname{feature}} -# \\right\\}` +# :math:`\min \left\{ +# n_{\operatorname{sample}}-1,n_{\operatorname{feature}} +# \right\}` # steps. This provides an efficient algorithm for computing the entire Lasso path, and # is implemented as :func:`~sklearn.linear_model.LassoLars` # and :func:`~sklearn.linear_model.lars_path`. @@ -111,13 +111,11 @@ # We now present the visualisation of the regularization path for the diabetes dataset. # Each model is represented by 10 curves, corresponding to the number of features in the # dataset. Each curve shows how a particular coefficient :math:`w_i` changes as -# :math:`\\alpha` increases. +# :math:`\alpha` increases. # # Lasso vs Lasso-LARS # ------------------- # In the "Lasso vs LARS Paths" visual, -# -# .. _marginal_dependencies: plt.figure(1) for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): @@ -129,6 +127,7 @@ plt.title("Lasso vs LARS Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") plt.axis("tight") +_ = plt.show() # %% # the Lasso and LARS paths appear identical because both models solve @@ -137,8 +136,6 @@ # Lasso vs Elastic-Net # -------------------- # The "Lasso vs Elastic-Net Paths" visual is more notable. -# -# .. _marginal_dependencies: plt.figure(2) for coef_l, coef_e in zip(coefs_lasso, coefs_enet): @@ -150,12 +147,13 @@ plt.title("Lasso vs Elastic-Net Paths") plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") plt.axis("tight") +_ = plt.show() # %% # Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. # Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the -# end. This demonstrates how the :math:`\\ell^1`-norm constraint encourages sparsity in -# the solution, while combining it with the :math:`\\ell^2`-norm provides a balanced +# end. This demonstrates how the :math:`\ell^1`-norm constraint encourages sparsity in +# the solution, while combining it with the :math:`\ell^2`-norm provides a balanced # compromise. # # .. rubric:: References From 7fe7e30ba059594bd96076fb1b5f153b7d61f026 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 7 Oct 2024 17:05:56 -0700 Subject: [PATCH 07/13] Fix styling. 
--- examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index a74ce42a46e1d..5ab24db441e74 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -18,7 +18,7 @@ - :func:`~sklearn.linear_model.Lasso` - :func:`~sklearn.linear_model.LassoLars` -- :func:`~sklearn.linear_model.ElasticNet` with default `l1_ratio=0.5` +- :func:`~sklearn.linear_model.ElasticNet` Mathematically, these are formulated by minimising the constrained least-squares penalty: From f2d2460591652bc53426179e2b01da7c8fcac1d7 Mon Sep 17 00:00:00 2001 From: virchan Date: Tue, 8 Oct 2024 17:33:50 -0700 Subject: [PATCH 08/13] Fix `l1_ratio`. --- .../linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 5ab24db441e74..6bccdd00dd9e6 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -18,7 +18,7 @@ - :func:`~sklearn.linear_model.Lasso` - :func:`~sklearn.linear_model.LassoLars` -- :func:`~sklearn.linear_model.ElasticNet` +- :func:`~sklearn.linear_model.ElasticNet` with `l1_ratio=0.8` Mathematically, these are formulated by minimising the constrained least-squares penalty: @@ -30,8 +30,8 @@ \\left\\{ \\begin{array}{cl} \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ - \\frac{\\alpha}{2} \\vert \\vert w \\vert \\vert_1 + - \\frac{\\alpha}{4} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ + \\frac{4\\alpha}{5} \\vert \\vert w \\vert \\vert_1 + + \\frac{\\alpha}{10} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ \\end{array} \\right. @@ -77,7 +77,7 @@ # # - :func:`~sklearn.linear_model.lasso_path` # - :func:`~sklearn.linear_model.lars_path` -# - :func:`~sklearn.linear_model.enet_path` +# - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.8` # eps = 5e-3 # the smaller it is the longer is the path From a2e31f0443a81038de034b9182e5325b363c0842 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 18 Nov 2024 23:45:53 +0800 Subject: [PATCH 09/13] Fix typos. --- .../linear_model/plot_lasso_lasso_lars_elasticnet_path.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 6bccdd00dd9e6..d55d2e921dff6 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -6,7 +6,7 @@ Lasso, Lasso-LARS, and Elastic Net paths ======================================== -This example shows how to compute the "paths" of coefficients along the Lasso, +This example shows how to compute the Lasso, Lasso-LARS, and Elastic Net regularization paths. It illustrates the relationship between the regularization parameter :math:`\\alpha` and the coefficients :math:`w`. 
@@ -65,7 +65,7 @@ # The Diabetes Dataset # -------------------- # -# We use the diabetes dataset :func:`~sklearn.datasets.load_diabetes` to plot +# We use the :func:`diabetes dataset ` to plot # the regression coefficients for Lasso and Elastic Net. X, y = load_diabetes(return_X_y=True) @@ -108,7 +108,7 @@ # is implemented as :func:`~sklearn.linear_model.LassoLars` # and :func:`~sklearn.linear_model.lars_path`. # -# We now present the visualisation of the regularization path for the diabetes dataset. +# We now present the visualisation of the regularization paths for the diabetes dataset. # Each model is represented by 10 curves, corresponding to the number of features in the # dataset. Each curve shows how a particular coefficient :math:`w_i` changes as # :math:`\alpha` increases. From 5bcfaf8f356378150551f62b1b861d672d3474e0 Mon Sep 17 00:00:00 2001 From: virchan Date: Wed, 12 Feb 2025 21:19:35 -0800 Subject: [PATCH 10/13] update 'plot_lasso_lasso_lars_elasticnet_path.py' --- .../plot_lasso_lasso_lars_elasticnet_path.py | 503 ++++++++++++++---- 1 file changed, 401 insertions(+), 102 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index d55d2e921dff6..0fe8534e0fd78 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -2,76 +2,153 @@ # SPDX-License-Identifier: BSD-3-Clause """ -======================================== -Lasso, Lasso-LARS, and Elastic Net paths -======================================== - -This example shows how to compute the Lasso, -Lasso-LARS, and Elastic Net regularization paths. It illustrates the -relationship between the regularization parameter :math:`\\alpha` -and the coefficients :math:`w`. - -When performing linear regression on a given dataset -:math:`(X, y)`, regularization terms can be added to -control the model's complexity. -Scikit-learn provides the following regularization techniques: - -- :func:`~sklearn.linear_model.Lasso` -- :func:`~sklearn.linear_model.LassoLars` -- :func:`~sklearn.linear_model.ElasticNet` with `l1_ratio=0.8` - -Mathematically, these are formulated by minimising the constrained -least-squares penalty: - -.. math:: - - \\min_{w} \\frac{1}{2n_{\\operatorname{sample}}} - \\vert \\vert Xw - y \\vert \\vert^2_2 + - \\left\\{ - \\begin{array}{cl} - \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ - \\frac{4\\alpha}{5} \\vert \\vert w \\vert \\vert_1 + - \\frac{\\alpha}{10} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ - \\end{array} - \\right. - -Thus, the Lasso model includes the :math:`\\ell^1`-norm of the regression -coefficients in the penalty, while the Elastic Net model -incorporates both :math:`\\ell^1`- and :math:`\\ell^2`-norms. - -Any solution to this optimisation problem depends on :math:`\\alpha`. -For example, in Lasso, a large :math:`\\alpha` forces the least-squares -penalty to stay small, which in turn keeps the norm -:math:`\\vert \\vert w \\vert \\vert_1` -small. Conversely, a smaller :math:`\\alpha` allows the norm -:math:`\\vert \\vert w \\vert \\vert_1` -to grow larger. - -This suggests that the regression coefficients :math:`w` evolve as -:math:`\\alpha` increases, and we are interested in knowing -:math:`w` across a range of :math:`\\alpha` values. 
This is known -as the **regularization path**: a list of :math:`w` values corresponding to -different :math:`\\alpha` values, ranging from small to large. -In this example, we plot the regularization paths to show how the sizes of the -coefficients change as the regularization parameter increases. +================================================= +Regularization in Linear and Logistic Regressions +================================================= + +This example explores regularization techniques for linear and logistic regression +in both regression and classification tasks. It demonstrates how the +regularization parameter :math:`\\alpha` can be adjusted to control the complexity +of the trained coefficients :math:`w` and reduce overfitting. """ -# %% + import matplotlib.pyplot as plt +import numpy as np -from sklearn.datasets import load_diabetes -from sklearn.linear_model import enet_path, lars_path, lasso_path +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model import ( + LogisticRegression, + Ridge, + enet_path, + lars_path, + lasso_path, +) +from sklearn.metrics import mean_squared_error +from sklearn.svm import l1_min_c # %% -# The Diabetes Dataset -# -------------------- # -# We use the :func:`diabetes dataset ` to plot -# the regression coefficients for Lasso and Elastic Net. +# Regularization in Linear Regression +# ----------------------------------- +# +# When performing linear regression on a given dataset +# :math:`(X, y)`, regularization terms can be added to +# control the model's complexity and mitigate overfitting. +# Scikit-learn provides the following regularization techniques: +# +# - :func:`~sklearn.linear_model.Lasso` +# - :func:`~sklearn.linear_model.LassoLars` +# - :func:`~sklearn.linear_model.Ridge` +# - :func:`~sklearn.linear_model.ElasticNet` (with `l1_ratio=0.8` +# in this example) +# +# Mathematically, these are formulated by minimizing the constrained +# least-squares penalty: +# +# .. math:: +# +# \min_{w} \frac{1}{2n_{\operatorname{sample}}} +# \Vert Xw - y \Vert^2_2 + +# \left\{ +# \begin{array}{cl} +# \alpha \Vert w \Vert_1 & \mbox{Lasso(-LARS)} \\ +# \alpha \Vert w \Vert_2^2 & \mbox{Ridge} \\ +# \frac{4\alpha}{5} \Vert w \Vert_1 + +# \frac{\alpha}{10} \Vert w \Vert^2_2& \mbox{Elastic Net} \\ +# \end{array} +# \right. +# +# Thus, the Lasso model (resp. Ridge model) includes the :math:`\ell^1`-norm +# (resp. :math:`\ell^2`-norm) of the regression +# coefficients in the penalty, while the Elastic Net model +# incorporates both :math:`\ell^1`- and :math:`\ell^2`-norms. +# +# We can interpret the :math:`\ell^p`-norms as minimising the least squares penalty +# under different geometries. This is illustrated by plotting the unit circles +# +# .. math:: +# +# \left\{ +# \begin{array}{cl} +# \Vert w \Vert_1 &=1 \\ +# \Vert w \Vert_2^2 &=1 \\ +# 0.8 \Vert w \Vert_1 + 0.1 \Vert w \Vert_2^2 &= 1 \\ +# \end{array} +# \right. 
+# +# in :math:`\mathbb{R}^2`: +line = np.linspace(-1.2, 1.2, 1001) +xx, yy = np.meshgrid(line, line) -X, y = load_diabetes(return_X_y=True) -X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) +l1 = np.abs(xx) + np.abs(yy) +l2 = xx**2 + yy**2 +elastic_net = 0.8 * l1 + 0.1 * l2 + +plt.figure() +ax = plt.gca() + +l1_contour = plt.contour(xx, yy, l1, levels=[1], colors="#0072B2") +l2_contour = plt.contour(xx, yy, l2, levels=[1], colors="#D55E00") +elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1], colors="#009E73") + +ax.set_aspect("equal") +ax.spines["left"].set_position("center") +ax.spines["right"].set_color("none") +ax.spines["bottom"].set_position("center") +ax.spines["top"].set_color("none") + +plt.clabel(l1_contour, inline=1, fmt={1.0: r"$\ell^1$"}, manual=[(-1, -1)]) +plt.clabel(l2_contour, inline=1, fmt={1.0: r"$\ell^2$"}, manual=[(-1, -1)]) +plt.clabel( + elastic_net_contour, + inline=1, + fmt={1.0: "Elastic Net"}, + manual=[(1, -1)], +) + +plt.title(r"Unit Circles in $\mathbb{R}^2$") + +plt.tight_layout() +_ = plt.show() # %% +# Algebraically, any solution to this optimization problem depends +# on :math:`\alpha`. For example, in Lasso, a large :math:`\alpha` forces +# the least-squares penalty to stay small, which in turn keeps the norm +# :math:`\Vert w \Vert_1` +# small. Conversely, a smaller :math:`\alpha` allows the norm +# :math:`\Vert w \Vert_1` +# to grow larger. +# +# This suggests that the regression coefficients :math:`w` evolve as +# :math:`\alpha` increases, and we are interested in knowing +# :math:`w` across a range of :math:`\alpha` values. This is known +# as the **regularization path**: a list of :math:`w` values corresponding to +# different :math:`\alpha` values, ranging from small to large. +# +# In this example, we plot the regularization paths to show how the magnitudes of +# the coefficients change as the regularization parameter :math:`\alpha` increases. +# This demonstrates how model complexity varies with :math:`\alpha`. We then compare +# the trained coefficients with the true coefficients used to generate the training set, +# illustrating how regularization helps mitigate overfitting. +# +# Creating a Noise-free Regression Dataset +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# We generate a toy dataset with 400 samples and 10 features, suitable for +# regression analysis. Since the data is noise-free in this example, +# we can expect our regression model to recover the true coefficients `w` exactly. 
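+#
+# As a quick, illustrative check of this claim (on a small, separate toy dataset,
+# not the one created below), an unregularized least-squares fit on noise-free
+# data recovers the generating coefficients up to numerical precision. The names
+# `X_check`, `y_check` and `w_check` are used only for this aside; `np` and
+# `make_regression` come from the imports at the top of the example.
+
+from sklearn.linear_model import LinearRegression
+
+X_check, y_check, w_check = make_regression(
+    n_samples=50, n_features=5, n_informative=5, coef=True, random_state=0
+)
+print(np.allclose(LinearRegression().fit(X_check, y_check).coef_, w_check))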
+ +X, y, w = make_regression(n_samples=400, n_features=10, coef=True, random_state=42) + +# %% +# +# Impact of Regularization Parameter on Model Complexity +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Lasso(-LARS) and Elastic Net Models +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# # Scikit-learn provides the following functions to compute multiple # :math:`w` values for various :math:`\alpha` values efficiently: # @@ -80,7 +157,7 @@ # - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.8` # -eps = 5e-3 # the smaller it is the longer is the path +eps = 3e-4 # the smaller it is the longer is the path alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) @@ -89,7 +166,7 @@ # %% # The :func:`~sklearn.linear_model.lasso_path` and # :func:`~sklearn.linear_model.enet_path` functions compute -# :math:`w` with coordinate decent: for each entry of :math:`w`, +# :math:`w` with **coordinate decent**: for each entry of :math:`w`, # the function solves for it optimal value while keeping the others # fixed. Since the algorithm iterates until convergence, # Lasso doesn't operate in a fixed number of steps based solely @@ -99,8 +176,8 @@ alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") # %% -# The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm -# (see [1]_) to compute the Lasso solution in +# The Lasso-LARS model uses the **Least Angle Regression (LARS)** algorithm +# (see [1]_ Algorithm 3.2 on page 74) to compute the Lasso solution in # :math:`\min \left\{ # n_{\operatorname{sample}}-1,n_{\operatorname{feature}} # \right\}` @@ -108,55 +185,277 @@ # is implemented as :func:`~sklearn.linear_model.LassoLars` # and :func:`~sklearn.linear_model.lars_path`. # -# We now present the visualisation of the regularization paths for the diabetes dataset. +# Ridge Model +# ~~~~~~~~~~~ +# +# Next, we compute the coefficients for the Ridge model using the :math:`\alpha` +# from Elastic Net: + +coefs_ridge = [] +for a in alphas_enet: + ridge = Ridge(alpha=a) + ridge.fit(X, y) + coefs_ridge.append(ridge.coef_) + +coefs_ridge = np.asarray(coefs_ridge) + +# %% +# Plotting the Regularization Paths +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We now present the visualization of the regularization paths for the dataset. # Each model is represented by 10 curves, corresponding to the number of features in the # dataset. Each curve shows how a particular coefficient :math:`w_i` changes as # :math:`\alpha` increases. 
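+
+# A rough check of the step-count claim above: `alphas_lars` holds the alpha value
+# at each breakpoint of the computed path, so its length should be close to
+# min(n_samples - 1, n_features) + 1 (the exact count can differ slightly when
+# features enter and leave the active set).
+print(len(alphas_lars), "breakpoints on the LARS path for", X.shape[1], "features")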
-# -# Lasso vs Lasso-LARS -# ------------------- -# In the "Lasso vs LARS Paths" visual, -plt.figure(1) -for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c="#0072B2") - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c="#D55E00") +model_names = ["Lasso", "Lasso-LARS", "Elastic Net", "Ridge"] -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs LARS Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") -plt.axis("tight") +model_coefficients = [coefs_lasso.T, coefs_lars.T, coefs_enet.T, coefs_ridge] + +model_alphas = [alphas_lasso, alphas_lars, alphas_enet, alphas_enet] + +fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(25, 10)) + +for i in range(len(model_names)): + for j in range(len(model_names)): + if i == j: + axes[i, i].semilogx(model_alphas[i], model_coefficients[i], c="#0072B2") + + axes[i, i].set_title(f"{model_names[i]} Paths", fontsize=14) + + elif j < i: + l1 = axes[i, j].semilogx( + model_alphas[i], model_coefficients[i], c="#0072B2" + ) + + l2 = axes[i, j].semilogx( + model_alphas[j], model_coefficients[j], linestyle="--", c="#D55E00" + ) + + axes[i, j].set_title( + f"{model_names[j]} vs {model_names[i]} Paths", fontsize=14 + ) + + axes[i, j].legend( + (l1[-1], l2[-1]), + (f"{model_names[i]}", f"{model_names[j]}"), + loc="upper right", + ) + + else: + fig.delaxes(axes[i, j]) + +fig.supxlabel(r"$\alpha$", fontsize=18) +fig.supylabel("Coefficients", fontsize=18) + +fig.suptitle( + "Comparing Regularization Paths: Lasso(-LARS), Ridge, and Elastic Net", fontsize=20 +) + +fig.tight_layout(pad=3.0) _ = plt.show() # %% -# the Lasso and LARS paths appear identical because both models solve -# the same constrained problem. However, LARS reaches the solution faster than Lasso. -# -# Lasso vs Elastic-Net -# -------------------- -# The "Lasso vs Elastic-Net Paths" visual is more notable. - -plt.figure(2) -for coef_l, coef_e in zip(coefs_lasso, coefs_enet): - l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs Elastic-Net Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") +# +# * In the "Lasso vs Lasso-LARS Paths" visual, +# the Lasso and Lasso-LARS paths appear identical towards the end +# because both models solve the same constrained problem. +# However, Lasso-LARS reaches the solution faster than Lasso. +# +# * The "Lasso vs Elastic-Net Paths" visual is more notable. +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. +# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards +# the end. +# +# * In the "Lasso(-LARS) vs Ridge Paths" and "Elastic Net vs Ridge Paths" visuals, the +# Ridge model focuses on shrinking all coefficients uniformly, rather than setting +# some to exactly zero. As a result, the Ridge model retains all features after +# training, unlike the Lasso(-LARS) or Elastic Net models. +# +# This demonstrates how different regularization techniques govern +# the model's complexity: +# +# 1. the :math:`\ell^1`-norm constraint encourages sparsity in the solution. +# +# 2. the :math:`\ell^2`-norm constraint focuses on shrinkage of the magnitude +# of the solution. +# +# 3. the Elastic Net constraint provides a balanced compromise. 
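+#
+# A compact, illustrative way to see these three behaviours is to count non-zero
+# coefficients after fitting each model at a common, moderately strong alpha on a
+# small toy dataset in which only 3 of 10 features carry signal (this dataset and
+# the names below are used only for this aside). Ridge typically keeps every
+# coefficient non-zero, while the :math:`\ell^1`-penalized models tend to zero out
+# the uninformative features.
+
+from sklearn.linear_model import ElasticNet, Lasso
+
+X_toy, y_toy = make_regression(
+    n_samples=400, n_features=10, n_informative=3, noise=10.0, random_state=0
+)
+for model in (
+    Lasso(alpha=2.0),
+    ElasticNet(alpha=2.0, l1_ratio=0.8),
+    Ridge(alpha=2.0),
+):
+    n_nonzero = np.count_nonzero(model.fit(X_toy, y_toy).coef_)
+    print(f"{model.__class__.__name__}: {n_nonzero} non-zero coefficients")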
+# +# Mitigating Overfitting with Regularization +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Recall that the true coefficient `w` refers to the coefficients of the linear model +# used to generate the training dataset. In this section, we compare the trained +# coefficients of Lasso(-LARS), Ridge, and Elastic Net with `w` to demonstrate how +# regularization can mitigate overfitting. This is achieved by computing the +# :func:`~sklearn.metrics.mean_squared_error` (MSE) between the true and trained +# coefficients. + +lasso_mse, lars_mse, enet_mse, ridge_mse = [], [], [], [] + +for coef_lasso, coef_enet, coef_ridge in zip(coefs_lasso.T, coefs_enet.T, coefs_ridge): + lasso_mse.append(mean_squared_error(coef_lasso, w)) + enet_mse.append(mean_squared_error(coef_enet, w)) + ridge_mse.append(mean_squared_error(coef_ridge, w)) + +for coef_lars in coefs_lars.T: + lars_mse.append(mean_squared_error(coef_lars, w)) + +lasso_mse = np.asarray(lasso_mse) +lars_mse = np.asarray(lars_mse) +enet_mse = np.asarray(enet_mse) +ridge_mse = np.asarray(ridge_mse) + +# %% +# +# The idea is that a smaller MSE between the true and trained coefficients implies +# greater similarity between the coefficients. Thus, if the MSE is small, the +# trained model captures the underlying pattern of the training data well. +# However, this can also indicate that the trained model may not perform well on +# generalised data, as the pattern may not hold for unseen data. +# This is essentially the overfitting problem. +# +# The following visualization demonstrates how the MSE changes for different trained +# models as the regularization parameter :math:`\alpha` increases. + +plt.figure() +l1 = plt.semilogx(alphas_lasso, lasso_mse.T, c="#0072B2") +l2 = plt.semilogx(alphas_lars, lars_mse.T, c="#D55E00") +l3 = plt.semilogx(alphas_enet, enet_mse.T, c="#009E73") +l4 = plt.semilogx(alphas_enet, ridge_mse, c="#F0E442") + +plt.xlabel(r"$\alpha$") +plt.ylabel("Mean Squared Error") +plt.title("Coefficient Error Across Regularization Strengths") +plt.legend( + (l1[-1], l2[-1], l3[-1], l4[-1]), + ("Lasso", "LARS", "Elastic Net", "Ridge"), + loc="upper left", +) + plt.axis("tight") _ = plt.show() # %% -# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. -# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the -# end. This demonstrates how the :math:`\ell^1`-norm constraint encourages sparsity in -# the solution, while combining it with the :math:`\ell^2`-norm provides a balanced -# compromise. +# +# In the visualization, for small values of :math:`\alpha`, since our synthetic data is +# noise-free, the trained coefficients of Lasso(-LARS), Ridge, and Elastic Net are +# similar to the true coefficient `w` (with MSE close to 0). This indicates that the +# models capture the intricate details of the training data well. +# +# As :math:`\alpha` increases, the MSE also increases. This improves the models' ability +# to generalise to unseen data (e.g., if the data were noisy), but it also risks +# degrading model performance if the regularization becomes too strong. +# +# Regularization in Logistic Regression +# ------------------------------------- +# +# Regularization can also be applied to Logistic Regression when working on +# classification tasks. 
scikit-learn's :func:`~sklearn.linear_model.LogisticRegression` +# enables users to apply regularization using the `penalty` parameter: +# +# * `l1`: :math:`\ell^1`-regularization, similar to the Lasso model +# * `l2`: :math:`\ell^2`-regularization, similar to the Ridge model +# * `elasticnet`: Combined with the `l1_ratio` parameter for a mix of :math:`\ell^1` +# and :math:`\ell^2` +# +# Additionally, the `C` parameter controls the inverse of the regularization strength. +# Smaller values of `C` apply stronger regularization. +# +# We demonstrate the effect of regularization by creating a synthetic classification +# dataset. +# + +X, y = make_classification( + n_samples=400, + n_features=64, + n_informative=64, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=42, +) + +# %% +# +# In this synthetic binary classification dataset, there are 400 samples, +# each with 64 features. This toy dataset is noise-free to maintain consistency with +# our earlier regression example. +# +# As noted in the regression example, :math:`\ell^1`-regularization may set some +# coefficients exactly to zero. For extreme values of `C`, the trained coefficients +# may even become the zero vector. To address this, scikit-learn provides the +# :func:`~sklearn.svm.l1_min_c` function, which computes the minimum value of the +# regularization strength `C` at which the model begins to learn meaningful patterns +# (i.e., some coefficients become non-zero). +# + +cs = l1_min_c(X, y, loss="log") * np.logspace(0, 10, 16) + +# %% +# +# We now plot blah-blah-blah +# + +l1_ratio = 0.8 # L1 weight in the Elastic-Net regularization + +fig, axes = plt.subplots(3, 3) + +# Set regularization parameter +for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): + # Increase tolerance for short training time + clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") + clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") + clf_en_LR = LogisticRegression( + C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 + ) + clf_l1_LR.fit(X, y) + clf_l2_LR.fit(X, y) + clf_en_LR.fit(X, y) + + coef_l1_LR = clf_l1_LR.coef_.ravel() + coef_l2_LR = clf_l2_LR.coef_.ravel() + coef_en_LR = clf_en_LR.coef_.ravel() + + # coef_l1_LR contains zeros due to the + # L1 sparsity inducing norm + + sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 + sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 + sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 + + print(f"C={C:.2f}") + print(f"{'Sparsity with L1 penalty:':<40} {sparsity_l1_LR:.2f}%") + print(f"{'Sparsity with Elastic-Net penalty:':<40} {sparsity_en_LR:.2f}%") + print(f"{'Sparsity with L2 penalty:':<40} {sparsity_l2_LR:.2f}%") + print(f"{'Score with L1 penalty:':<40} {clf_l1_LR.score(X, y):.2f}") + print(f"{'Score with Elastic-Net penalty:':<40} {clf_en_LR.score(X, y):.2f}") + print(f"{'Score with L2 penalty:':<40} {clf_l2_LR.score(X, y):.2f}") + + if i == 0: + axes_row[0].set_title(r"$\ell^1$ penalty") + axes_row[1].set_title(f"Elastic-Net\n {l1_ratio = }") + axes_row[2].set_title(r"$\ell^2$ penalty") + + for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): + ax.imshow( + np.abs(coefs.reshape(8, 8)), + interpolation="nearest", + cmap="binary", + vmax=1, + vmin=0, + ) + ax.set_xticks(()) + ax.set_yticks(()) + + axes_row[0].set_ylabel(f"C = {C:.2f}") + +_ = plt.show() +# %% # # .. rubric:: References # -# .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., -# Friedman J., Algorithm 3.2, p. 
74, 2008. +# .. [1] Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical +# Learning: Data Mining, Inference, and Prediction. New York, +# NY: Springer New York. From add4294969f4f8f7ad14d1ca3e251e11e287586b Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 3 Mar 2025 18:42:10 -0800 Subject: [PATCH 11/13] Revert "update 'plot_lasso_lasso_lars_elasticnet_path.py'" This reverts commit 5bcfaf8f356378150551f62b1b861d672d3474e0. --- .../plot_lasso_lasso_lars_elasticnet_path.py | 503 ++++-------------- 1 file changed, 102 insertions(+), 401 deletions(-) diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py index 0fe8534e0fd78..d55d2e921dff6 100644 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py @@ -2,153 +2,76 @@ # SPDX-License-Identifier: BSD-3-Clause """ -================================================= -Regularization in Linear and Logistic Regressions -================================================= - -This example explores regularization techniques for linear and logistic regression -in both regression and classification tasks. It demonstrates how the -regularization parameter :math:`\\alpha` can be adjusted to control the complexity -of the trained coefficients :math:`w` and reduce overfitting. +======================================== +Lasso, Lasso-LARS, and Elastic Net paths +======================================== + +This example shows how to compute the Lasso, +Lasso-LARS, and Elastic Net regularization paths. It illustrates the +relationship between the regularization parameter :math:`\\alpha` +and the coefficients :math:`w`. + +When performing linear regression on a given dataset +:math:`(X, y)`, regularization terms can be added to +control the model's complexity. +Scikit-learn provides the following regularization techniques: + +- :func:`~sklearn.linear_model.Lasso` +- :func:`~sklearn.linear_model.LassoLars` +- :func:`~sklearn.linear_model.ElasticNet` with `l1_ratio=0.8` + +Mathematically, these are formulated by minimising the constrained +least-squares penalty: + +.. math:: + + \\min_{w} \\frac{1}{2n_{\\operatorname{sample}}} + \\vert \\vert Xw - y \\vert \\vert^2_2 + + \\left\\{ + \\begin{array}{cl} + \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ + \\frac{4\\alpha}{5} \\vert \\vert w \\vert \\vert_1 + + \\frac{\\alpha}{10} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ + \\end{array} + \\right. + +Thus, the Lasso model includes the :math:`\\ell^1`-norm of the regression +coefficients in the penalty, while the Elastic Net model +incorporates both :math:`\\ell^1`- and :math:`\\ell^2`-norms. + +Any solution to this optimisation problem depends on :math:`\\alpha`. +For example, in Lasso, a large :math:`\\alpha` forces the least-squares +penalty to stay small, which in turn keeps the norm +:math:`\\vert \\vert w \\vert \\vert_1` +small. Conversely, a smaller :math:`\\alpha` allows the norm +:math:`\\vert \\vert w \\vert \\vert_1` +to grow larger. + +This suggests that the regression coefficients :math:`w` evolve as +:math:`\\alpha` increases, and we are interested in knowing +:math:`w` across a range of :math:`\\alpha` values. This is known +as the **regularization path**: a list of :math:`w` values corresponding to +different :math:`\\alpha` values, ranging from small to large. 
+In this example, we plot the regularization paths to show how the sizes of the +coefficients change as the regularization parameter increases. """ - +# %% import matplotlib.pyplot as plt -import numpy as np -from sklearn.datasets import make_classification, make_regression -from sklearn.linear_model import ( - LogisticRegression, - Ridge, - enet_path, - lars_path, - lasso_path, -) -from sklearn.metrics import mean_squared_error -from sklearn.svm import l1_min_c +from sklearn.datasets import load_diabetes +from sklearn.linear_model import enet_path, lars_path, lasso_path # %% +# The Diabetes Dataset +# -------------------- # -# Regularization in Linear Regression -# ----------------------------------- -# -# When performing linear regression on a given dataset -# :math:`(X, y)`, regularization terms can be added to -# control the model's complexity and mitigate overfitting. -# Scikit-learn provides the following regularization techniques: -# -# - :func:`~sklearn.linear_model.Lasso` -# - :func:`~sklearn.linear_model.LassoLars` -# - :func:`~sklearn.linear_model.Ridge` -# - :func:`~sklearn.linear_model.ElasticNet` (with `l1_ratio=0.8` -# in this example) -# -# Mathematically, these are formulated by minimizing the constrained -# least-squares penalty: -# -# .. math:: -# -# \min_{w} \frac{1}{2n_{\operatorname{sample}}} -# \Vert Xw - y \Vert^2_2 + -# \left\{ -# \begin{array}{cl} -# \alpha \Vert w \Vert_1 & \mbox{Lasso(-LARS)} \\ -# \alpha \Vert w \Vert_2^2 & \mbox{Ridge} \\ -# \frac{4\alpha}{5} \Vert w \Vert_1 + -# \frac{\alpha}{10} \Vert w \Vert^2_2& \mbox{Elastic Net} \\ -# \end{array} -# \right. -# -# Thus, the Lasso model (resp. Ridge model) includes the :math:`\ell^1`-norm -# (resp. :math:`\ell^2`-norm) of the regression -# coefficients in the penalty, while the Elastic Net model -# incorporates both :math:`\ell^1`- and :math:`\ell^2`-norms. -# -# We can interpret the :math:`\ell^p`-norms as minimising the least squares penalty -# under different geometries. This is illustrated by plotting the unit circles -# -# .. math:: -# -# \left\{ -# \begin{array}{cl} -# \Vert w \Vert_1 &=1 \\ -# \Vert w \Vert_2^2 &=1 \\ -# 0.8 \Vert w \Vert_1 + 0.1 \Vert w \Vert_2^2 &= 1 \\ -# \end{array} -# \right. -# -# in :math:`\mathbb{R}^2`: -line = np.linspace(-1.2, 1.2, 1001) -xx, yy = np.meshgrid(line, line) +# We use the :func:`diabetes dataset ` to plot +# the regression coefficients for Lasso and Elastic Net. -l1 = np.abs(xx) + np.abs(yy) -l2 = xx**2 + yy**2 -elastic_net = 0.8 * l1 + 0.1 * l2 - -plt.figure() -ax = plt.gca() - -l1_contour = plt.contour(xx, yy, l1, levels=[1], colors="#0072B2") -l2_contour = plt.contour(xx, yy, l2, levels=[1], colors="#D55E00") -elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1], colors="#009E73") - -ax.set_aspect("equal") -ax.spines["left"].set_position("center") -ax.spines["right"].set_color("none") -ax.spines["bottom"].set_position("center") -ax.spines["top"].set_color("none") - -plt.clabel(l1_contour, inline=1, fmt={1.0: r"$\ell^1$"}, manual=[(-1, -1)]) -plt.clabel(l2_contour, inline=1, fmt={1.0: r"$\ell^2$"}, manual=[(-1, -1)]) -plt.clabel( - elastic_net_contour, - inline=1, - fmt={1.0: "Elastic Net"}, - manual=[(1, -1)], -) - -plt.title(r"Unit Circles in $\mathbb{R}^2$") - -plt.tight_layout() -_ = plt.show() +X, y = load_diabetes(return_X_y=True) +X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) # %% -# Algebraically, any solution to this optimization problem depends -# on :math:`\alpha`. 
For example, in Lasso, a large :math:`\alpha` forces -# the least-squares penalty to stay small, which in turn keeps the norm -# :math:`\Vert w \Vert_1` -# small. Conversely, a smaller :math:`\alpha` allows the norm -# :math:`\Vert w \Vert_1` -# to grow larger. -# -# This suggests that the regression coefficients :math:`w` evolve as -# :math:`\alpha` increases, and we are interested in knowing -# :math:`w` across a range of :math:`\alpha` values. This is known -# as the **regularization path**: a list of :math:`w` values corresponding to -# different :math:`\alpha` values, ranging from small to large. -# -# In this example, we plot the regularization paths to show how the magnitudes of -# the coefficients change as the regularization parameter :math:`\alpha` increases. -# This demonstrates how model complexity varies with :math:`\alpha`. We then compare -# the trained coefficients with the true coefficients used to generate the training set, -# illustrating how regularization helps mitigate overfitting. -# -# Creating a Noise-free Regression Dataset -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We generate a toy dataset with 400 samples and 10 features, suitable for -# regression analysis. Since the data is noise-free in this example, -# we can expect our regression model to recover the true coefficients `w` exactly. - -X, y, w = make_regression(n_samples=400, n_features=10, coef=True, random_state=42) - -# %% -# -# Impact of Regularization Parameter on Model Complexity -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Lasso(-LARS) and Elastic Net Models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# # Scikit-learn provides the following functions to compute multiple # :math:`w` values for various :math:`\alpha` values efficiently: # @@ -157,7 +80,7 @@ # - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.8` # -eps = 3e-4 # the smaller it is the longer is the path +eps = 5e-3 # the smaller it is the longer is the path alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) @@ -166,7 +89,7 @@ # %% # The :func:`~sklearn.linear_model.lasso_path` and # :func:`~sklearn.linear_model.enet_path` functions compute -# :math:`w` with **coordinate decent**: for each entry of :math:`w`, +# :math:`w` with coordinate decent: for each entry of :math:`w`, # the function solves for it optimal value while keeping the others # fixed. Since the algorithm iterates until convergence, # Lasso doesn't operate in a fixed number of steps based solely @@ -176,8 +99,8 @@ alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") # %% -# The Lasso-LARS model uses the **Least Angle Regression (LARS)** algorithm -# (see [1]_ Algorithm 3.2 on page 74) to compute the Lasso solution in +# The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm +# (see [1]_) to compute the Lasso solution in # :math:`\min \left\{ # n_{\operatorname{sample}}-1,n_{\operatorname{feature}} # \right\}` @@ -185,277 +108,55 @@ # is implemented as :func:`~sklearn.linear_model.LassoLars` # and :func:`~sklearn.linear_model.lars_path`. # -# Ridge Model -# ~~~~~~~~~~~ -# -# Next, we compute the coefficients for the Ridge model using the :math:`\alpha` -# from Elastic Net: - -coefs_ridge = [] -for a in alphas_enet: - ridge = Ridge(alpha=a) - ridge.fit(X, y) - coefs_ridge.append(ridge.coef_) - -coefs_ridge = np.asarray(coefs_ridge) - -# %% -# Plotting the Regularization Paths -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We now present the visualization of the regularization paths for the dataset. 
+# We now present the visualisation of the regularization paths for the diabetes dataset. # Each model is represented by 10 curves, corresponding to the number of features in the # dataset. Each curve shows how a particular coefficient :math:`w_i` changes as # :math:`\alpha` increases. - -model_names = ["Lasso", "Lasso-LARS", "Elastic Net", "Ridge"] - -model_coefficients = [coefs_lasso.T, coefs_lars.T, coefs_enet.T, coefs_ridge] - -model_alphas = [alphas_lasso, alphas_lars, alphas_enet, alphas_enet] - -fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(25, 10)) - -for i in range(len(model_names)): - for j in range(len(model_names)): - if i == j: - axes[i, i].semilogx(model_alphas[i], model_coefficients[i], c="#0072B2") - - axes[i, i].set_title(f"{model_names[i]} Paths", fontsize=14) - - elif j < i: - l1 = axes[i, j].semilogx( - model_alphas[i], model_coefficients[i], c="#0072B2" - ) - - l2 = axes[i, j].semilogx( - model_alphas[j], model_coefficients[j], linestyle="--", c="#D55E00" - ) - - axes[i, j].set_title( - f"{model_names[j]} vs {model_names[i]} Paths", fontsize=14 - ) - - axes[i, j].legend( - (l1[-1], l2[-1]), - (f"{model_names[i]}", f"{model_names[j]}"), - loc="upper right", - ) - - else: - fig.delaxes(axes[i, j]) - -fig.supxlabel(r"$\alpha$", fontsize=18) -fig.supylabel("Coefficients", fontsize=18) - -fig.suptitle( - "Comparing Regularization Paths: Lasso(-LARS), Ridge, and Elastic Net", fontsize=20 -) - -fig.tight_layout(pad=3.0) -_ = plt.show() - -# %% -# -# * In the "Lasso vs Lasso-LARS Paths" visual, -# the Lasso and Lasso-LARS paths appear identical towards the end -# because both models solve the same constrained problem. -# However, Lasso-LARS reaches the solution faster than Lasso. -# -# * The "Lasso vs Elastic-Net Paths" visual is more notable. -# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. -# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards -# the end. -# -# * In the "Lasso(-LARS) vs Ridge Paths" and "Elastic Net vs Ridge Paths" visuals, the -# Ridge model focuses on shrinking all coefficients uniformly, rather than setting -# some to exactly zero. As a result, the Ridge model retains all features after -# training, unlike the Lasso(-LARS) or Elastic Net models. -# -# This demonstrates how different regularization techniques govern -# the model's complexity: -# -# 1. the :math:`\ell^1`-norm constraint encourages sparsity in the solution. -# -# 2. the :math:`\ell^2`-norm constraint focuses on shrinkage of the magnitude -# of the solution. -# -# 3. the Elastic Net constraint provides a balanced compromise. -# -# Mitigating Overfitting with Regularization -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Recall that the true coefficient `w` refers to the coefficients of the linear model -# used to generate the training dataset. In this section, we compare the trained -# coefficients of Lasso(-LARS), Ridge, and Elastic Net with `w` to demonstrate how -# regularization can mitigate overfitting. This is achieved by computing the -# :func:`~sklearn.metrics.mean_squared_error` (MSE) between the true and trained -# coefficients. 
- -lasso_mse, lars_mse, enet_mse, ridge_mse = [], [], [], [] - -for coef_lasso, coef_enet, coef_ridge in zip(coefs_lasso.T, coefs_enet.T, coefs_ridge): - lasso_mse.append(mean_squared_error(coef_lasso, w)) - enet_mse.append(mean_squared_error(coef_enet, w)) - ridge_mse.append(mean_squared_error(coef_ridge, w)) - -for coef_lars in coefs_lars.T: - lars_mse.append(mean_squared_error(coef_lars, w)) - -lasso_mse = np.asarray(lasso_mse) -lars_mse = np.asarray(lars_mse) -enet_mse = np.asarray(enet_mse) -ridge_mse = np.asarray(ridge_mse) - -# %% # -# The idea is that a smaller MSE between the true and trained coefficients implies -# greater similarity between the coefficients. Thus, if the MSE is small, the -# trained model captures the underlying pattern of the training data well. -# However, this can also indicate that the trained model may not perform well on -# generalised data, as the pattern may not hold for unseen data. -# This is essentially the overfitting problem. -# -# The following visualization demonstrates how the MSE changes for different trained -# models as the regularization parameter :math:`\alpha` increases. - -plt.figure() -l1 = plt.semilogx(alphas_lasso, lasso_mse.T, c="#0072B2") -l2 = plt.semilogx(alphas_lars, lars_mse.T, c="#D55E00") -l3 = plt.semilogx(alphas_enet, enet_mse.T, c="#009E73") -l4 = plt.semilogx(alphas_enet, ridge_mse, c="#F0E442") +# Lasso vs Lasso-LARS +# ------------------- +# In the "Lasso vs LARS Paths" visual, -plt.xlabel(r"$\alpha$") -plt.ylabel("Mean Squared Error") -plt.title("Coefficient Error Across Regularization Strengths") -plt.legend( - (l1[-1], l2[-1], l3[-1], l4[-1]), - ("Lasso", "LARS", "Elastic Net", "Ridge"), - loc="upper left", -) +plt.figure(1) +for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): + l1 = plt.semilogx(alphas_lasso, coef_lasso, c="#0072B2") + l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c="#D55E00") +plt.xlabel("alpha") +plt.ylabel("coefficients") +plt.title("Lasso vs LARS Paths") +plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") plt.axis("tight") _ = plt.show() # %% -# -# In the visualization, for small values of :math:`\alpha`, since our synthetic data is -# noise-free, the trained coefficients of Lasso(-LARS), Ridge, and Elastic Net are -# similar to the true coefficient `w` (with MSE close to 0). This indicates that the -# models capture the intricate details of the training data well. -# -# As :math:`\alpha` increases, the MSE also increases. This improves the models' ability -# to generalise to unseen data (e.g., if the data were noisy), but it also risks -# degrading model performance if the regularization becomes too strong. -# -# Regularization in Logistic Regression -# ------------------------------------- -# -# Regularization can also be applied to Logistic Regression when working on -# classification tasks. scikit-learn's :func:`~sklearn.linear_model.LogisticRegression` -# enables users to apply regularization using the `penalty` parameter: -# -# * `l1`: :math:`\ell^1`-regularization, similar to the Lasso model -# * `l2`: :math:`\ell^2`-regularization, similar to the Ridge model -# * `elasticnet`: Combined with the `l1_ratio` parameter for a mix of :math:`\ell^1` -# and :math:`\ell^2` -# -# Additionally, the `C` parameter controls the inverse of the regularization strength. -# Smaller values of `C` apply stronger regularization. -# -# We demonstrate the effect of regularization by creating a synthetic classification -# dataset. 
-# - -X, y = make_classification( - n_samples=400, - n_features=64, - n_informative=64, - n_redundant=0, - n_classes=2, - n_clusters_per_class=1, - random_state=42, -) - -# %% -# -# In this synthetic binary classification dataset, there are 400 samples, -# each with 64 features. This toy dataset is noise-free to maintain consistency with -# our earlier regression example. -# -# As noted in the regression example, :math:`\ell^1`-regularization may set some -# coefficients exactly to zero. For extreme values of `C`, the trained coefficients -# may even become the zero vector. To address this, scikit-learn provides the -# :func:`~sklearn.svm.l1_min_c` function, which computes the minimum value of the -# regularization strength `C` at which the model begins to learn meaningful patterns -# (i.e., some coefficients become non-zero). -# - -cs = l1_min_c(X, y, loss="log") * np.logspace(0, 10, 16) - -# %% -# -# We now plot blah-blah-blah -# - -l1_ratio = 0.8 # L1 weight in the Elastic-Net regularization - -fig, axes = plt.subplots(3, 3) - -# Set regularization parameter -for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): - # Increase tolerance for short training time - clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") - clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") - clf_en_LR = LogisticRegression( - C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 - ) - clf_l1_LR.fit(X, y) - clf_l2_LR.fit(X, y) - clf_en_LR.fit(X, y) - - coef_l1_LR = clf_l1_LR.coef_.ravel() - coef_l2_LR = clf_l2_LR.coef_.ravel() - coef_en_LR = clf_en_LR.coef_.ravel() - - # coef_l1_LR contains zeros due to the - # L1 sparsity inducing norm - - sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 - sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 - sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 - - print(f"C={C:.2f}") - print(f"{'Sparsity with L1 penalty:':<40} {sparsity_l1_LR:.2f}%") - print(f"{'Sparsity with Elastic-Net penalty:':<40} {sparsity_en_LR:.2f}%") - print(f"{'Sparsity with L2 penalty:':<40} {sparsity_l2_LR:.2f}%") - print(f"{'Score with L1 penalty:':<40} {clf_l1_LR.score(X, y):.2f}") - print(f"{'Score with Elastic-Net penalty:':<40} {clf_en_LR.score(X, y):.2f}") - print(f"{'Score with L2 penalty:':<40} {clf_l2_LR.score(X, y):.2f}") - - if i == 0: - axes_row[0].set_title(r"$\ell^1$ penalty") - axes_row[1].set_title(f"Elastic-Net\n {l1_ratio = }") - axes_row[2].set_title(r"$\ell^2$ penalty") - - for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): - ax.imshow( - np.abs(coefs.reshape(8, 8)), - interpolation="nearest", - cmap="binary", - vmax=1, - vmin=0, - ) - ax.set_xticks(()) - ax.set_yticks(()) - - axes_row[0].set_ylabel(f"C = {C:.2f}") - +# the Lasso and LARS paths appear identical because both models solve +# the same constrained problem. However, LARS reaches the solution faster than Lasso. +# +# Lasso vs Elastic-Net +# -------------------- +# The "Lasso vs Elastic-Net Paths" visual is more notable. + +plt.figure(2) +for coef_l, coef_e in zip(coefs_lasso, coefs_enet): + l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") + l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c="#D55E00") + +plt.xlabel("alpha") +plt.ylabel("coefficients") +plt.title("Lasso vs Elastic-Net Paths") +plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") +plt.axis("tight") _ = plt.show() + # %% +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. 
+# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the +# end. This demonstrates how the :math:`\ell^1`-norm constraint encourages sparsity in +# the solution, while combining it with the :math:`\ell^2`-norm provides a balanced +# compromise. # # .. rubric:: References # -# .. [1] Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical -# Learning: Data Mining, Inference, and Prediction. New York, -# NY: Springer New York. +# .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., +# Friedman J., Algorithm 3.2, p. 74, 2008. From a9c4e84f3fab3219aa64278c72d0d4361696f633 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 3 Mar 2025 19:16:07 -0800 Subject: [PATCH 12/13] added `plot_regularization.py`, updated `doc/conf.py`. --- doc/conf.py | 22 +- examples/linear_model/plot_regularization.py | 461 +++++++++++++++++++ 2 files changed, 481 insertions(+), 2 deletions(-) create mode 100644 examples/linear_model/plot_regularization.py diff --git a/doc/conf.py b/doc/conf.py index f749b188b3274..7f8b011746852 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -477,10 +477,28 @@ def add_js_css_files(app, pagename, templatename, context, doctree): "auto_examples/model_selection/plot_nested_cross_validation_iris" ), "auto_examples/linear_model/plot_lasso_lars": ( - "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + "auto_examples/linear_model/plot_regularization" ), "auto_examples/linear_model/plot_lasso_coordinate_descent_path": ( - "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_ridge_coeffs": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_ridge_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_sgd_penalties": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_logistic_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_logistic_l1_l2_sparsity": ( + "auto_examples/linear_model/plot_regularization" ), "auto_examples/cluster/plot_color_quantization": ( "auto_examples/cluster/plot_face_compress" diff --git a/examples/linear_model/plot_regularization.py b/examples/linear_model/plot_regularization.py new file mode 100644 index 0000000000000..610a59b3736dd --- /dev/null +++ b/examples/linear_model/plot_regularization.py @@ -0,0 +1,461 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +""" +================================================= +Regularization in Linear and Logistic Regressions +================================================= + +This example explores regularization techniques for linear and logistic regression +in both regression and classification tasks. It demonstrates how the +regularization parameter :math:`\\alpha` can be adjusted to control the complexity +of the trained coefficients :math:`w` and reduce overfitting. 
+""" + +import matplotlib.pyplot as plt +import numpy as np + +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model import ( + LogisticRegression, + Ridge, + enet_path, + lars_path, + lasso_path, +) +from sklearn.metrics import mean_squared_error +from sklearn.svm import l1_min_c + +# %% +# +# Regularization in Linear Regression +# ----------------------------------- +# +# When performing linear regression on a given dataset +# :math:`(X, y)`, regularization terms can be added to +# control the model's complexity and mitigate overfitting. +# Scikit-learn provides the following regularization techniques: +# +# - :func:`~sklearn.linear_model.Lasso` +# - :func:`~sklearn.linear_model.LassoLars` +# - :func:`~sklearn.linear_model.Ridge` +# - :func:`~sklearn.linear_model.ElasticNet` (with `l1_ratio=0.5` +# in this example) +# +# Mathematically, these are formulated by minimizing the constrained +# least-squares penalty: +# +# .. math:: +# +# \min_{w} \frac{1}{2n_{\operatorname{sample}}} +# \Vert Xw - y \Vert^2_2 + +# \left\{ +# \begin{array}{cl} +# \alpha \Vert w \Vert_1 & \mbox{Lasso(-LARS)} \\ +# \alpha \Vert w \Vert_2^2 & \mbox{Ridge} \\ +# \frac{\alpha}{2} \Vert w \Vert_1 + +# \frac{\alpha}{4} \Vert w \Vert^2_2& \mbox{Elastic Net} \\ +# \end{array} +# \right. +# +# Thus, the Lasso model (resp. Ridge model) includes the :math:`\ell^1`-norm +# (resp. :math:`\ell^2`-norm) of the regression +# coefficients in the penalty, while the Elastic Net model +# incorporates both :math:`\ell^1`- and :math:`\ell^2`-norms. +# +# We can interpret the :math:`\ell^p`-norms as minimising the least squares penalty +# under different geometries. This is illustrated by plotting the unit circles +# +# .. math:: +# +# \left\{ +# \begin{array}{cl} +# \Vert w \Vert_1 &=1 \\ +# \Vert w \Vert_2^2 &=1 \\ +# 0.5 \Vert w \Vert_1 + 0.25 \Vert w \Vert_2^2 &= 1 \\ +# \end{array} +# \right. +# +# in :math:`\mathbb{R}^2`: +line = np.linspace(-1.2, 1.2, 1001) +xx, yy = np.meshgrid(line, line) + +l1 = np.abs(xx) + np.abs(yy) +l2 = xx**2 + yy**2 +elastic_net = 0.5 * l1 + 0.25 * l2 + +plt.figure() +ax = plt.gca() + +l1_contour = plt.contour(xx, yy, l1, levels=[1], colors="#0072B2") +l2_contour = plt.contour(xx, yy, l2, levels=[1], colors="#D55E00") +elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1], colors="#009E73") + +ax.set_aspect("equal") +ax.spines["left"].set_position("center") +ax.spines["right"].set_color("none") +ax.spines["bottom"].set_position("center") +ax.spines["top"].set_color("none") + +plt.clabel(l1_contour, inline=1, fmt={1.0: r"$\ell^1$"}, manual=[(-1, -1)]) +plt.clabel(l2_contour, inline=1, fmt={1.0: r"$\ell^2$"}, manual=[(-1, -1)]) +plt.clabel( + elastic_net_contour, + inline=1, + fmt={1.0: "Elastic Net"}, + manual=[(1, -2)], +) + +plt.title(r"Unit Circles in $\mathbb{R}^2$") + +plt.tight_layout() +_ = plt.show() + +# %% +# Algebraically, any solution to this optimization problem depends +# on :math:`\alpha`. For example, in Lasso, a large :math:`\alpha` forces +# the least-squares penalty to stay small, which in turn keeps the norm +# :math:`\Vert w \Vert_1` +# small. Conversely, a smaller :math:`\alpha` allows the norm +# :math:`\Vert w \Vert_1` +# to grow larger. +# +# This suggests that the regression coefficients :math:`w` evolve as +# :math:`\alpha` increases, and we are interested in knowing +# :math:`w` across a range of :math:`\alpha` values. 
This is known
+# as the **regularization path**: a list of :math:`w` values corresponding to
+# different :math:`\alpha` values, ranging from small to large.
+#
+# In this example, we plot the regularization paths to show how the magnitudes of
+# the coefficients change as the regularization parameter :math:`\alpha` increases.
+# This demonstrates how model complexity varies with :math:`\alpha`. We then compare
+# the trained coefficients with the true coefficients used to generate the training set,
+# illustrating how regularization helps mitigate overfitting.
+#
+# Creating a Noise-free Regression Dataset
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# We generate a toy dataset with 400 samples and 10 features, suitable for
+# regression analysis. Since the data is noise-free in this example,
+# we can expect our regression model to recover the true coefficients `w` exactly.
+
+X, y, w = make_regression(n_samples=400, n_features=10, coef=True, random_state=42)
+
+# %%
+#
+# Impact of Regularization Parameter on Model Complexity
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Lasso(-LARS) and Elastic Net Models
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Scikit-learn provides the following functions to compute multiple
+# :math:`w` values for various :math:`\alpha` values efficiently:
+#
+# - :func:`~sklearn.linear_model.lasso_path`
+# - :func:`~sklearn.linear_model.lars_path`
+# - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.5`
+#
+
+eps = 3e-4  # the smaller it is, the longer the path
+
+alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps)
+
+alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.5)
+
+# %%
+# The :func:`~sklearn.linear_model.lasso_path` and
+# :func:`~sklearn.linear_model.enet_path` functions compute
+# :math:`w` with **coordinate descent**: for each entry of :math:`w`,
+# the function solves for its optimal value while keeping the others
+# fixed (a minimal sketch of this update is shown just before the
+# comparison plots below). Since the algorithm iterates until convergence,
+# the number of steps is not determined by the dataset's size alone,
+# which can make the computation take longer.
+# In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps.
+
+alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso")
+
+# %%
+# The Lasso-LARS model uses the **Least Angle Regression (LARS)** algorithm
+# (see [1]_ Algorithm 3.2 on page 74) to compute the Lasso solution in
+# :math:`\min \left\{
+# n_{\operatorname{sample}}-1,n_{\operatorname{feature}}
+# \right\}`
+# steps. This provides an efficient algorithm for computing the entire Lasso path, and
+# is implemented as :func:`~sklearn.linear_model.LassoLars`
+# and :func:`~sklearn.linear_model.lars_path`.
+#
+# Ridge Model
+# ~~~~~~~~~~~
+#
+# Next, we compute the coefficients for the Ridge model using the :math:`\alpha`
+# from Elastic Net:
+
+coefs_ridge = []
+for a in alphas_enet:
+    ridge = Ridge(alpha=a)
+    ridge.fit(X, y)
+    coefs_ridge.append(ridge.coef_)
+
+coefs_ridge = np.asarray(coefs_ridge)
+
+# %%
+# Plotting the Regularization Paths
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# We now present the visualization of the regularization paths for the dataset.
+# Each model is represented by 10 curves, corresponding to the number of features in the
+# dataset. Each curve shows how a particular coefficient :math:`w_i` changes as
+# :math:`\alpha` increases.
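+
+# %%
+# To make the coordinate descent update described above concrete, the sketch below
+# implements a naive version of it for the Lasso objective, whose coordinate-wise
+# minimizer is a soft-thresholding step. This is only an illustration and is not
+# needed for the rest of the example; it should approximately reproduce one column
+# of `coefs_lasso` up to solver tolerance.
+
+
+def soft_threshold(z, threshold):
+    return np.sign(z) * np.maximum(np.abs(z) - threshold, 0.0)
+
+
+def naive_lasso_coordinate_descent(X, y, alpha, n_sweeps=200):
+    n_samples, n_features = X.shape
+    w_cd = np.zeros(n_features)
+    for _ in range(n_sweeps):
+        for j in range(n_features):
+            # partial residual that ignores the contribution of feature j
+            residual_j = y - X @ w_cd + X[:, j] * w_cd[j]
+            rho_j = X[:, j] @ residual_j / n_samples
+            w_cd[j] = soft_threshold(rho_j, alpha) / (X[:, j] @ X[:, j] / n_samples)
+    return w_cd
+
+
+# maximum deviation from the corresponding column of `coefs_lasso`
+mid = len(alphas_lasso) // 2
+w_cd = naive_lasso_coordinate_descent(X, y, alphas_lasso[mid])
+print(np.max(np.abs(w_cd - coefs_lasso[:, mid])))
+
+# %%
+# Before drawing the comparison plots, note the orientation of the arrays computed
+# above: the path functions return coefficient arrays of shape
+# `(n_features, n_alphas)`, whereas the Ridge loop produced an array of shape
+# `(n_alphas, n_features)`; the transposes below account for this difference.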
+ +model_names = ["Lasso", "Lasso-LARS", "Elastic Net", "Ridge"] + +model_coefficients = [coefs_lasso.T, coefs_lars.T, coefs_enet.T, coefs_ridge] + +model_alphas = [alphas_lasso, alphas_lars, alphas_enet, alphas_enet] + +fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(25, 10)) + +for i in range(len(model_names)): + for j in range(len(model_names)): + if i == j: + axes[i, i].semilogx(model_alphas[i], model_coefficients[i], c="#0072B2") + + axes[i, i].set_title(f"{model_names[i]} Paths", fontsize=14) + + elif j < i: + l1 = axes[i, j].semilogx( + model_alphas[i], model_coefficients[i], c="#0072B2" + ) + + l2 = axes[i, j].semilogx( + model_alphas[j], model_coefficients[j], linestyle="--", c="#D55E00" + ) + + axes[i, j].set_title( + f"{model_names[j]} vs {model_names[i]} Paths", fontsize=14 + ) + + axes[i, j].legend( + (l1[-1], l2[-1]), + (f"{model_names[i]}", f"{model_names[j]}"), + loc="upper right", + ) + + else: + fig.delaxes(axes[i, j]) + +fig.text(0.5, 0.02, r"$\alpha$", fontsize=18, ha="center") +fig.text(0, 0.5, "Coefficients", fontsize=18, va="center", rotation=90) + +fig.suptitle( + "Comparing Regularization Paths: Lasso(-LARS), Ridge, and Elastic Net", fontsize=20 +) + +fig.tight_layout(pad=3.0) +_ = plt.show() + +# %% +# +# * In the "Lasso vs Lasso-LARS Paths" visual, +# the Lasso and Lasso-LARS paths appear identical towards the end +# because both models solve the same constrained problem. +# However, Lasso-LARS reaches the solution faster than Lasso. +# +# * The "Lasso vs Elastic-Net Paths" visual is more notable. +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. +# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards +# the end. +# +# * In the "Lasso(-LARS) vs Ridge Paths" and "Elastic Net vs Ridge Paths" visuals, the +# Ridge model focuses on shrinking all coefficients uniformly, rather than setting +# some to exactly zero. As a result, the Ridge model retains all features after +# training, unlike the Lasso(-LARS) or Elastic Net models. +# +# This demonstrates how different regularization techniques govern +# the model's complexity: +# +# 1. the :math:`\ell^1`-norm constraint encourages sparsity in the solution. +# +# 2. the :math:`\ell^2`-norm constraint focuses on shrinkage of the magnitude +# of the solution. +# +# 3. the Elastic Net constraint provides a balanced compromise. +# +# Mitigating Overfitting with Regularization +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Recall that the true coefficient `w` refers to the coefficients of the linear model +# used to generate the training dataset. In this section, we compare the trained +# coefficients of Lasso(-LARS), Ridge, and Elastic Net with `w` to demonstrate how +# regularization can mitigate overfitting. This is achieved by computing the +# :func:`~sklearn.metrics.mean_squared_error` (MSE) between the true and trained +# coefficients. 
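+
+# %%
+# Before turning to that comparison, the quick check below makes the sparsity
+# observations above concrete by counting the non-zero coefficients at a comparable,
+# strongly regularized point of each path (the exact counts depend on the random
+# dataset and on the index chosen):
+
+strong = 10  # an index near the large-alpha end of the Elastic Net grid
+closest_lasso = np.argmin(np.abs(alphas_lasso - alphas_enet[strong]))
+
+print("Non-zero Lasso coefficients:", np.sum(coefs_lasso[:, closest_lasso] != 0))
+print("Non-zero Elastic Net coefficients:", np.sum(coefs_enet[:, strong] != 0))
+print("Non-zero Ridge coefficients:", np.sum(coefs_ridge[strong] != 0))
+
+# %%
+# We now collect the MSE between the true coefficients `w` and the trained
+# coefficients along each regularization path: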
+ +lasso_mse, lars_mse, enet_mse, ridge_mse = [], [], [], [] + +for coef_lasso, coef_enet, coef_ridge in zip(coefs_lasso.T, coefs_enet.T, coefs_ridge): + lasso_mse.append(mean_squared_error(coef_lasso, w)) + enet_mse.append(mean_squared_error(coef_enet, w)) + ridge_mse.append(mean_squared_error(coef_ridge, w)) + +for coef_lars in coefs_lars.T: + lars_mse.append(mean_squared_error(coef_lars, w)) + +lasso_mse = np.asarray(lasso_mse) +lars_mse = np.asarray(lars_mse) +enet_mse = np.asarray(enet_mse) +ridge_mse = np.asarray(ridge_mse) + +# %% +# +# The idea is that a smaller MSE between the true and trained coefficients implies +# greater similarity between the coefficients. Thus, if the MSE is small, the +# trained model captures the underlying pattern of the training data well. +# However, this can also indicate that the trained model may not perform well on +# generalised data, as the pattern may not hold for unseen data. +# This is essentially the overfitting problem. +# +# The following visualization demonstrates how the MSE changes for different trained +# models as the regularization parameter :math:`\alpha` increases. + +plt.figure() +l1 = plt.semilogx(alphas_lasso, lasso_mse.T, c="#0072B2") +l2 = plt.semilogx(alphas_lars, lars_mse.T, c="#D55E00") +l3 = plt.semilogx(alphas_enet, enet_mse.T, c="#009E73") +l4 = plt.semilogx(alphas_enet, ridge_mse, c="#F0E442") + +plt.xlabel(r"$\alpha$") +plt.ylabel("Mean Squared Error") +plt.title("Coefficient Error Across Regularization Strengths") +plt.legend( + (l1[-1], l2[-1], l3[-1], l4[-1]), + ("Lasso", "LARS", "Elastic Net", "Ridge"), + loc="upper left", +) + +plt.axis("tight") +_ = plt.show() + +# %% +# +# In the visualization, for small values of :math:`\alpha`, since our synthetic data is +# noise-free, the trained coefficients of Lasso(-LARS), Ridge, and Elastic Net are +# similar to the true coefficient `w` (with MSE close to 0). This indicates that the +# models capture the intricate details of the training data well. +# +# As :math:`\alpha` increases, the MSE also increases. This improves the models' ability +# to generalise to unseen data (e.g., if the data were noisy), but it also risks +# degrading model performance if the regularization becomes too strong. +# +# Regularization in Logistic Regression +# ------------------------------------- +# +# Regularization can also be applied to Logistic Regression when working on +# classification tasks. scikit-learn's :func:`~sklearn.linear_model.LogisticRegression` +# enables users to apply regularization using the `penalty` parameter: +# +# * `l1`: :math:`\ell^1`-regularization, similar to the Lasso model +# * `l2`: :math:`\ell^2`-regularization, similar to the Ridge model +# * `elasticnet`: Combined with the `l1_ratio` parameter for a mix of :math:`\ell^1` +# and :math:`\ell^2` +# +# Additionally, the `C` parameter controls the inverse of the regularization strength. +# Smaller values of `C` apply stronger regularization. +# +# We demonstrate the effect of regularization by creating a synthetic classification +# dataset. +# + +X, y = make_classification( + n_samples=400, + n_features=64, + n_informative=64, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=42, +) + +# %% +# +# In this synthetic binary classification dataset, there are 400 samples, +# each with 64 features. This toy dataset is noise-free to maintain consistency with +# our earlier regression example. 
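+
+# %%
+# Before examining sparsity, here is a small, self-contained illustration of the role
+# of `C`: because `C` is the inverse of the regularization strength, a smaller value
+# should shrink the :math:`\ell^2`-penalized coefficients towards zero. This check is
+# independent of the heatmaps computed below.
+
+for C_value in (1.0, 0.01):
+    logistic_l2 = LogisticRegression(C=C_value, penalty="l2", tol=0.01, solver="saga")
+    logistic_l2.fit(X, y)
+    print(f"C={C_value}: coefficient norm = {np.linalg.norm(logistic_l2.coef_):.2f}")
+
+# %%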
+# +# As noted in the regression example, :math:`\ell^1`-regularization may set some +# coefficients exactly to zero. For extreme values of `C`, the trained coefficients +# may even become the zero vector. To address this, scikit-learn provides the +# :func:`~sklearn.svm.l1_min_c` function, which computes the minimum value of the +# regularization strength `C` at which the model begins to learn meaningful patterns +# (i.e., some coefficients become non-zero). +# + +cs = l1_min_c(X, y, loss="log") * np.logspace(0, 10, 16) + +# %% +# +# We now plot heatmaps to represent the sparsity for each `penalty` and each value +# of `C`. +# + +l1_ratio = 0.5 # l1 weight in the Elastic-Net regularization + +fig, axes = plt.subplots(3, 3) + +# Set regularization parameter +for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): + # Increase tolerance for short training time + clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") + clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") + clf_en_LR = LogisticRegression( + C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 + ) + clf_l1_LR.fit(X, y) + clf_l2_LR.fit(X, y) + clf_en_LR.fit(X, y) + + coef_l1_LR = clf_l1_LR.coef_.ravel() + coef_l2_LR = clf_l2_LR.coef_.ravel() + coef_en_LR = clf_en_LR.coef_.ravel() + + sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 + sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 + sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 + + if i == 0: + axes_row[0].set_title(r"$\ell^1$ penalty") + axes_row[1].set_title(f"Elastic-Net\n {l1_ratio = }") + axes_row[2].set_title(r"$\ell^2$ penalty") + + for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): + ax.imshow( + np.abs(coefs.reshape(8, 8)), + interpolation="nearest", + cmap="binary", + vmax=1, + vmin=0, + ) + ax.set_xticks(()) + ax.set_yticks(()) + + axes_row[0].set_ylabel(f"C = {C:.2f}") + +_ = plt.show() +# %% +# +# Each heatmap organizes the 64 coefficients (the number of features in our synthetic +# classification dataset) into an 8×8 grid. It is constructed by taking the absolute +# values of the coefficients and displaying them in a black-and-white scale, where +# lower values appear white and higher values appear black. +# +# We can see that larger values of `C` (i.e., weaker regularization) give the model +# more freedom, while smaller values of `C` impose stronger constraints, leading to +# increased sparsity. As expected, the Elastic-Net penalty results in a level of +# sparsity between that of :math:`\ell^1` and :math:`\ell^2`. +# +# .. rubric:: References +# +# .. [1] Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical +# Learning: Data Mining, Inference, and Prediction. New York, +# NY: Springer New York. From 8667b58adc6803fa940efdf3fb5fce4b860f0d97 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 3 Mar 2025 19:21:54 -0800 Subject: [PATCH 13/13] added `plot_regularization.py`, updated `doc/conf.py`, and deleted multiple examples. 
--- .../plot_lasso_lasso_lars_elasticnet_path.py | 162 ---------------- .../plot_logistic_l1_l2_sparsity.py | 88 --------- examples/linear_model/plot_logistic_path.py | 103 ---------- examples/linear_model/plot_ridge_coeffs.py | 181 ------------------ examples/linear_model/plot_ridge_path.py | 68 ------- examples/linear_model/plot_sgd_penalties.py | 57 ------ sklearn/linear_model/_coordinate_descent.py | 8 +- 7 files changed, 4 insertions(+), 663 deletions(-) delete mode 100644 examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py delete mode 100644 examples/linear_model/plot_logistic_l1_l2_sparsity.py delete mode 100644 examples/linear_model/plot_logistic_path.py delete mode 100644 examples/linear_model/plot_ridge_coeffs.py delete mode 100644 examples/linear_model/plot_ridge_path.py delete mode 100644 examples/linear_model/plot_sgd_penalties.py diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py deleted file mode 100644 index d55d2e921dff6..0000000000000 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ /dev/null @@ -1,162 +0,0 @@ -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -""" -======================================== -Lasso, Lasso-LARS, and Elastic Net paths -======================================== - -This example shows how to compute the Lasso, -Lasso-LARS, and Elastic Net regularization paths. It illustrates the -relationship between the regularization parameter :math:`\\alpha` -and the coefficients :math:`w`. - -When performing linear regression on a given dataset -:math:`(X, y)`, regularization terms can be added to -control the model's complexity. -Scikit-learn provides the following regularization techniques: - -- :func:`~sklearn.linear_model.Lasso` -- :func:`~sklearn.linear_model.LassoLars` -- :func:`~sklearn.linear_model.ElasticNet` with `l1_ratio=0.8` - -Mathematically, these are formulated by minimising the constrained -least-squares penalty: - -.. math:: - - \\min_{w} \\frac{1}{2n_{\\operatorname{sample}}} - \\vert \\vert Xw - y \\vert \\vert^2_2 + - \\left\\{ - \\begin{array}{cl} - \\alpha \\vert \\vert w \\vert \\vert_1 & \\mbox{Lasso(-LARS)} \\\\ - \\frac{4\\alpha}{5} \\vert \\vert w \\vert \\vert_1 + - \\frac{\\alpha}{10} \\vert \\vert w \\vert \\vert^2_2& \\mbox{Elastic Net} \\\\ - \\end{array} - \\right. - -Thus, the Lasso model includes the :math:`\\ell^1`-norm of the regression -coefficients in the penalty, while the Elastic Net model -incorporates both :math:`\\ell^1`- and :math:`\\ell^2`-norms. - -Any solution to this optimisation problem depends on :math:`\\alpha`. -For example, in Lasso, a large :math:`\\alpha` forces the least-squares -penalty to stay small, which in turn keeps the norm -:math:`\\vert \\vert w \\vert \\vert_1` -small. Conversely, a smaller :math:`\\alpha` allows the norm -:math:`\\vert \\vert w \\vert \\vert_1` -to grow larger. - -This suggests that the regression coefficients :math:`w` evolve as -:math:`\\alpha` increases, and we are interested in knowing -:math:`w` across a range of :math:`\\alpha` values. This is known -as the **regularization path**: a list of :math:`w` values corresponding to -different :math:`\\alpha` values, ranging from small to large. -In this example, we plot the regularization paths to show how the sizes of the -coefficients change as the regularization parameter increases. 
-""" -# %% -import matplotlib.pyplot as plt - -from sklearn.datasets import load_diabetes -from sklearn.linear_model import enet_path, lars_path, lasso_path - -# %% -# The Diabetes Dataset -# -------------------- -# -# We use the :func:`diabetes dataset ` to plot -# the regression coefficients for Lasso and Elastic Net. - -X, y = load_diabetes(return_X_y=True) -X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) - -# %% -# Scikit-learn provides the following functions to compute multiple -# :math:`w` values for various :math:`\alpha` values efficiently: -# -# - :func:`~sklearn.linear_model.lasso_path` -# - :func:`~sklearn.linear_model.lars_path` -# - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.8` -# - -eps = 5e-3 # the smaller it is the longer is the path - -alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) - -alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.8) - -# %% -# The :func:`~sklearn.linear_model.lasso_path` and -# :func:`~sklearn.linear_model.enet_path` functions compute -# :math:`w` with coordinate decent: for each entry of :math:`w`, -# the function solves for it optimal value while keeping the others -# fixed. Since the algorithm iterates until convergence, -# Lasso doesn't operate in a fixed number of steps based solely -# on the dataset's size, which can make it take longer to run. -# In contrast, the Lasso-LARS model computes the Lasso solution in fewer steps. - -alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") - -# %% -# The Lasso-LARS model uses the Least Angle Regression (LARS) algorithm -# (see [1]_) to compute the Lasso solution in -# :math:`\min \left\{ -# n_{\operatorname{sample}}-1,n_{\operatorname{feature}} -# \right\}` -# steps. This provides an efficient algorithm for computing the entire Lasso path, and -# is implemented as :func:`~sklearn.linear_model.LassoLars` -# and :func:`~sklearn.linear_model.lars_path`. -# -# We now present the visualisation of the regularization paths for the diabetes dataset. -# Each model is represented by 10 curves, corresponding to the number of features in the -# dataset. Each curve shows how a particular coefficient :math:`w_i` changes as -# :math:`\alpha` increases. -# -# Lasso vs Lasso-LARS -# ------------------- -# In the "Lasso vs LARS Paths" visual, - -plt.figure(1) -for coef_lasso, coef_lars in zip(coefs_lasso, coefs_lars): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c="#0072B2") - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs LARS Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") -plt.axis("tight") -_ = plt.show() - -# %% -# the Lasso and LARS paths appear identical because both models solve -# the same constrained problem. However, LARS reaches the solution faster than Lasso. -# -# Lasso vs Elastic-Net -# -------------------- -# The "Lasso vs Elastic-Net Paths" visual is more notable. - -plt.figure(2) -for coef_l, coef_e in zip(coefs_lasso, coefs_enet): - l1 = plt.semilogx(alphas_lasso, coef_l, c="#0072B2") - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c="#D55E00") - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso vs Elastic-Net Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") -plt.axis("tight") -_ = plt.show() - -# %% -# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. 
-# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards the -# end. This demonstrates how the :math:`\ell^1`-norm constraint encourages sparsity in -# the solution, while combining it with the :math:`\ell^2`-norm provides a balanced -# compromise. -# -# .. rubric:: References -# -# .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., -# Friedman J., Algorithm 3.2, p. 74, 2008. diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py deleted file mode 100644 index f642dfade5db8..0000000000000 --- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -============================================== -L1 Penalty and Sparsity in Logistic Regression -============================================== - -Comparison of the sparsity (percentage of zero coefficients) of solutions when -L1, L2 and Elastic-Net penalty are used for different values of C. We can see -that large values of C give more freedom to the model. Conversely, smaller -values of C constrain the model more. In the L1 penalty case, this leads to -sparser solutions. As expected, the Elastic-Net penalty sparsity is between -that of L1 and L2. - -We classify 8x8 images of digits into two classes: 0-4 against 5-9. -The visualization shows coefficients of the models for varying C. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn import datasets -from sklearn.linear_model import LogisticRegression -from sklearn.preprocessing import StandardScaler - -X, y = datasets.load_digits(return_X_y=True) - -X = StandardScaler().fit_transform(X) - -# classify small against large digits -y = (y > 4).astype(int) - -l1_ratio = 0.5 # L1 weight in the Elastic-Net regularization - -fig, axes = plt.subplots(3, 3) - -# Set regularization parameter -for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): - # Increase tolerance for short training time - clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") - clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") - clf_en_LR = LogisticRegression( - C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 - ) - clf_l1_LR.fit(X, y) - clf_l2_LR.fit(X, y) - clf_en_LR.fit(X, y) - - coef_l1_LR = clf_l1_LR.coef_.ravel() - coef_l2_LR = clf_l2_LR.coef_.ravel() - coef_en_LR = clf_en_LR.coef_.ravel() - - # coef_l1_LR contains zeros due to the - # L1 sparsity inducing norm - - sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 - sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 - sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 - - print(f"C={C:.2f}") - print(f"{'Sparsity with L1 penalty:':<40} {sparsity_l1_LR:.2f}%") - print(f"{'Sparsity with Elastic-Net penalty:':<40} {sparsity_en_LR:.2f}%") - print(f"{'Sparsity with L2 penalty:':<40} {sparsity_l2_LR:.2f}%") - print(f"{'Score with L1 penalty:':<40} {clf_l1_LR.score(X, y):.2f}") - print(f"{'Score with Elastic-Net penalty:':<40} {clf_en_LR.score(X, y):.2f}") - print(f"{'Score with L2 penalty:':<40} {clf_l2_LR.score(X, y):.2f}") - - if i == 0: - axes_row[0].set_title("L1 penalty") - axes_row[1].set_title("Elastic-Net\nl1_ratio = %s" % l1_ratio) - axes_row[2].set_title("L2 penalty") - - for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): - ax.imshow( - np.abs(coefs.reshape(8, 8)), - interpolation="nearest", - cmap="binary", - vmax=1, - vmin=0, - ) - 
ax.set_xticks(()) - ax.set_yticks(()) - - axes_row[0].set_ylabel(f"C = {C}") - -plt.show() diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py deleted file mode 100644 index 46608f683740e..0000000000000 --- a/examples/linear_model/plot_logistic_path.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -============================================== -Regularization path of L1- Logistic Regression -============================================== - - -Train l1-penalized logistic regression models on a binary classification -problem derived from the Iris dataset. - -The models are ordered from strongest regularized to least regularized. The 4 -coefficients of the models are collected and plotted as a "regularization -path": on the left-hand side of the figure (strong regularizers), all the -coefficients are exactly 0. When regularization gets progressively looser, -coefficients can get non-zero values one after the other. - -Here we choose the liblinear solver because it can efficiently optimize for the -Logistic Regression loss with a non-smooth, sparsity inducing l1 penalty. - -Also note that we set a low value for the tolerance to make sure that the model -has converged before collecting the coefficients. - -We also use warm_start=True which means that the coefficients of the models are -reused to initialize the next model fit to speed-up the computation of the -full-path. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Load data -# --------- - -from sklearn import datasets - -iris = datasets.load_iris() -X = iris.data -y = iris.target -feature_names = iris.feature_names - -# %% -# Here we remove the third class to make the problem a binary classification -X = X[y != 2] -y = y[y != 2] - -# %% -# Compute regularization path -# --------------------------- - -import numpy as np - -from sklearn.linear_model import LogisticRegression -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.svm import l1_min_c - -cs = l1_min_c(X, y, loss="log") * np.logspace(0, 1, 16) - -# %% -# Create a pipeline with `StandardScaler` and `LogisticRegression`, to normalize -# the data before fitting a linear model, in order to speed-up convergence and -# make the coefficients comparable. Also, as a side effect, since the data is now -# centered around 0, we don't need to fit an intercept. 
-clf = make_pipeline( - StandardScaler(), - LogisticRegression( - penalty="l1", - solver="liblinear", - tol=1e-6, - max_iter=int(1e6), - warm_start=True, - fit_intercept=False, - ), -) -coefs_ = [] -for c in cs: - clf.set_params(logisticregression__C=c) - clf.fit(X, y) - coefs_.append(clf["logisticregression"].coef_.ravel().copy()) - -coefs_ = np.array(coefs_) - -# %% -# Plot regularization path -# ------------------------ - -import matplotlib.pyplot as plt - -# Colorblind-friendly palette (IBM Color Blind Safe palette) -colors = ["#648FFF", "#785EF0", "#DC267F", "#FE6100"] - -plt.figure(figsize=(10, 6)) -for i in range(coefs_.shape[1]): - plt.semilogx(cs, coefs_[:, i], marker="o", color=colors[i], label=feature_names[i]) - -ymin, ymax = plt.ylim() -plt.xlabel("C") -plt.ylabel("Coefficients") -plt.title("Logistic Regression Path") -plt.legend() -plt.axis("tight") -plt.show() diff --git a/examples/linear_model/plot_ridge_coeffs.py b/examples/linear_model/plot_ridge_coeffs.py deleted file mode 100644 index 1ad7962f8bfa3..0000000000000 --- a/examples/linear_model/plot_ridge_coeffs.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -========================================================= -Ridge coefficients as a function of the L2 Regularization -========================================================= - -A model that overfits learns the training data too well, capturing both the -underlying patterns and the noise in the data. However, when applied to unseen -data, the learned associations may not hold. We normally detect this when we -apply our trained predictions to the test data and see the statistical -performance drop significantly compared to the training data. - -One way to overcome overfitting is through regularization, which can be done by -penalizing large weights (coefficients) in linear models, forcing the model to -shrink all coefficients. Regularization reduces a model's reliance on specific -information obtained from the training samples. - -This example illustrates how L2 regularization in a -:class:`~sklearn.linear_model.Ridge` regression affects a model's performance by -adding a penalty term to the loss that increases with the coefficients -:math:`\\beta`. - -The regularized loss function is given by: :math:`\\mathcal{L}(X, y, \\beta) = -\\| y - X \\beta \\|^{2}_{2} + \\alpha \\| \\beta \\|^{2}_{2}` - -where :math:`X` is the input data, :math:`y` is the target variable, -:math:`\\beta` is the vector of coefficients associated with the features, and -:math:`\\alpha` is the regularization strength. - -The regularized loss function aims to balance the trade-off between accurately -predicting the training set and to prevent overfitting. - -In this regularized loss, the left-hand side (e.g. :math:`\\|y - -X\\beta\\|^{2}_{2}`) measures the squared difference between the actual target -variable, :math:`y`, and the predicted values. Minimizing this term alone could -lead to overfitting, as the model may become too complex and sensitive to noise -in the training data. - -To address overfitting, Ridge regularization adds a constraint, called a penalty -term, (:math:`\\alpha \\| \\beta\\|^{2}_{2}`) to the loss function. This penalty -term is the sum of the squares of the model's coefficients, multiplied by the -regularization strength :math:`\\alpha`. By introducing this constraint, Ridge -regularization discourages any single coefficient :math:`\\beta_{i}` from taking -an excessively large value and encourages smaller and more evenly distributed -coefficients. 
Higher values of :math:`\\alpha` force the coefficients towards -zero. However, an excessively high :math:`\\alpha` can result in an underfit -model that fails to capture important patterns in the data. - -Therefore, the regularized loss function combines the prediction accuracy term -and the penalty term. By adjusting the regularization strength, practitioners -can fine-tune the degree of constraint imposed on the weights, training a model -capable of generalizing well to unseen data while avoiding overfitting. -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Purpose of this example -# ----------------------- -# For the purpose of showing how Ridge regularization works, we will create a -# non-noisy data set. Then we will train a regularized model on a range of -# regularization strengths (:math:`\alpha`) and plot how the trained -# coefficients and the mean squared error between those and the original values -# behave as functions of the regularization strength. -# -# Creating a non-noisy data set -# ***************************** -# We make a toy data set with 100 samples and 10 features, that's suitable to -# detect regression. Out of the 10 features, 8 are informative and contribute to -# the regression, while the remaining 2 features do not have any effect on the -# target variable (their true coefficients are 0). Please note that in this -# example the data is non-noisy, hence we can expect our regression model to -# recover exactly the true coefficients w. -from sklearn.datasets import make_regression - -X, y, w = make_regression( - n_samples=100, n_features=10, n_informative=8, coef=True, random_state=1 -) - -# Obtain the true coefficients -print(f"The true coefficient of this regression problem are:\n{w}") - -# %% -# Training the Ridge Regressor -# **************************** -# We use :class:`~sklearn.linear_model.Ridge`, a linear model with L2 -# regularization. We train several models, each with a different value for the -# model parameter `alpha`, which is a positive constant that multiplies the -# penalty term, controlling the regularization strength. For each trained model -# we then compute the error between the true coefficients `w` and the -# coefficients found by the model `clf`. We store the identified coefficients -# and the calculated errors for the corresponding coefficients in lists, which -# makes it convenient for us to plot them. -import numpy as np - -from sklearn.linear_model import Ridge -from sklearn.metrics import mean_squared_error - -clf = Ridge() - -# Generate values for `alpha` that are evenly distributed on a logarithmic scale -alphas = np.logspace(-3, 4, 200) -coefs = [] -errors_coefs = [] - -# Train the model with different regularisation strengths -for a in alphas: - clf.set_params(alpha=a).fit(X, y) - coefs.append(clf.coef_) - errors_coefs.append(mean_squared_error(clf.coef_, w)) - -# %% -# Plotting trained Coefficients and Mean Squared Errors -# ***************************************************** -# We now plot the 10 different regularized coefficients as a function of the -# regularization parameter `alpha` where each color represents a different -# coefficient. -# -# On the right-hand-side, we plot how the errors of the coefficients from the -# estimator change as a function of regularization. 
-import matplotlib.pyplot as plt -import pandas as pd - -alphas = pd.Index(alphas, name="alpha") -coefs = pd.DataFrame(coefs, index=alphas, columns=[f"Feature {i}" for i in range(10)]) -errors = pd.Series(errors_coefs, index=alphas, name="Mean squared error") - -fig, axs = plt.subplots(1, 2, figsize=(20, 6)) - -coefs.plot( - ax=axs[0], - logx=True, - title="Ridge coefficients as a function of the regularization strength", -) -axs[0].set_ylabel("Ridge coefficient values") -errors.plot( - ax=axs[1], - logx=True, - title="Coefficient error as a function of the regularization strength", -) -_ = axs[1].set_ylabel("Mean squared error") -# %% -# Interpreting the plots -# ********************** -# The plot on the left-hand side shows how the regularization strength (`alpha`) -# affects the Ridge regression coefficients. Smaller values of `alpha` (weak -# regularization), allow the coefficients to closely resemble the true -# coefficients (`w`) used to generate the data set. This is because no -# additional noise was added to our artificial data set. As `alpha` increases, -# the coefficients shrink towards zero, gradually reducing the impact of the -# features that were formerly more significant. -# -# The right-hand side plot shows the mean squared error (MSE) between the -# coefficients found by the model and the true coefficients (`w`). It provides a -# measure that relates to how exact our ridge model is in comparison to the true -# generative model. A low error means that it found coefficients closer to the -# ones of the true generative model. In this case, since our toy data set was -# non-noisy, we can see that the least regularized model retrieves coefficients -# closest to the true coefficients (`w`) (error is close to 0). -# -# When `alpha` is small, the model captures the intricate details of the -# training data, whether those were caused by noise or by actual information. As -# `alpha` increases, the highest coefficients shrink more rapidly, rendering -# their corresponding features less influential in the training process. This -# can enhance a model's ability to generalize to unseen data (if there was a lot -# of noise to capture), but it also poses the risk of losing performance if the -# regularization becomes too strong compared to the amount of noise the data -# contained (as in this example). -# -# In real-world scenarios where data typically includes noise, selecting an -# appropriate `alpha` value becomes crucial in striking a balance between an -# overfitting and an underfitting model. -# -# Here, we saw that :class:`~sklearn.linear_model.Ridge` adds a penalty to the -# coefficients to fight overfitting. Another problem that occurs is linked to -# the presence of outliers in the training dataset. An outlier is a data point -# that differs significantly from other observations. Concretely, these outliers -# impact the left-hand side term of the loss function that we showed earlier. -# Some other linear models are formulated to be robust to outliers such as the -# :class:`~sklearn.linear_model.HuberRegressor`. You can learn more about it in -# the :ref:`sphx_glr_auto_examples_linear_model_plot_huber_vs_ridge.py` example. 
diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py deleted file mode 100644 index d3c19acd9e18c..0000000000000 --- a/examples/linear_model/plot_ridge_path.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -=========================================================== -Plot Ridge coefficients as a function of the regularization -=========================================================== - -Shows the effect of collinearity in the coefficients of an estimator. - -.. currentmodule:: sklearn.linear_model - -:class:`Ridge` Regression is the estimator used in this example. -Each color represents a different feature of the -coefficient vector, and this is displayed as a function of the -regularization parameter. - -This example also shows the usefulness of applying Ridge regression -to highly ill-conditioned matrices. For such matrices, a slight -change in the target variable can cause huge variances in the -calculated weights. In such cases, it is useful to set a certain -regularization (alpha) to reduce this variation (noise). - -When alpha is very large, the regularization effect dominates the -squared loss function and the coefficients tend to zero. -At the end of the path, as alpha tends toward zero -and the solution tends towards the ordinary least squares, coefficients -exhibit big oscillations. In practise it is necessary to tune alpha -in such a way that a balance is maintained between both. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn import linear_model - -# X is the 10x10 Hilbert matrix -X = 1.0 / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis]) -y = np.ones(10) - -# %% -# Compute paths -# ------------- - -n_alphas = 200 -alphas = np.logspace(-10, -2, n_alphas) - -coefs = [] -for a in alphas: - ridge = linear_model.Ridge(alpha=a, fit_intercept=False) - ridge.fit(X, y) - coefs.append(ridge.coef_) - -# %% -# Display results -# --------------- - -ax = plt.gca() - -ax.plot(alphas, coefs) -ax.set_xscale("log") -ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis -plt.xlabel("alpha") -plt.ylabel("weights") -plt.title("Ridge coefficients as a function of the regularization") -plt.axis("tight") -plt.show() diff --git a/examples/linear_model/plot_sgd_penalties.py b/examples/linear_model/plot_sgd_penalties.py deleted file mode 100644 index 6f8830b52fe7a..0000000000000 --- a/examples/linear_model/plot_sgd_penalties.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -============== -SGD: Penalties -============== - -Contours of where the penalty is equal to 1 -for the three penalties L1, L2 and elastic-net. - -All of the above are supported by :class:`~sklearn.linear_model.SGDClassifier` -and :class:`~sklearn.linear_model.SGDRegressor`. 
- -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -l1_color = "navy" -l2_color = "c" -elastic_net_color = "darkorange" - -line = np.linspace(-1.5, 1.5, 1001) -xx, yy = np.meshgrid(line, line) - -l2 = xx**2 + yy**2 -l1 = np.abs(xx) + np.abs(yy) -rho = 0.5 -elastic_net = rho * l1 + (1 - rho) * l2 - -plt.figure(figsize=(10, 10), dpi=100) -ax = plt.gca() - -elastic_net_contour = plt.contour( - xx, yy, elastic_net, levels=[1], colors=elastic_net_color -) -l2_contour = plt.contour(xx, yy, l2, levels=[1], colors=l2_color) -l1_contour = plt.contour(xx, yy, l1, levels=[1], colors=l1_color) -ax.set_aspect("equal") -ax.spines["left"].set_position("center") -ax.spines["right"].set_color("none") -ax.spines["bottom"].set_position("center") -ax.spines["top"].set_color("none") - -plt.clabel( - elastic_net_contour, - inline=1, - fontsize=18, - fmt={1.0: "elastic-net"}, - manual=[(-1, -1)], -) -plt.clabel(l2_contour, inline=1, fontsize=18, fmt={1.0: "L2"}, manual=[(-1, -1)]) -plt.clabel(l1_contour, inline=1, fontsize=18, fmt={1.0: "L1"}, manual=[(-1, -1)]) - -plt.tight_layout() -plt.show() diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index b98cf08925910..5c43c8b04ec20 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -319,8 +319,8 @@ def lasso_path( Notes ----- For an example, see - :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py - `. + :ref:`examples/linear_model/plot_regularization.py + `. To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. @@ -524,8 +524,8 @@ def enet_path( Notes ----- For an example, see - :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py - `. + :ref:`examples/linear_model/plot_regularization.py + `. Examples --------